4 from __future__ import unicode_literals
33 import xml.etree.ElementTree
40 compat_etree_fromstring,
42 compat_html_entities_html5,
48 compat_socket_create_connection,
54 compat_urllib_parse_urlencode,
55 compat_urllib_parse_urlparse,
56 compat_urllib_parse_unquote_plus,
57 compat_urllib_request,
def register_socks_protocols():
    """Make urlparse treat the socks* URL schemes as having a netloc.

    Python < 2.6.5 suffers from https://bugs.python.org/issue7904:
    urlsplit() mishandles URLs whose scheme is not listed in
    urlparse.uses_netloc, so the SOCKS schemes are appended to that
    list once here.
    """
    known_schemes = compat_urlparse.uses_netloc
    for scheme in ('socks', 'socks4', 'socks4a', 'socks5'):
        if scheme not in known_schemes:
            known_schemes.append(scheme)
77 # This is not clearly defined otherwise
78 compiled_regex_type = type(re.compile(''))
81 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20150101 Firefox/47.0 (Chrome)',
82 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
83 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
84 'Accept-Encoding': 'gzip, deflate',
85 'Accept-Language': 'en-us,en;q=0.5',
90 'Safari': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27',
96 ENGLISH_MONTH_NAMES = [
97 'January', 'February', 'March', 'April', 'May', 'June',
98 'July', 'August', 'September', 'October', 'November', 'December']
101 'en': ENGLISH_MONTH_NAMES,
103 'janvier', 'février', 'mars', 'avril', 'mai', 'juin',
104 'juillet', 'août', 'septembre', 'octobre', 'novembre', 'décembre'],
108 'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'aac',
109 'flv', 'f4v', 'f4a', 'f4b',
110 'webm', 'ogg', 'ogv', 'oga', 'ogx', 'spx', 'opus',
111 'mkv', 'mka', 'mk3d',
120 'f4f', 'f4m', 'm3u8', 'smil')
122 # needed for sanitizing filenames in restricted mode
123 ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
124 itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUYP', ['ss'],
125 'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuypy')))
148 '%Y-%m-%d %H:%M:%S.%f',
151 '%Y-%m-%dT%H:%M:%SZ',
152 '%Y-%m-%dT%H:%M:%S.%fZ',
153 '%Y-%m-%dT%H:%M:%S.%f0Z',
155 '%Y-%m-%dT%H:%M:%S.%f',
158 '%b %d %Y at %H:%M:%S',
161 DATE_FORMATS_DAY_FIRST = list(DATE_FORMATS)
162 DATE_FORMATS_DAY_FIRST.extend([
171 DATE_FORMATS_MONTH_FIRST = list(DATE_FORMATS)
172 DATE_FORMATS_MONTH_FIRST.extend([
180 PACKED_CODES_RE = r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)"
183 def preferredencoding():
184 """Get preferred encoding.
186 Returns the best encoding scheme for the system, based on
187 locale.getpreferredencoding() and some further tweaks.
190 pref = locale.getpreferredencoding()
198 def write_json_file(obj, fn):
199 """ Encode obj as JSON and write it to fn, atomically if possible """
201 fn = encodeFilename(fn)
202 if sys.version_info < (3, 0) and sys.platform != 'win32':
203 encoding = get_filesystem_encoding()
204 # os.path.basename returns a bytes object, but NamedTemporaryFile
205 # will fail if the filename contains non ascii characters unless we
206 # use a unicode object
207 path_basename = lambda f: os.path.basename(fn).decode(encoding)
208 # the same for os.path.dirname
209 path_dirname = lambda f: os.path.dirname(fn).decode(encoding)
211 path_basename = os.path.basename
212 path_dirname = os.path.dirname
216 'prefix': path_basename(fn) + '.',
217 'dir': path_dirname(fn),
221 # In Python 2.x, json.dump expects a bytestream.
222 # In Python 3.x, it writes to a character stream
223 if sys.version_info < (3, 0):
231 tf = tempfile.NamedTemporaryFile(**compat_kwargs(args))
236 if sys.platform == 'win32':
237 # Need to remove existing file on Windows, else os.rename raises
238 # WindowsError or FileExistsError.
243 os.rename(tf.name, fn)
252 if sys.version_info >= (2, 7):
253 def find_xpath_attr(node, xpath, key, val=None):
254 """ Find the xpath xpath[@key=val] """
255 assert re.match(r'^[a-zA-Z_-]+$', key)
256 expr = xpath + ('[@%s]' % key if val is None else "[@%s='%s']" % (key, val))
257 return node.find(expr)
259 def find_xpath_attr(node, xpath, key, val=None):
260 for f in node.findall(compat_xpath(xpath)):
261 if key not in f.attrib:
263 if val is None or f.attrib.get(key) == val:
267 # On python2.6 the xml.etree.ElementTree.Element methods don't support
268 # the namespace parameter
271 def xpath_with_ns(path, ns_map):
272 components = [c.split(':') for c in path.split('/')]
276 replaced.append(c[0])
279 replaced.append('{%s}%s' % (ns_map[ns], tag))
280 return '/'.join(replaced)
283 def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
284 def _find_xpath(xpath):
285 return node.find(compat_xpath(xpath))
287 if isinstance(xpath, (str, compat_str)):
288 n = _find_xpath(xpath)
296 if default is not NO_DEFAULT:
299 name = xpath if name is None else name
300 raise ExtractorError('Could not find XML element %s' % name)
306 def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
307 n = xpath_element(node, xpath, name, fatal=fatal, default=default)
308 if n is None or n == default:
311 if default is not NO_DEFAULT:
314 name = xpath if name is None else name
315 raise ExtractorError('Could not find XML element\'s text %s' % name)
321 def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
322 n = find_xpath_attr(node, xpath, key)
324 if default is not NO_DEFAULT:
327 name = '%s[@%s]' % (xpath, key) if name is None else name
328 raise ExtractorError('Could not find XML attribute %s' % name)
def get_element_by_id(id, html):
    """Return the content of the first tag whose id attribute equals *id*."""
    # An id lookup is simply an attribute lookup on 'id'.
    return get_element_by_attribute('id', id, html)
def get_element_by_class(class_name, html):
    """Return the content of the first tag carrying *class_name*, or None."""
    matches = get_elements_by_class(class_name, html)
    if not matches:
        return None
    return matches[0]
def get_element_by_attribute(attribute, value, html, escape_value=True):
    """Return the content of the first tag matching attribute=value, or None."""
    matches = get_elements_by_attribute(attribute, value, html, escape_value)
    if not matches:
        return None
    return matches[0]
def get_elements_by_class(class_name, html):
    """Return the content of all tags whose class attribute contains
    *class_name* as a whole word, as a list."""
    # Match class_name as a complete token inside the (quoted) class value;
    # the surrounding [^'"]* keeps the match within one attribute value.
    class_value_re = r'[^\'"]*\b%s\b[^\'"]*' % re.escape(class_name)
    return get_elements_by_attribute('class', class_value_re, html, escape_value=False)
357 def get_elements_by_attribute(attribute, value, html, escape_value=True):
358 """Return the content of the tag with the specified attribute in the passed HTML document"""
360 value = re.escape(value) if escape_value else value
363 for m in re.finditer(r'''(?xs)
365 (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'))*?
367 (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'))*?
371 ''' % (re.escape(attribute), value), html):
372 res = m.group('content')
374 if res.startswith('"') or res.startswith("'"):
377 retlist.append(unescapeHTML(res))
382 class HTMLAttributeParser(compat_HTMLParser):
383 """Trivial HTML parser to gather the attributes for a single element"""
386 compat_HTMLParser.__init__(self)
388 def handle_starttag(self, tag, attrs):
389 self.attrs = dict(attrs)
392 def extract_attributes(html_element):
393 """Given a string for an HTML element such as
395 a="foo" B="bar" c="&98;az" d=boz
396 empty= noval entity="&"
399 Decode and return a dictionary of attributes.
401 'a': 'foo', 'b': 'bar', c: 'baz', d: 'boz',
402 'empty': '', 'noval': None, 'entity': '&',
403 'sq': '"', 'dq': '\''
405 NB HTMLParser is stricter in Python 2.6 & 3.2 than in later versions,
406 but the cases in the unit test will work for all of 2.6, 2.7, 3.2-3.5.
408 parser = HTMLAttributeParser()
409 parser.feed(html_element)
414 def clean_html(html):
415 """Clean an HTML snippet into a readable string"""
417 if html is None: # Convenience for sanitizing descriptions etc.
421 html = html.replace('\n', ' ')
422 html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
423 html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
425 html = re.sub('<.*?>', '', html)
426 # Replace html entities
427 html = unescapeHTML(html)
431 def sanitize_open(filename, open_mode):
432 """Try to open the given filename, and slightly tweak it if this fails.
434 Attempts to open the given filename. If this fails, it tries to change
435 the filename slightly, step by step, until it's either able to open it
436 or it fails and raises a final exception, like the standard open()
439 It returns the tuple (stream, definitive_file_name).
443 if sys.platform == 'win32':
445 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
446 return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
447 stream = open(encodeFilename(filename), open_mode)
448 return (stream, filename)
449 except (IOError, OSError) as err:
450 if err.errno in (errno.EACCES,):
453 # In case of error, try to remove win32 forbidden chars
454 alt_filename = sanitize_path(filename)
455 if alt_filename == filename:
458 # An exception here should be caught in the caller
459 stream = open(encodeFilename(alt_filename), open_mode)
460 return (stream, alt_filename)
def timeconvert(timestr):
    """Convert an RFC 2822 date/time string into a system (UNIX) timestamp.

    Returns None when *timestr* cannot be parsed.
    """
    # This copy lost the default initialization and the final return,
    # so a successfully computed timestamp was never returned.
    timestamp = None
    timetuple = email.utils.parsedate_tz(timestr)
    if timetuple is not None:
        timestamp = email.utils.mktime_tz(timetuple)
    return timestamp
472 def sanitize_filename(s, restricted=False, is_id=False):
473 """Sanitizes a string so it could be used as part of a filename.
474 If restricted is set, use a stricter subset of allowed characters.
475 Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
477 def replace_insane(char):
478 if restricted and char in ACCENT_CHARS:
479 return ACCENT_CHARS[char]
480 if char == '?' or ord(char) < 32 or ord(char) == 127:
483 return '' if restricted else '\''
485 return '_-' if restricted else ' -'
486 elif char in '\\/|*<>':
488 if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
490 if restricted and ord(char) > 127:
495 s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)
496 result = ''.join(map(replace_insane, s))
498 while '__' in result:
499 result = result.replace('__', '_')
500 result = result.strip('_')
501 # Common case of "Foreign band name - English song title"
502 if restricted and result.startswith('-_'):
504 if result.startswith('-'):
505 result = '_' + result[len('-'):]
506 result = result.lstrip('.')
512 def sanitize_path(s):
513 """Sanitizes and normalizes path on Windows"""
514 if sys.platform != 'win32':
516 drive_or_unc, _ = os.path.splitdrive(s)
517 if sys.version_info < (2, 7) and not drive_or_unc:
518 drive_or_unc, _ = os.path.splitunc(s)
519 norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
523 path_part if path_part in ['.', '..'] else re.sub(r'(?:[/<>:"\|\\?\*]|[\s.]$)', '#', path_part)
524 for path_part in norm_path]
526 sanitized_path.insert(0, drive_or_unc + os.path.sep)
527 return os.path.join(*sanitized_path)
530 # Prepend protocol-less URLs with `http:` scheme in order to mitigate the number of
531 # unwanted failures due to missing protocol
def sanitize_url(url):
    """Prefix scheme-relative ('//...') URLs with 'http:'; leave others untouched."""
    if url.startswith('//'):
        return 'http:%s' % url
    return url
def sanitized_Request(url, *args, **kwargs):
    """Build a urllib Request, normalizing scheme-relative URLs first."""
    safe_url = sanitize_url(url)
    return compat_urllib_request.Request(safe_url, *args, **kwargs)
540 def orderedSet(iterable):
541 """ Remove all duplicates from the input iterable """
549 def _htmlentity_transform(entity_with_semicolon):
550 """Transforms an HTML entity to a character."""
551 entity = entity_with_semicolon[:-1]
553 # Known non-numeric HTML entity
554 if entity in compat_html_entities.name2codepoint:
555 return compat_chr(compat_html_entities.name2codepoint[entity])
557 # TODO: HTML5 allows entities without a semicolon. For example,
    # '&Eacute;ric' should be decoded as 'Éric'.
559 if entity_with_semicolon in compat_html_entities_html5:
560 return compat_html_entities_html5[entity_with_semicolon]
562 mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
564 numstr = mobj.group(1)
565 if numstr.startswith('x'):
567 numstr = '0%s' % numstr
570 # See https://github.com/rg3/youtube-dl/issues/7518
572 return compat_chr(int(numstr, base))
576 # Unknown entity in name, return its literal representation
577 return '&%s;' % entity
583 assert type(s) == compat_str
586 r'&([^;]+;)', lambda m: _htmlentity_transform(m.group(1)), s)
589 def get_subprocess_encoding():
590 if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
591 # For subprocess calls, encode with locale encoding
592 # Refer to http://stackoverflow.com/a/9951851/35070
593 encoding = preferredencoding()
595 encoding = sys.getfilesystemencoding()
601 def encodeFilename(s, for_subprocess=False):
603 @param s The name of the file
606 assert type(s) == compat_str
608 # Python 3 has a Unicode API
609 if sys.version_info >= (3, 0):
612 # Pass '' directly to use Unicode APIs on Windows 2000 and up
613 # (Detecting Windows NT 4 is tricky because 'major >= 4' would
614 # match Windows 9x series as well. Besides, NT 4 is obsolete.)
615 if not for_subprocess and sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
618 # Jython assumes filenames are Unicode strings though reported as Python 2.x compatible
619 if sys.platform.startswith('java'):
622 return s.encode(get_subprocess_encoding(), 'ignore')
625 def decodeFilename(b, for_subprocess=False):
627 if sys.version_info >= (3, 0):
630 if not isinstance(b, bytes):
633 return b.decode(get_subprocess_encoding(), 'ignore')
636 def encodeArgument(s):
637 if not isinstance(s, compat_str):
638 # Legacy code that uses byte strings
639 # Uncomment the following line after fixing all post processors
640 # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
641 s = s.decode('ascii')
642 return encodeFilename(s, True)
645 def decodeArgument(b):
646 return decodeFilename(b, True)
649 def decodeOption(optval):
652 if isinstance(optval, bytes):
653 optval = optval.decode(preferredencoding())
655 assert isinstance(optval, compat_str)
659 def formatSeconds(secs):
661 return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
663 return '%d:%02d' % (secs // 60, secs % 60)
668 def make_HTTPS_handler(params, **kwargs):
669 opts_no_check_certificate = params.get('nocheckcertificate', False)
670 if hasattr(ssl, 'create_default_context'): # Python >= 3.4 or 2.7.9
671 context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH)
672 if opts_no_check_certificate:
673 context.check_hostname = False
674 context.verify_mode = ssl.CERT_NONE
676 return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
679 # (create_default_context present but HTTPSHandler has no context=)
682 if sys.version_info < (3, 2):
683 return YoutubeDLHTTPSHandler(params, **kwargs)
685 context = ssl.SSLContext(ssl.PROTOCOL_TLSv1)
686 context.verify_mode = (ssl.CERT_NONE
687 if opts_no_check_certificate
688 else ssl.CERT_REQUIRED)
689 context.set_default_verify_paths()
690 return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
693 def bug_reports_message():
694 if ytdl_is_updateable():
695 update_cmd = 'type youtube-dl -U to update'
697 update_cmd = 'see https://yt-dl.org/update on how to update'
698 msg = '; please report this issue on https://yt-dl.org/bug .'
699 msg += ' Make sure you are using the latest version; %s.' % update_cmd
700 msg += ' Be sure to call youtube-dl with the --verbose flag and include its complete output.'
704 class ExtractorError(Exception):
705 """Error during info extraction."""
707 def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
708 """ tb, if given, is the original traceback (so that it can be printed out).
709 If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
712 if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
714 if video_id is not None:
715 msg = video_id + ': ' + msg
717 msg += ' (caused by %r)' % cause
719 msg += bug_reports_message()
720 super(ExtractorError, self).__init__(msg)
723 self.exc_info = sys.exc_info() # preserve original exception
725 self.video_id = video_id
727 def format_traceback(self):
728 if self.traceback is None:
730 return ''.join(traceback.format_tb(self.traceback))
class UnsupportedError(ExtractorError):
    """Raised when no extractor can handle the given URL.

    Always constructed as an 'expected' error, i.e. not a youtube-dl bug.
    """

    def __init__(self, url):
        message = 'Unsupported URL: %s' % url
        super(UnsupportedError, self).__init__(message, expected=True)
740 class RegexNotFoundError(ExtractorError):
741 """Error when a regex didn't match"""
745 class DownloadError(Exception):
746 """Download Error exception.
748 This exception may be thrown by FileDownloader objects if they are not
749 configured to continue on errors. They will contain the appropriate
753 def __init__(self, msg, exc_info=None):
754 """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
755 super(DownloadError, self).__init__(msg)
756 self.exc_info = exc_info
759 class SameFileError(Exception):
760 """Same File exception.
762 This exception will be thrown by FileDownloader objects if they detect
763 multiple files would have to be downloaded to the same file on disk.
768 class PostProcessingError(Exception):
769 """Post Processing exception.
771 This exception may be raised by PostProcessor's .run() method to
772 indicate an error in the postprocessing task.
775 def __init__(self, msg):
779 class MaxDownloadsReached(Exception):
780 """ --max-downloads limit has been reached. """
784 class UnavailableVideoError(Exception):
785 """Unavailable Format exception.
787 This exception will be thrown when a video is requested
788 in a format that is not available for that video.
793 class ContentTooShortError(Exception):
794 """Content Too Short exception.
796 This exception may be raised by FileDownloader objects when a file they
797 download is too small for what the server announced first, indicating
798 the connection was probably interrupted.
801 def __init__(self, downloaded, expected):
803 self.downloaded = downloaded
804 self.expected = expected
class XAttrMetadataError(Exception):
    """Raised when reading/writing extended file attributes fails.

    self.reason classifies the failure so callers can react:
      NO_SPACE       - filesystem full or disk quota exhausted
      VALUE_TOO_LONG - attribute value exceeds the OS limit
      NOT_SUPPORTED  - anything else
    """

    def __init__(self, code=None, msg='Unknown error'):
        super(XAttrMetadataError, self).__init__(msg)
        self.code = code
        self.msg = msg

        # Classify from errno when available, falling back to message text.
        # Fix: the OS message is 'Disk quota exceeded'; the historical typo
        # 'excedded' never matched, so such errors fell through to
        # NOT_SUPPORTED. Both spellings are checked for safety.
        if (self.code in (errno.ENOSPC, errno.EDQUOT) or
                'No space left' in self.msg or
                'Disk quota exceeded' in self.msg or
                'Disk quota excedded' in self.msg):
            self.reason = 'NO_SPACE'
        elif self.code == errno.E2BIG or 'Argument list too long' in self.msg:
            self.reason = 'VALUE_TOO_LONG'
        else:
            self.reason = 'NOT_SUPPORTED'
823 class XAttrUnavailableError(Exception):
827 def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
828 # Working around python 2 bug (see http://bugs.python.org/issue17849) by limiting
829 # expected HTTP responses to meet HTTP/1.0 or later (see also
830 # https://github.com/rg3/youtube-dl/issues/6727)
831 if sys.version_info < (3, 0):
832 kwargs[b'strict'] = True
833 hc = http_class(*args, **kwargs)
834 source_address = ydl_handler._params.get('source_address')
835 if source_address is not None:
836 sa = (source_address, 0)
837 if hasattr(hc, 'source_address'): # Python 2.7+
838 hc.source_address = sa
840 def _hc_connect(self, *args, **kwargs):
841 sock = compat_socket_create_connection(
842 (self.host, self.port), self.timeout, sa)
844 self.sock = ssl.wrap_socket(
845 sock, self.key_file, self.cert_file,
846 ssl_version=ssl.PROTOCOL_TLSv1)
849 hc.connect = functools.partial(_hc_connect, hc)
def handle_youtubedl_headers(headers):
    """Strip the internal 'Youtubedl-no-compression' pseudo-header.

    When the marker is present, drop it together with any Accept-Encoding
    header (case-insensitive) so the server sends an uncompressed body.
    The original mapping is returned untouched when the marker is absent.
    """
    if 'Youtubedl-no-compression' not in headers:
        return headers
    filtered = dict((k, v) for k, v in headers.items() if k.lower() != 'accept-encoding')
    del filtered['Youtubedl-no-compression']
    return filtered
864 class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
865 """Handler for HTTP requests and responses.
867 This class, when installed with an OpenerDirector, automatically adds
868 the standard headers to every HTTP request and handles gzipped and
869 deflated responses from web servers. If compression is to be avoided in
870 a particular request, the original request in the program code only has
871 to include the HTTP header "Youtubedl-no-compression", which will be
872 removed before making the real request.
874 Part of this code was copied from:
876 http://techknack.net/python-urllib2-handlers/
878 Andrew Rowls, the author of that code, agreed to release it to the
882 def __init__(self, params, *args, **kwargs):
883 compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
884 self._params = params
886 def http_open(self, req):
887 conn_class = compat_http_client.HTTPConnection
889 socks_proxy = req.headers.get('Ytdl-socks-proxy')
891 conn_class = make_socks_conn_class(conn_class, socks_proxy)
892 del req.headers['Ytdl-socks-proxy']
894 return self.do_open(functools.partial(
895 _create_http_connection, self, conn_class, False),
901 return zlib.decompress(data, -zlib.MAX_WBITS)
903 return zlib.decompress(data)
906 def addinfourl_wrapper(stream, headers, url, code):
907 if hasattr(compat_urllib_request.addinfourl, 'getcode'):
908 return compat_urllib_request.addinfourl(stream, headers, url, code)
909 ret = compat_urllib_request.addinfourl(stream, headers, url)
913 def http_request(self, req):
914 # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
915 # always respected by websites, some tend to give out URLs with non percent-encoded
916 # non-ASCII characters (see telemb.py, ard.py [#3412])
917 # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
918 # To work around aforementioned issue we will replace request's original URL with
919 # percent-encoded one
920 # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
921 # the code of this workaround has been moved here from YoutubeDL.urlopen()
922 url = req.get_full_url()
923 url_escaped = escape_url(url)
925 # Substitute URL if any change after escaping
926 if url != url_escaped:
927 req = update_Request(req, url=url_escaped)
929 for h, v in std_headers.items():
930 # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
931 # The dict keys are capitalized because of this bug by urllib
932 if h.capitalize() not in req.headers:
935 req.headers = handle_youtubedl_headers(req.headers)
937 if sys.version_info < (2, 7) and '#' in req.get_full_url():
938 # Python 2.6 is brain-dead when it comes to fragments
939 req._Request__original = req._Request__original.partition('#')[0]
940 req._Request__r_type = req._Request__r_type.partition('#')[0]
944 def http_response(self, req, resp):
947 if resp.headers.get('Content-encoding', '') == 'gzip':
948 content = resp.read()
949 gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
951 uncompressed = io.BytesIO(gz.read())
952 except IOError as original_ioerror:
                # There may be junk at the end of the file
954 # See http://stackoverflow.com/q/4928560/35070 for details
955 for i in range(1, 1024):
957 gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
958 uncompressed = io.BytesIO(gz.read())
963 raise original_ioerror
964 resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
965 resp.msg = old_resp.msg
966 del resp.headers['Content-encoding']
968 if resp.headers.get('Content-encoding', '') == 'deflate':
969 gz = io.BytesIO(self.deflate(resp.read()))
970 resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
971 resp.msg = old_resp.msg
972 del resp.headers['Content-encoding']
973 # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
974 # https://github.com/rg3/youtube-dl/issues/6457).
975 if 300 <= resp.code < 400:
976 location = resp.headers.get('Location')
978 # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
979 if sys.version_info >= (3, 0):
980 location = location.encode('iso-8859-1').decode('utf-8')
982 location = location.decode('utf-8')
983 location_escaped = escape_url(location)
984 if location != location_escaped:
985 del resp.headers['Location']
986 if sys.version_info < (3, 0):
987 location_escaped = location_escaped.encode('utf-8')
988 resp.headers['Location'] = location_escaped
991 https_request = http_request
992 https_response = http_response
995 def make_socks_conn_class(base_class, socks_proxy):
996 assert issubclass(base_class, (
997 compat_http_client.HTTPConnection, compat_http_client.HTTPSConnection))
999 url_components = compat_urlparse.urlparse(socks_proxy)
1000 if url_components.scheme.lower() == 'socks5':
1001 socks_type = ProxyType.SOCKS5
1002 elif url_components.scheme.lower() in ('socks', 'socks4'):
1003 socks_type = ProxyType.SOCKS4
1004 elif url_components.scheme.lower() == 'socks4a':
1005 socks_type = ProxyType.SOCKS4A
1007 def unquote_if_non_empty(s):
1010 return compat_urllib_parse_unquote_plus(s)
1014 url_components.hostname, url_components.port or 1080,
1016 unquote_if_non_empty(url_components.username),
1017 unquote_if_non_empty(url_components.password),
1020 class SocksConnection(base_class):
1022 self.sock = sockssocket()
1023 self.sock.setproxy(*proxy_args)
1024 if type(self.timeout) in (int, float):
1025 self.sock.settimeout(self.timeout)
1026 self.sock.connect((self.host, self.port))
1028 if isinstance(self, compat_http_client.HTTPSConnection):
1029 if hasattr(self, '_context'): # Python > 2.6
1030 self.sock = self._context.wrap_socket(
1031 self.sock, server_hostname=self.host)
1033 self.sock = ssl.wrap_socket(self.sock)
1035 return SocksConnection
1038 class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
1039 def __init__(self, params, https_conn_class=None, *args, **kwargs):
1040 compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
1041 self._https_conn_class = https_conn_class or compat_http_client.HTTPSConnection
1042 self._params = params
1044 def https_open(self, req):
1046 conn_class = self._https_conn_class
1048 if hasattr(self, '_context'): # python > 2.6
1049 kwargs['context'] = self._context
1050 if hasattr(self, '_check_hostname'): # python 3.x
1051 kwargs['check_hostname'] = self._check_hostname
1053 socks_proxy = req.headers.get('Ytdl-socks-proxy')
1055 conn_class = make_socks_conn_class(conn_class, socks_proxy)
1056 del req.headers['Ytdl-socks-proxy']
1058 return self.do_open(functools.partial(
1059 _create_http_connection, self, conn_class, True),
1063 class YoutubeDLCookieProcessor(compat_urllib_request.HTTPCookieProcessor):
1064 def __init__(self, cookiejar=None):
1065 compat_urllib_request.HTTPCookieProcessor.__init__(self, cookiejar)
1067 def http_response(self, request, response):
1068 # Python 2 will choke on next HTTP request in row if there are non-ASCII
1069 # characters in Set-Cookie HTTP header of last response (see
1070 # https://github.com/rg3/youtube-dl/issues/6769).
1071 # In order to at least prevent crashing we will percent encode Set-Cookie
1072 # header before HTTPCookieProcessor starts processing it.
1073 # if sys.version_info < (3, 0) and response.headers:
1074 # for set_cookie_header in ('Set-Cookie', 'Set-Cookie2'):
1075 # set_cookie = response.headers.get(set_cookie_header)
1077 # set_cookie_escaped = compat_urllib_parse.quote(set_cookie, b"%/;:@&=+$,!~*'()?#[] ")
1078 # if set_cookie != set_cookie_escaped:
1079 # del response.headers[set_cookie_header]
1080 # response.headers[set_cookie_header] = set_cookie_escaped
1081 return compat_urllib_request.HTTPCookieProcessor.http_response(self, request, response)
1083 https_request = compat_urllib_request.HTTPCookieProcessor.http_request
1084 https_response = http_response
def extract_timezone(date_str):
    """Split a trailing timezone designator off *date_str*.

    Returns (utc_offset, remainder) where utc_offset is a datetime.timedelta
    (zero for 'Z', for an unsigned designator, or when none is found) and
    remainder is date_str with any recognized designator removed.
    """
    # This copy lost the 'm = re.search(' binding and the if/else guards
    # around the match object; restore them so a missing designator does
    # not crash on m.group().
    m = re.search(
        r'^.{8,}?(?P<tz>Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
        date_str)
    if not m:
        timezone = datetime.timedelta()
    else:
        date_str = date_str[:-len(m.group('tz'))]
        if not m.group('sign'):
            timezone = datetime.timedelta()
        else:
            sign = 1 if m.group('sign') == '+' else -1
            timezone = datetime.timedelta(
                hours=sign * int(m.group('hours')),
                minutes=sign * int(m.group('minutes')))
    return timezone, date_str
1105 def parse_iso8601(date_str, delimiter='T', timezone=None):
1106 """ Return a UNIX timestamp from the given date """
1108 if date_str is None:
1111 date_str = re.sub(r'\.[0-9]+', '', date_str)
1113 if timezone is None:
1114 timezone, date_str = extract_timezone(date_str)
1117 date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
1118 dt = datetime.datetime.strptime(date_str, date_format) - timezone
1119 return calendar.timegm(dt.timetuple())
def date_formats(day_first=True):
    """Return the strptime format list matching the expected day/month order."""
    if day_first:
        return DATE_FORMATS_DAY_FIRST
    return DATE_FORMATS_MONTH_FIRST
1128 def unified_strdate(date_str, day_first=True):
1129 """Return a string with the date in the format YYYYMMDD"""
1131 if date_str is None:
1135 date_str = date_str.replace(',', ' ')
1136 # Remove AM/PM + timezone
1137 date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
1138 _, date_str = extract_timezone(date_str)
1140 for expression in date_formats(day_first):
1142 upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
1145 if upload_date is None:
1146 timetuple = email.utils.parsedate_tz(date_str)
1149 upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
1152 if upload_date is not None:
1153 return compat_str(upload_date)
1156 def unified_timestamp(date_str, day_first=True):
1157 if date_str is None:
1160 date_str = date_str.replace(',', ' ')
1162 pm_delta = 12 if re.search(r'(?i)PM', date_str) else 0
1163 timezone, date_str = extract_timezone(date_str)
1165 # Remove AM/PM + timezone
1166 date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
1168 for expression in date_formats(day_first):
1170 dt = datetime.datetime.strptime(date_str, expression) - timezone + datetime.timedelta(hours=pm_delta)
1171 return calendar.timegm(dt.timetuple())
1174 timetuple = email.utils.parsedate_tz(date_str)
1176 return calendar.timegm(timetuple) + pm_delta * 3600
1179 def determine_ext(url, default_ext='unknown_video'):
1182 guess = url.partition('?')[0].rpartition('.')[2]
1183 if re.match(r'^[A-Za-z0-9]+$', guess):
1185 # Try extract ext from URLs like http://example.com/foo/bar.mp4/?download
1186 elif guess.rstrip('/') in KNOWN_EXTENSIONS:
1187 return guess.rstrip('/')
def subtitles_filename(filename, sub_lang, sub_format):
    """Build the subtitle file name <base>.<lang>.<format> from a media file name."""
    base = filename.rsplit('.', 1)[0]
    return '.'.join((base, sub_lang, sub_format))
1196 def date_from_str(date_str):
1198 Return a datetime object from a string in the format YYYYMMDD or
1199 (now|today)[+-][0-9](day|week|month|year)(s)?"""
1200 today = datetime.date.today()
1201 if date_str in ('now', 'today'):
1203 if date_str == 'yesterday':
1204 return today - datetime.timedelta(days=1)
1205 match = re.match(r'(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
1206 if match is not None:
1207 sign = match.group('sign')
1208 time = int(match.group('time'))
1211 unit = match.group('unit')
1212 # A bad approximation?
1216 elif unit == 'year':
1220 delta = datetime.timedelta(**{unit: time})
1221 return today + delta
1222 return datetime.datetime.strptime(date_str, '%Y%m%d').date()
def hyphenate_date(date_str):
    """Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format.

    Strings that are not exactly eight digits are returned unchanged.
    """
    match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
    if match is not None:
        return '-'.join(match.groups())
    # Fix: pass non-matching input through instead of implicitly
    # returning None, which callers would propagate as a missing value.
    return date_str
class DateRange(object):
    """Represents a time interval between two dates"""
    def __init__(self, start=None, end=None):
        """start and end must be strings in the format accepted by date"""
        if start is not None:
            self.start = date_from_str(start)
            # Open-ended start: earliest representable date.
            self.start = datetime.datetime.min.date()
            self.end = date_from_str(end)
            # Open-ended end: latest representable date.
            self.end = datetime.datetime.max.date()
        if self.start > self.end:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)
        """Returns a range that only contains the given day"""
        return cls(day, day)
    def __contains__(self, date):
        """Check if the date is in the range"""
        if not isinstance(date, datetime.date):
            date = date_from_str(date)
        return self.start <= date <= self.end
        # Human-readable "start - end" representation (ISO dates).
        return '%s - %s' % (self.start.isoformat(), self.end.isoformat())
def platform_name():
    """ Returns the platform name as a compat_str """
    res = platform.platform()
    if isinstance(res, bytes):
        # Python 2 may return bytes; decode with the locale's preferred encoding.
        res = res.decode(preferredencoding())
    assert isinstance(res, compat_str)
def _windows_write_string(s, out):
    """ Returns True if the string was written using special methods,
    False if it has yet to be written out."""
    # Adapted from http://stackoverflow.com/a/3259271/35070
    import ctypes.wintypes
        fileno = out.fileno()
    except AttributeError:
        # If the output stream doesn't have a fileno, it's virtual
    except io.UnsupportedOperation:
        # Some strange Windows pseudo files?
    if fileno not in WIN_OUTPUT_IDS:
    # Resolve the Win32 console handle for this stream.
    GetStdHandle = ctypes.WINFUNCTYPE(
        ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
        (b'GetStdHandle', ctypes.windll.kernel32))
    h = GetStdHandle(WIN_OUTPUT_IDS[fileno])
    WriteConsoleW = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
        ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
        ctypes.wintypes.LPVOID)((b'WriteConsoleW', ctypes.windll.kernel32))
    written = ctypes.wintypes.DWORD(0)
    GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)((b'GetFileType', ctypes.windll.kernel32))
    FILE_TYPE_CHAR = 0x0002
    FILE_TYPE_REMOTE = 0x8000
    GetConsoleMode = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
        ctypes.POINTER(ctypes.wintypes.DWORD))(
        (b'GetConsoleMode', ctypes.windll.kernel32))
    INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value
    def not_a_console(handle):
        # A handle is not a console if it is invalid, not of character type,
        # or GetConsoleMode fails on it.
        if handle == INVALID_HANDLE_VALUE or handle is None:
        return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR or
                GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)
    if not_a_console(h):
    def next_nonbmp_pos(s):
        # Index of the first character outside the Basic Multilingual Plane.
            return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
        except StopIteration:
        # Write in chunks; a non-BMP char is written alone as a surrogate pair.
        count = min(next_nonbmp_pos(s), 1024)
        ret = WriteConsoleW(
            h, s, count if count else 2, ctypes.byref(written), None)
            raise OSError('Failed to write string')
        if not count:  # We just wrote a non-BMP character
            assert written.value == 2
            assert written.value > 0
            s = s[written.value:]
def write_string(s, out=None, encoding=None):
    """Write the text *s* to *out*, handling Windows consoles and byte streams."""
    assert type(s) == compat_str
    if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
        if _windows_write_string(s, out):
    if ('b' in getattr(out, 'mode', '') or
            sys.version_info[0] < 3):  # Python 2 lies about mode of sys.stderr
        byt = s.encode(encoding or preferredencoding(), 'ignore')
    elif hasattr(out, 'buffer'):
        # Text stream with an underlying binary buffer: encode explicitly.
        enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
        byt = s.encode(enc, 'ignore')
        out.buffer.write(byt)
def bytes_to_intlist(bs):
    """Convert a bytes-like object to a list of integer byte values."""
    if isinstance(bs[0], int):  # Python 3
    # Python 2: items are one-char strings, convert via ord().
    return [ord(c) for c in bs]
def intlist_to_bytes(xs):
    """Pack a list of integer byte values back into a bytes object."""
    return compat_struct_pack('%dB' % len(xs), *xs)
# Cross-platform file locking
if sys.platform == 'win32':
    import ctypes.wintypes

    # OVERLAPPED structure required by LockFileEx/UnlockFileEx.
    class OVERLAPPED(ctypes.Structure):
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),

    kernel32 = ctypes.windll.kernel32
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    # Lock the whole file (low/high 32-bit halves of the byte count).
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive):
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)
        handle = msvcrt.get_osfhandle(f.fileno())
        # 0x2 = LOCKFILE_EXCLUSIVE_LOCK.
        if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
                          whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Locking file failed: %r' % ctypes.FormatError())

    def _unlock_file(f):
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        if not UnlockFileEx(handle, 0,
                            whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())

    # Some platforms, such as Jython, are missing fcntl
        def _lock_file(f, exclusive):
            fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)

        def _unlock_file(f):
            fcntl.flock(f, fcntl.LOCK_UN)

        UNSUPPORTED_MSG = 'file locking is not supported on this platform'

        def _lock_file(f, exclusive):
            raise IOError(UNSUPPORTED_MSG)

        def _unlock_file(f):
            raise IOError(UNSUPPORTED_MSG)
class locked_file(object):
    """File wrapper that holds a platform lock for the duration of a with-block."""
    def __init__(self, filename, mode, encoding=None):
        assert mode in ['r', 'a', 'w']
        self.f = io.open(filename, mode, encoding=encoding)

    def __enter__(self):
        # Writing modes take an exclusive lock; reading takes a shared one.
        exclusive = self.mode != 'r'
            _lock_file(self.f, exclusive)

    def __exit__(self, etype, value, traceback):
            _unlock_file(self.f)

    def write(self, *args):
        return self.f.write(*args)

    def read(self, *args):
        return self.f.read(*args)
1494 def get_filesystem_encoding():
1495 encoding = sys.getfilesystemencoding()
1496 return encoding if encoding is not None else 'utf-8'
def shell_quote(args):
    """Quote each argument for safe display as a shell command line."""
    encoding = get_filesystem_encoding()
        if isinstance(a, bytes):
            # We may get a filename encoded with 'encodeFilename'
            a = a.decode(encoding)
        quoted_args.append(pipes.quote(a))
    return ' '.join(quoted_args)
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use. """
    # Merge any data already smuggled into the URL.
    url, idata = unsmuggle_url(url, {})
    sdata = compat_urllib_parse_urlencode(
        {'__youtubedl_smuggle': json.dumps(data)})
    return url + '#' + sdata
def unsmuggle_url(smug_url, default=None):
    """Extract (url, data) previously packed by smuggle_url; *default* if none."""
    if '#__youtubedl_smuggle' not in smug_url:
        return smug_url, default
    url, _, sdata = smug_url.rpartition('#')
    jsond = compat_parse_qs(sdata)['__youtubedl_smuggle'][0]
    data = json.loads(jsond)
def format_bytes(bytes):
    """Format a byte count with a binary-prefix suffix, e.g. '1.50MiB'."""
    # NOTE(review): the parameter shadows the builtin name 'bytes'.
    if type(bytes) is str:
        bytes = float(bytes)
    exponent = int(math.log(bytes, 1024.0))
    suffix = ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB'][exponent]
    converted = float(bytes) / float(1024 ** exponent)
    return '%.2f%s' % (converted, suffix)
def lookup_unit_table(unit_table, s):
    """Parse '<number> <unit>' from *s* using *unit_table* multipliers; int result."""
    units_re = '|'.join(re.escape(u) for u in unit_table)
        r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)\b' % units_re, s)
    # Accept ',' as a decimal separator as well as '.'.
    num_str = m.group('num').replace(',', '.')
    mult = unit_table[m.group('unit')]
    return int(float(num_str) * mult)
def parse_filesize(s):
    """Parse a human-readable file size (e.g. '1.2MiB') into a number of bytes."""
    # The lower-case forms are of course incorrect and unofficial,
    # but we support those too
        'megabytes': 1000 ** 2,
        'mebibytes': 1024 ** 2,
        'gigabytes': 1000 ** 3,
        'gibibytes': 1024 ** 3,
        'terabytes': 1000 ** 4,
        'tebibytes': 1024 ** 4,
        'petabytes': 1000 ** 5,
        'pebibytes': 1024 ** 5,
        'exabytes': 1000 ** 6,
        'exbibytes': 1024 ** 6,
        'zettabytes': 1000 ** 7,
        'zebibytes': 1024 ** 7,
        'yottabytes': 1000 ** 8,
        'yobibytes': 1024 ** 8,
    return lookup_unit_table(_UNIT_TABLE, s)
    # NOTE(review): tail of parse_count; its def line is not in this view.
    if re.match(r'^[\d,.]+$', s):
        return str_to_int(s)
    return lookup_unit_table(_UNIT_TABLE, s)
def month_by_name(name, lang='en'):
    """ Return the number of a month by (locale-independently) English name """
    # Fall back to English when the requested language is unknown.
    month_names = MONTH_NAMES.get(lang, MONTH_NAMES['en'])
        return month_names.index(name) + 1
def month_by_abbreviation(abbrev):
    """ Return the number of a month by (locale-independently) English
        # Match against the three-letter prefix of each English month name.
        return [s[:3] for s in ENGLISH_MONTH_NAMES].index(abbrev) + 1
def fix_xml_ampersands(xml_str):
    """Replace all the '&' by '&amp;' in XML"""
    # Only bare ampersands are escaped; existing entities are left alone.
        r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
def setproctitle(title):
    """Best-effort: set the process title via prctl(PR_SET_NAME) on Linux."""
    assert isinstance(title, compat_str)
    # ctypes in Jython is not complete
    # http://bugs.jython.org/issue2148
    if sys.platform.startswith('java'):
        libc = ctypes.cdll.LoadLibrary('libc.so.6')
    title_bytes = title.encode('utf-8')
    buf = ctypes.create_string_buffer(len(title_bytes))
    buf.value = title_bytes
        # 15 = PR_SET_NAME
        libc.prctl(15, buf, 0, 0, 0)
    except AttributeError:
        return  # Strange libc, just skip this
def remove_start(s, start):
    """Strip *start* from the beginning of *s* when present; None passes through."""
    if s is None or not s.startswith(start):
        return s
    return s[len(start):]
def remove_end(s, end):
    """Strip *end* from the end of *s* when present; None passes through."""
    if s is not None and s.endswith(end):
        # Keep the original negative-slice form so an empty *end* behaves
        # exactly as before (s[:-0] == '').
        return s[:-len(end)]
    return s
def remove_quotes(s):
    """Strip one level of matching single or double quotes around *s*."""
    if s is None or len(s) < 2:
    for quote in ('"', "'", ):
        if s[0] == quote and s[-1] == quote:
def url_basename(url):
    """Return the last non-empty path component of *url* (query/fragment ignored)."""
    path = compat_urlparse.urlparse(url).path
    return path.rstrip('/').rpartition('/')[2]
    # NOTE(review): body of base_url; its def line is not in this view.
    return re.match(r'https?://[^?#&]+/', url).group()
def urljoin(base, path):
    """Join *path* onto *base*, tolerating None/empty and protocol-relative URLs."""
    if not isinstance(path, compat_str) or not path:
    # Absolute (possibly protocol-relative) paths are returned as-is.
    if re.match(r'^(?:https?:)?//', path):
    if not isinstance(base, compat_str) or not re.match(r'^(?:https?:)?//', base):
    return compat_urlparse.urljoin(base, path)
class HEADRequest(compat_urllib_request.Request):
    # Request subclass that issues HEAD instead of GET.
    def get_method(self):
class PUTRequest(compat_urllib_request.Request):
    # Request subclass that issues PUT.
    def get_method(self):
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    """Coerce *v* to int (optionally via attribute lookup and scaling); *default* on failure."""
        v = getattr(v, get_attr, None)
    return int(v) * invscale // scale
def str_or_none(v, default=None):
    """Return compat_str(v), or *default* when v is None."""
    if v is None:
        return default
    return compat_str(v)
def str_to_int(int_str):
    """ A more relaxed version of int_or_none """
    # Drop thousands separators and plus signs before conversion.
    int_str = re.sub(r'[,\.\+]', '', int_str)
def float_or_none(v, scale=1, invscale=1, default=None):
    """Coerce *v* to float with optional scaling; *default* on failure."""
    return float(v) * invscale / scale
def strip_or_none(v):
    """Return v.strip(), passing None through unchanged."""
    if v is None:
        return None
    return v.strip()
def parse_duration(s):
    """Parse a duration string (HH:MM:SS, '1d 2h', '3 min', ...); presumably seconds — the return is outside this view."""
    if not isinstance(s, compat_basestring):
    days, hours, mins, secs, ms = [None] * 5
    # Colon-separated form: [[[DD:]HH:]MM:]SS[.ms]
    m = re.match(r'(?:(?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?(?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?Z?$', s)
        days, hours, mins, secs, ms = m.groups()
                # Verbose form: '1 day 2 hours 3 mins 4.5 secs'
                (?P<days>[0-9]+)\s*d(?:ays?)?\s*
                (?P<hours>[0-9]+)\s*h(?:ours?)?\s*
                (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?\s*
                (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
        days, hours, mins, secs, ms = m.groups()
    # Decimal hours/minutes form: '2.5 hours', '90 min'
    m = re.match(r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)Z?$', s)
        hours, mins = m.groups()
        duration += float(secs)
        duration += float(mins) * 60
        duration += float(hours) * 60 * 60
        duration += float(days) * 24 * 60 * 60
        duration += float(ms)
def prepend_extension(filename, ext, expected_real_ext=None):
    """Insert *ext* before the real extension, or append it when the real
    extension does not match *expected_real_ext*."""
    name, real_ext = os.path.splitext(filename)
        '{0}.{1}{2}'.format(name, ext, real_ext)
        if not expected_real_ext or real_ext[1:] == expected_real_ext
        else '{0}.{1}'.format(filename, ext))
def replace_extension(filename, ext, expected_real_ext=None):
    """Replace the extension with *ext*; only strip the old one when it
    matches *expected_real_ext* (or no expectation is given)."""
    name, real_ext = os.path.splitext(filename)
    return '{0}.{1}'.format(
        name if not expected_real_ext or real_ext[1:] == expected_real_ext else filename,
def check_executable(exe, args=[]):
    """ Checks if the given binary is installed somewhere in PATH, and returns its name.
    args can be a list of arguments for a short output (like -version) """
        subprocess.Popen([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
def get_exe_version(exe, args=['--version'],
                    version_re=None, unrecognized='present'):
    """ Returns the version of the specified executable,
    or False if the executable is not present """
    # STDIN should be redirected too. On UNIX-like systems, ffmpeg triggers
    # SIGTTOU if youtube-dl is run in the background.
    # See https://github.com/rg3/youtube-dl/issues/955#issuecomment-209789656
    out, _ = subprocess.Popen(
        [encodeArgument(exe)] + args,
        stdin=subprocess.PIPE,
        stdout=subprocess.PIPE, stderr=subprocess.STDOUT).communicate()
    if isinstance(out, bytes):  # Python 2.x
        out = out.decode('ascii', 'ignore')
    return detect_exe_version(out, version_re, unrecognized)
def detect_exe_version(output, version_re=None, unrecognized='present'):
    """Extract a version string from *output* using *version_re*."""
    assert isinstance(output, compat_str)
    if version_re is None:
        version_re = r'version\s+([-0-9._a-zA-Z]+)'
    m = re.search(version_re, output)
class PagedList(object):
    # Base class for lazily-fetched paginated result lists.
    # This is only useful for tests
        return len(self.getslice())
class OnDemandPagedList(PagedList):
    """Paged list that fetches pages on demand, with an optional page cache."""
    def __init__(self, pagefunc, pagesize, use_cache=False):
        self._pagefunc = pagefunc
        self._pagesize = pagesize
        self._use_cache = use_cache

    def getslice(self, start=0, end=None):
        for pagenum in itertools.count(start // self._pagesize):
            firstid = pagenum * self._pagesize
            nextfirstid = pagenum * self._pagesize + self._pagesize
            if start >= nextfirstid:
            page_results = self._cache.get(pagenum)
            if page_results is None:
                page_results = list(self._pagefunc(pagenum))
                self._cache[pagenum] = page_results
                # Offsets of the requested slice within the current page.
                start % self._pagesize
                if firstid <= start < nextfirstid
                ((end - 1) % self._pagesize) + 1
                if (end is not None and firstid <= end <= nextfirstid)
            if startv != 0 or endv is not None:
                page_results = page_results[startv:endv]
            res.extend(page_results)
            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            if len(page_results) + startv < self._pagesize:
            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == nextfirstid:
class InAdvancePagedList(PagedList):
    """Paged list whose total page count is known up front."""
    def __init__(self, pagefunc, pagecount, pagesize):
        self._pagefunc = pagefunc
        self._pagecount = pagecount
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        start_page = start // self._pagesize
            self._pagecount if end is None else (end // self._pagesize + 1))
        # Elements to skip in the first page / total elements still wanted.
        skip_elems = start - start_page * self._pagesize
        only_more = None if end is None else end - start
        for pagenum in range(start_page, end_page):
            page = list(self._pagefunc(pagenum))
                page = page[skip_elems:]
            if only_more is not None:
                if len(page) < only_more:
                    only_more -= len(page)
                    page = page[:only_more]
def uppercase_escape(s):
    """Decode \\UXXXXXXXX escapes in *s*."""
    unicode_escape = codecs.getdecoder('unicode_escape')
        r'\\U[0-9a-fA-F]{8}',
        lambda m: unicode_escape(m.group(0))[0],
def lowercase_escape(s):
    """Decode \\uXXXX escapes in *s*."""
    unicode_escape = codecs.getdecoder('unicode_escape')
        r'\\u[0-9a-fA-F]{4}',
        lambda m: unicode_escape(m.group(0))[0],
def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986"""
    # On Python 2, percent-quoting needs UTF-8 bytes, not a unicode string.
    if isinstance(s, compat_str) and sys.version_info < (3, 0):
        s = s.encode('utf-8')
    return compat_urllib_parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")
def escape_url(url):
    """Escape URL as suggested by RFC 3986"""
    url_parsed = compat_urllib_parse_urlparse(url)
    return url_parsed._replace(
        # Hostnames are IDNA-encoded rather than percent-escaped.
        netloc=url_parsed.netloc.encode('idna').decode('ascii'),
        path=escape_rfc3986(url_parsed.path),
        params=escape_rfc3986(url_parsed.params),
        query=escape_rfc3986(url_parsed.query),
        fragment=escape_rfc3986(url_parsed.fragment)
def read_batch_urls(batch_fd):
    """Read a batch file of URLs, skipping comments and stripping a UTF-8 BOM."""
        if not isinstance(url, compat_str):
            url = url.decode('utf-8', 'replace')
        BOM_UTF8 = '\xef\xbb\xbf'
        if url.startswith(BOM_UTF8):
            url = url[len(BOM_UTF8):]
        # Lines starting with '#', ';' or ']' are comments.
        if url.startswith(('#', ';', ']')):
    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]
def urlencode_postdata(*args, **kargs):
    """URL-encode POST data and return it as ASCII bytes."""
    encoded = compat_urllib_parse_urlencode(*args, **kargs)
    return encoded.encode('ascii')
def update_url_query(url, query):
    """Merge *query* parameters into the query string of *url*."""
    parsed_url = compat_urlparse.urlparse(url)
    qs = compat_parse_qs(parsed_url.query)
    return compat_urlparse.urlunparse(parsed_url._replace(
        query=compat_urllib_parse_urlencode(qs, True)))
def update_Request(req, url=None, data=None, headers={}, query={}):
    """Rebuild *req* with updated url/data/headers/query, preserving its method."""
    req_headers = req.headers.copy()
    req_headers.update(headers)
    req_data = data or req.data
    req_url = update_url_query(url or req.get_full_url(), query)
    req_get_method = req.get_method()
    # Preserve HEAD/PUT via the corresponding Request subclasses.
    if req_get_method == 'HEAD':
        req_type = HEADRequest
    elif req_get_method == 'PUT':
        req_type = PUTRequest
        req_type = compat_urllib_request.Request
        req_url, data=req_data, headers=req_headers,
        origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
    # Timeout is not a standard Request attribute; copy it only if present.
    if hasattr(req, 'timeout'):
        new_req.timeout = req.timeout
def dict_get(d, key_or_keys, default=None, skip_false_values=True):
    """Like d.get(), but accepts a list of keys tried in order; falsy values
    are skipped unless skip_false_values is False."""
    if isinstance(key_or_keys, (list, tuple)):
        for key in key_or_keys:
            if key not in d or d[key] is None or skip_false_values and not d[key]:
    return d.get(key_or_keys, default)
def try_get(src, getter, expected_type=None):
    """Apply *getter* to *src*, swallowing common access errors; optionally
    require the result to be of *expected_type*."""
    except (AttributeError, KeyError, TypeError, IndexError):
    if expected_type is None or isinstance(v, expected_type):
def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
    """Coerce *string* to compat_str, decoding bytes with *encoding*."""
    if isinstance(string, compat_str):
        return string
    return compat_str(string, encoding, errors)
2089 TV_PARENTAL_GUIDELINES = {
def parse_age_limit(s):
    """Normalize an age limit (int, 'NN+', US rating or TV guideline) to an int."""
        return s if 0 <= s <= 21 else None
    if not isinstance(s, compat_basestring):
    m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
        return int(m.group('age'))
        return US_RATINGS[s]
    return TV_PARENTAL_GUIDELINES.get(s)
def strip_jsonp(code):
    """Strip a JSONP callback wrapper, leaving the JSON payload."""
        r'(?s)^[a-zA-Z0-9_.$]+\s*\(\s*(.*)\);?\s*?(?://[^\n]*)*$', r'\1', code)
def js_to_json(code):
    """Convert a JavaScript object literal into valid JSON text."""
    COMMENT_RE = r'/\*(?:(?!\*/).)*?\*/|//[^\n]*'
    SKIP_RE = r'\s*(?:{comment})?\s*'.format(comment=COMMENT_RE)
        # Hex and octal integer keys/values with their bases.
        (r'(?s)^(0[xX][0-9a-fA-F]+){skip}:?$'.format(skip=SKIP_RE), 16),
        (r'(?s)^(0+[0-7]+){skip}:?$'.format(skip=SKIP_RE), 8),
        if v in ('true', 'false', 'null'):
        elif v.startswith('/*') or v.startswith('//') or v == ',':
        if v[0] in ("'", '"'):
            # Normalize escapes inside quoted strings.
            v = re.sub(r'(?s)\\.|"', lambda m: {
            }.get(m.group(0), m.group(0)), v[1:-1])
        for regex, base in INTEGER_TABLE:
            im = re.match(regex, v)
                i = int(im.group(1), base)
                return '"%d":' % i if v.endswith(':') else '%d' % i
    return re.sub(r'''(?sx)
        "(?:[^"\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^"\\]*"|
        '(?:[^'\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^'\\]*'|
        {comment}|,(?={skip}[\]}}])|
        [a-zA-Z_][.a-zA-Z_0-9]*|
        \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:{skip}:)?|
        '''.format(comment=COMMENT_RE, skip=SKIP_RE), fix_kv, code)
def qualities(quality_ids):
    """ Get a numeric quality value out of a list of possible values """
        return quality_ids.index(qid)
2168 DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
def limit_length(s, length):
    """ Add ellipses to overly long strings """
    return s[:length - len(ELLIPSES)] + ELLIPSES
def version_tuple(v):
    """Split a dotted/hyphenated version string into a tuple of ints."""
    parts = re.split(r'[-.]', v)
    return tuple(map(int, parts))
def is_outdated_version(version, limit, assume_new=True):
    """Compare two version strings; *assume_new* controls unparsable input."""
        return not assume_new
        return version_tuple(version) < version_tuple(limit)
        return not assume_new
def ytdl_is_updateable():
    """ Returns if youtube-dl can be updated with -U """
    from zipimport import zipimporter
    # Updateable when running from a zip bundle or a frozen executable.
    return isinstance(globals().get('__loader__'), zipimporter) or hasattr(sys, 'frozen')
def args_to_str(args):
    # Get a short string representation for a subprocess command
    quoted = [compat_shlex_quote(arg) for arg in args]
    return ' '.join(quoted)
def error_to_compat_str(err):
    """Stringify an exception safely across Python 2/3."""
    # On python 2 error byte string must be decoded with proper
    # encoding rather than ascii
    if sys.version_info[0] < 3:
        err_str = err_str.decode(preferredencoding())
def mimetype2ext(mt):
    """Map a MIME type to the conventional file extension."""
        # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3. Here use .mp3 as
        # it's the most popular one
        'audio/mpeg': 'mp3',
    # Drop the type prefix and any parameters (e.g. '; charset=...').
    _, _, res = mt.rpartition('/')
    res = res.split(';')[0].strip().lower()
        'smptett+xml': 'tt',
        'x-mp4-fragmented': 'mp4',
        'x-mpegurl': 'm3u8',
        'vnd.apple.mpegurl': 'm3u8',
        'vnd.ms-sstr+xml': 'ism',
def parse_codecs(codecs_str):
    # http://tools.ietf.org/html/rfc6381
    splited_codecs = list(filter(None, map(
        lambda str: str.strip(), codecs_str.strip().strip(',').split(','))))
    vcodec, acodec = None, None
    for full_codec in splited_codecs:
        # The first dotted component identifies the codec family.
        codec = full_codec.split('.')[0]
        if codec in ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2', 'h263', 'h264', 'mp4v'):
        elif codec in ('mp4a', 'opus', 'vorbis', 'mp3', 'aac', 'ac-3'):
            write_string('WARNING: Unknown codec %s' % full_codec, sys.stderr)
    if not vcodec and not acodec:
        if len(splited_codecs) == 2:
        elif len(splited_codecs) == 1:
        'vcodec': vcodec or 'none',
        'acodec': acodec or 'none',
def urlhandle_detect_ext(url_handle):
    """Infer a file extension from a response's headers (Content-Disposition
    filename first, then Content-Type)."""
    getheader = url_handle.headers.get
    cd = getheader('Content-Disposition')
        m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
            e = determine_ext(m.group('filename'), default_ext=None)
    return mimetype2ext(getheader('Content-Type'))
def encode_data_uri(data, mime_type):
    """Build an RFC 2397 'data:' URI carrying *data* base64-encoded."""
    payload = base64.b64encode(data).decode('ascii')
    return 'data:%s;base64,%s' % (mime_type, payload)
def age_restricted(content_limit, age_limit):
    """ Returns True iff the content should be blocked """
    if age_limit is None:  # No limit set
    if content_limit is None:
        return False  # Content available for everyone
    return age_limit < content_limit
def is_html(first_bytes):
    """ Detect whether a file contains HTML by examining its first bytes. """
        # Known byte-order marks and the encodings they imply.
        (b'\xef\xbb\xbf', 'utf-8'),
        (b'\x00\x00\xfe\xff', 'utf-32-be'),
        (b'\xff\xfe\x00\x00', 'utf-32-le'),
        (b'\xff\xfe', 'utf-16-le'),
        (b'\xfe\xff', 'utf-16-be'),
    for bom, enc in BOMS:
        if first_bytes.startswith(bom):
            s = first_bytes[len(bom):].decode(enc, 'replace')
    s = first_bytes.decode('utf-8', 'replace')
    # HTML if the first non-whitespace character is '<'.
    return re.match(r'^\s*<', s)
def determine_protocol(info_dict):
    """Determine the download protocol of a format dict (explicit field,
    URL scheme, or extension-derived)."""
    protocol = info_dict.get('protocol')
    if protocol is not None:
    url = info_dict['url']
    if url.startswith('rtmp'):
    elif url.startswith('mms'):
    elif url.startswith('rtsp'):
    ext = determine_ext(url)
    return compat_urllib_parse_urlparse(url).scheme
def render_table(header_row, data):
    """ Render a list of rows, each as a list of values """
    rows = [header_row] + data
    # Widest cell per column decides that column's width.
    widths = [max(len(compat_str(cell)) for cell in column) for column in zip(*rows)]
    # Left-align every column but let the last one run free.
    specs = ['%-' + compat_str(w + 1) + 's' for w in widths[:-1]]
    fmt = ' '.join(specs) + '%s'
    return '\n'.join(fmt % tuple(row) for row in rows)
def _match_one(filter_part, dct):
    """Evaluate a single 'key op value' or unary '!key'/'key' filter against *dct*."""
    COMPARISON_OPERATORS = {
    operator_rex = re.compile(r'''(?x)\s*
        \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
        (?P<intval>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)|
        (?P<strval>(?![0-9.])[a-z0-9A-Z]*)
        ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
    m = operator_rex.search(filter_part)
        op = COMPARISON_OPERATORS[m.group('op')]
        actual_value = dct.get(m.group('key'))
        if (m.group('strval') is not None or
                # If the original field is a string and matching comparisonvalue is
                # a number we should respect the origin of the original field
                # and process comparison value as a string (see
                # https://github.com/rg3/youtube-dl/issues/11082).
                actual_value is not None and m.group('intval') is not None and
                isinstance(actual_value, compat_str)):
            if m.group('op') not in ('=', '!='):
                    'Operator %s does not support string values!' % m.group('op'))
            comparison_value = m.group('strval') or m.group('intval')
                comparison_value = int(m.group('intval'))
                # Not a plain integer: try parsing as a file size ('500k', '1MiB').
                comparison_value = parse_filesize(m.group('intval'))
                if comparison_value is None:
                    comparison_value = parse_filesize(m.group('intval') + 'B')
                if comparison_value is None:
                        'Invalid integer value %r in filter part %r' % (
                            m.group('intval'), filter_part))
        if actual_value is None:
            return m.group('none_inclusive')
        return op(actual_value, comparison_value)
        # Unary operators: '' tests presence, '!' tests absence.
        '': lambda v: v is not None,
        '!': lambda v: v is None,
    operator_rex = re.compile(r'''(?x)\s*
        (?P<op>%s)\s*(?P<key>[a-z_]+)
        ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
    m = operator_rex.search(filter_part)
        op = UNARY_OPERATORS[m.group('op')]
        actual_value = dct.get(m.group('key'))
        return op(actual_value)
    raise ValueError('Invalid filter part %r' % filter_part)
def match_str(filter_str, dct):
    """ Filter a dictionary with a simple string syntax. Returns True (=passes filter) or false """
    # '&' joins sub-filters; all must match.
        _match_one(filter_part, dct) for filter_part in filter_str.split('&'))
def match_filter_func(filter_str):
    """Build a match-filter callable: None on match, a skip message otherwise."""
    def _match_func(info_dict):
        if match_str(filter_str, info_dict):
        video_title = info_dict.get('title', info_dict.get('id', 'video'))
        return '%s does not pass filter %s, skipping ..' % (video_title, filter_str)
def parse_dfxp_time_expr(time_expr):
    """Parse a TTML/DFXP time expression ('12.5s' or 'HH:MM:SS[.f]') into seconds."""
    mobj = re.match(r'^(?P<time_offset>\d+(?:\.\d+)?)s?$', time_expr)
        return float(mobj.group('time_offset'))
    mobj = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
        # A ':' before the fraction is treated as a decimal point.
        return 3600 * int(mobj.group(1)) + 60 * int(mobj.group(2)) + float(mobj.group(3).replace(':', '.'))
def srt_subtitles_timecode(seconds):
    """Format a duration in seconds as an SRT timecode (HH:MM:SS,mmm)."""
    hours = seconds / 3600
    minutes = (seconds % 3600) / 60
    secs = seconds % 60
    millis = (seconds % 1) * 1000
    return '%02d:%02d:%02d,%03d' % (hours, minutes, secs, millis)
def dfxp2srt(dfxp_data):
    """Convert DFXP/TTML subtitle markup into SRT text."""
    _x = functools.partial(xpath_with_ns, ns_map={
        'ttml': 'http://www.w3.org/ns/ttml',
        'ttaf1': 'http://www.w3.org/2006/10/ttaf1',
        'ttaf1_0604': 'http://www.w3.org/2006/04/ttaf1',

    class TTMLPElementParser(object):
        # Collects text content of a <p> element, turning <br> into newlines.
        def start(self, tag, attrib):
            if tag in (_x('ttml:br'), _x('ttaf1:br'), 'br'):
        def data(self, data):
            return self.out.strip()

    def parse_node(node):
        target = TTMLPElementParser()
        parser = xml.etree.ElementTree.XMLParser(target=target)
        parser.feed(xml.etree.ElementTree.tostring(node))
        return parser.close()

    dfxp = compat_etree_fromstring(dfxp_data.encode('utf-8'))
    # Try each known TTML namespace before falling back to un-namespaced <p>.
    paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall(_x('.//ttaf1:p')) or dfxp.findall(_x('.//ttaf1_0604:p')) or dfxp.findall('.//p')
        raise ValueError('Invalid dfxp/TTML subtitle')
    for para, index in zip(paras, itertools.count(1)):
        begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
        end_time = parse_dfxp_time_expr(para.attrib.get('end'))
        dur = parse_dfxp_time_expr(para.attrib.get('dur'))
        if begin_time is None:
            # End time derived from duration when not given explicitly.
            end_time = begin_time + dur
        out.append('%d\n%s --> %s\n%s\n\n' % (
            srt_subtitles_timecode(begin_time),
            srt_subtitles_timecode(end_time),
def cli_option(params, command_option, param):
    """Build ['--opt', value] for a CLI from params[param], or [] when unset."""
    param = params.get(param)
        param = compat_str(param)
    return [command_option, param] if param is not None else []
def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
    """Build a boolean CLI option, joined with *separator* when given."""
    param = params.get(param)
    assert isinstance(param, bool)
        return [command_option + separator + (true_value if param else false_value)]
    return [command_option, true_value if param else false_value]
def cli_valueless_option(params, command_option, param, expected_value=True):
    """Emit [command_option] when params[param] equals expected_value, else []."""
    if params.get(param) == expected_value:
        return [command_option]
    return []
def cli_configuration_args(params, param, default=[]):
    """Return the extra-args list stored at params[param], or *default*."""
    ex_args = params.get(param)
    assert isinstance(ex_args, list)
class ISO639Utils(object):
    # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
    def short2long(cls, code):
        """Convert language code from ISO 639-1 to ISO 639-2/T"""
        return cls._lang_map.get(code[:2])
    def long2short(cls, code):
        """Convert language code from ISO 639-2/T to ISO 639-1"""
        # Reverse lookup over the two-to-three-letter mapping.
        for short_name, long_name in cls._lang_map.items():
            if long_name == code:
2751 class ISO3166Utils(object):
2752 # From http://data.okfn.org/data/core/country-list
2754 'AF': 'Afghanistan',
2755 'AX': 'Åland Islands',
2758 'AS': 'American Samoa',
2763 'AG': 'Antigua and Barbuda',
2780 'BO': 'Bolivia, Plurinational State of',
2781 'BQ': 'Bonaire, Sint Eustatius and Saba',
2782 'BA': 'Bosnia and Herzegovina',
2784 'BV': 'Bouvet Island',
2786 'IO': 'British Indian Ocean Territory',
2787 'BN': 'Brunei Darussalam',
2789 'BF': 'Burkina Faso',
2795 'KY': 'Cayman Islands',
2796 'CF': 'Central African Republic',
2800 'CX': 'Christmas Island',
2801 'CC': 'Cocos (Keeling) Islands',
2805 'CD': 'Congo, the Democratic Republic of the',
2806 'CK': 'Cook Islands',
2808 'CI': 'Côte d\'Ivoire',
2813 'CZ': 'Czech Republic',
2817 'DO': 'Dominican Republic',
2820 'SV': 'El Salvador',
2821 'GQ': 'Equatorial Guinea',
2825 'FK': 'Falkland Islands (Malvinas)',
2826 'FO': 'Faroe Islands',
2830 'GF': 'French Guiana',
2831 'PF': 'French Polynesia',
2832 'TF': 'French Southern Territories',
2847 'GW': 'Guinea-Bissau',
2850 'HM': 'Heard Island and McDonald Islands',
2851 'VA': 'Holy See (Vatican City State)',
2858 'IR': 'Iran, Islamic Republic of',
2861 'IM': 'Isle of Man',
2871 'KP': 'Korea, Democratic People\'s Republic of',
2872 'KR': 'Korea, Republic of',
2875 'LA': 'Lao People\'s Democratic Republic',
2881 'LI': 'Liechtenstein',
2885 'MK': 'Macedonia, the Former Yugoslav Republic of',
2892 'MH': 'Marshall Islands',
2898 'FM': 'Micronesia, Federated States of',
2899 'MD': 'Moldova, Republic of',
2910 'NL': 'Netherlands',
2911 'NC': 'New Caledonia',
2912 'NZ': 'New Zealand',
2917 'NF': 'Norfolk Island',
2918 'MP': 'Northern Mariana Islands',
2923 'PS': 'Palestine, State of',
2925 'PG': 'Papua New Guinea',
2928 'PH': 'Philippines',
2932 'PR': 'Puerto Rico',
2936 'RU': 'Russian Federation',
2938 'BL': 'Saint Barthélemy',
2939 'SH': 'Saint Helena, Ascension and Tristan da Cunha',
2940 'KN': 'Saint Kitts and Nevis',
2941 'LC': 'Saint Lucia',
2942 'MF': 'Saint Martin (French part)',
2943 'PM': 'Saint Pierre and Miquelon',
2944 'VC': 'Saint Vincent and the Grenadines',
2947 'ST': 'Sao Tome and Principe',
2948 'SA': 'Saudi Arabia',
2952 'SL': 'Sierra Leone',
2954 'SX': 'Sint Maarten (Dutch part)',
2957 'SB': 'Solomon Islands',
2959 'ZA': 'South Africa',
2960 'GS': 'South Georgia and the South Sandwich Islands',
2961 'SS': 'South Sudan',
2966 'SJ': 'Svalbard and Jan Mayen',
2969 'CH': 'Switzerland',
2970 'SY': 'Syrian Arab Republic',
2971 'TW': 'Taiwan, Province of China',
2973 'TZ': 'Tanzania, United Republic of',
2975 'TL': 'Timor-Leste',
2979 'TT': 'Trinidad and Tobago',
2982 'TM': 'Turkmenistan',
2983 'TC': 'Turks and Caicos Islands',
2987 'AE': 'United Arab Emirates',
2988 'GB': 'United Kingdom',
2989 'US': 'United States',
2990 'UM': 'United States Minor Outlying Islands',
2994 'VE': 'Venezuela, Bolivarian Republic of',
2996 'VG': 'Virgin Islands, British',
2997 'VI': 'Virgin Islands, U.S.',
2998 'WF': 'Wallis and Futuna',
2999 'EH': 'Western Sahara',
def short2full(cls, code):
    """Return the full English country name for a two-letter country
    code (case-insensitive), or None if the code is unknown.

    NOTE(review): the code keys ('US', 'GB', ...) look like ISO 3166-1
    alpha-2 codes rather than ISO 3166-2 subdivision codes — confirm.
    """
    normalized = code.upper()
    return cls._country_map.get(normalized)
class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
    """ProxyHandler that lets a single request override the proxy via a
    'Ytdl-request-proxy' header instead of using the global proxy map."""

    def __init__(self, proxies=None):
        # Install default http/https openers routing through proxy_open with
        # the sentinel '__noproxy__', so requests without an explicit
        # per-request proxy are not proxied at all.
        # Loop variable renamed from 'type' to avoid shadowing the builtin;
        # the lambda parameter keeps the name expected by proxy_open.
        for scheme in ('http', 'https'):
            setattr(self, '%s_open' % scheme,
                    lambda r, proxy='__noproxy__', type=scheme, meth=self.proxy_open:
                        meth(r, proxy, type))
        # Do not 'return' the base __init__ result: __init__ must return None.
        compat_urllib_request.ProxyHandler.__init__(self, proxies)

    def proxy_open(self, req, proxy, type):
        # 'type' parameter name kept for compatibility with the base-class
        # ProxyHandler.proxy_open signature.
        req_proxy = req.headers.get('Ytdl-request-proxy')
        if req_proxy is not None:
            # Per-request proxy wins over the handler default.
            proxy = req_proxy
            del req.headers['Ytdl-request-proxy']

        if proxy == '__noproxy__':
            return None  # No Proxy
        if compat_urlparse.urlparse(proxy).scheme.lower() in ('socks', 'socks4', 'socks4a', 'socks5'):
            req.add_header('Ytdl-socks-proxy', proxy)
            # youtube-dl's http/https handlers do the actual wrapping of the
            # socket with SOCKS; nothing more to do here.
            return None
        return compat_urllib_request.ProxyHandler.proxy_open(
            self, req, proxy, type)
def ohdave_rsa_encrypt(data, exponent, modulus):
    """
    Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/

    Input:
        data: data to encrypt, bytes-like object
        exponent, modulus: parameter e and N of RSA algorithm, both integer
    Output: hex string of encrypted data

    Limitation: supports one block encryption only
    """
    # The plaintext is interpreted as a little-endian integer, hence the
    # byte reversal before hexlifying.
    plaintext_int = int(binascii.hexlify(data[::-1]), 16)
    ciphertext_int = pow(plaintext_int, exponent, modulus)
    return '%x' % ciphertext_int
def encode_base_n(num, n, table=None):
    """Encode the non-negative integer num in base n using the given digit
    table (defaults to the first n characters of 0-9, a-z, A-Z)."""
    FULL_TABLE = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
    if not table:
        table = FULL_TABLE[:n]

    if n > len(table):
        raise ValueError('base %d exceeds table length %d' % (n, len(table)))

    if num == 0:
        return table[0]

    # Collect digits least-significant first, then join in reverse, instead
    # of repeatedly prepending to a string.
    digits = []
    while num:
        digits.append(table[num % n])
        num //= n
    return ''.join(reversed(digits))
def decode_packed_codes(code):
    """Unpack JavaScript obfuscated with Dean Edwards' p.a.c.k.e.r.

    The packed payload stores source code whose identifiers were replaced
    by their base-n index into a symbol list; rebuild the lookup table and
    substitute every word back.
    """
    mobj = re.search(PACKED_CODES_RE, code)
    obfuscated_code, base, count, symbols = mobj.groups()
    base = int(base)
    count = int(count)
    symbols = symbols.split('|')
    symbol_table = {}

    # Map each base-n encoded index to its symbol; an empty symbol means
    # the identifier is its own encoding.
    while count:
        count -= 1
        base_n_count = encode_base_n(count, base)
        symbol_table[base_n_count] = symbols[count] or base_n_count

    return re.sub(
        r'\b(\w+)\b', lambda mobj: symbol_table[mobj.group(0)],
        obfuscated_code)
def parse_m3u8_attributes(attrib):
    """Parse an M3U8 attribute list ('KEY=value,KEY="quoted,value",...')
    into a dict, stripping surrounding double quotes from quoted values."""
    info = {}
    for match in re.finditer(r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)', attrib):
        value = match.group('val')
        if value.startswith('"'):
            # Quoted values may contain commas; drop the quotes only.
            value = value[1:-1]
        info[match.group('key')] = value
    return info
def urshift(val, n):
    """Unsigned (logical) right shift of a 32-bit value, like JavaScript's
    '>>>' operator: negative inputs are first mapped to their 32-bit
    two's-complement representation."""
    if val >= 0:
        return val >> n
    return (val + 0x100000000) >> n
# Based on png2str() written by @gdkchan and improved by @yokrysty
# Originally posted at https://github.com/rg3/youtube-dl/issues/9706
def decode_png(png_data):
    # Reference: https://www.w3.org/TR/PNG/
    # Parses raw PNG bytes and returns (width, height, pixels), where
    # pixels is a list of scanline rows of de-filtered byte values.
    header = png_data[8:]

    # Magic 8-byte signature must be followed by the mandatory IHDR chunk.
    if png_data[:8] != b'\x89PNG\x0d\x0a\x1a\x0a' or header[4:8] != b'IHDR':
        raise IOError('Not a valid PNG file.')

    int_map = {1: '>B', 2: '>H', 4: '>I'}
    # Big-endian unsigned int of 1, 2 or 4 bytes (PNG integers are big-endian).
    unpack_integer = lambda x: compat_struct_unpack(int_map[len(x)], x)[0]

        # Each chunk: 4-byte length, 4-byte type, <length> data bytes, 4-byte CRC.
        length = unpack_integer(header[:4])
        chunk_type = header[:4]
        chunk_data = header[:length]
        header = header[length:]

        header = header[4:]  # Skip CRC

    # First chunk is IHDR: width and height are its first two 4-byte fields.
    ihdr = chunks[0]['data']

    width = unpack_integer(ihdr[:4])
    height = unpack_integer(ihdr[4:8])

    # All IDAT chunks concatenated hold one zlib-compressed image stream.
    for chunk in chunks:
        if chunk['type'] == b'IDAT':
            idat += chunk['data']

        raise IOError('Unable to read PNG data.')

    decompressed_data = bytearray(zlib.decompress(idat))

    # Reads a previously decoded byte, used by the filter reconstruction below.
    def _get_pixel(idx):

    # Undo the per-scanline filters; see https://www.w3.org/TR/PNG/#9Filters
    for y in range(height):
        # Each scanline is 1 filter-type byte plus 'stride' data bytes.
        basePos = y * (1 + stride)
        filter_type = decompressed_data[basePos]

        pixels.append(current_row)

        for x in range(stride):
            color = decompressed_data[1 + basePos + x]
            basex = y * stride + x
            # NOTE(review): the fixed offset of 3 implies 3 bytes per pixel
            # (RGB) — confirm against the IHDR color type handled upstream.
            left = _get_pixel(basex - 3)
            up = _get_pixel(basex - stride)

            if filter_type == 1:  # Sub
                color = (color + left) & 0xff
            elif filter_type == 2:  # Up
                color = (color + up) & 0xff
            elif filter_type == 3:  # Average
                color = (color + ((left + up) >> 1)) & 0xff
            elif filter_type == 4:  # Paeth
                # 'c' is the upper-left neighbour used by the Paeth predictor.
                c = _get_pixel(basex - stride - 3)

                # Pick the predictor (left/up/upper-left) with the smallest
                # absolute difference, per the Paeth algorithm.
                if pa <= pb and pa <= pc:
                    color = (color + a) & 0xff
                    color = (color + b) & 0xff
                    color = (color + c) & 0xff

            current_row.append(color)

    return width, height, pixels
def write_xattr(path, key, value):
    """Set the extended attribute `key` to the bytes `value` on `path`.

    Tries, in order: the pyxattr/xattr Python modules, NTFS Alternate Data
    Streams on Windows, and finally the setfattr/xattr command-line tools.
    Raises XAttrMetadataError on OS failures and XAttrUnavailableError when
    no usable implementation is found.
    """
    # This mess below finds the best xattr tool for the job
        # try the pyxattr module...

        if hasattr(xattr, 'set'):  # pyxattr
            # Unicode arguments are not supported in python-pyxattr until
            # See https://github.com/rg3/youtube-dl/issues/5498
            pyxattr_required_version = '0.5.0'
            if version_tuple(xattr.__version__) < version_tuple(pyxattr_required_version):
                # TODO: fallback to CLI tools
                raise XAttrUnavailableError(
                    'python-pyxattr is detected but is too old. '
                    'youtube-dl requires %s or above while your version is %s. '
                    'Falling back to other xattr implementations' % (
                        pyxattr_required_version, xattr.__version__))
            setxattr = xattr.set
            # The xattr module (as opposed to pyxattr) exposes setxattr.
            setxattr = xattr.setxattr

            setxattr(path, key, value)
        except EnvironmentError as e:
            # Normalize OS-level failures into the project's metadata error.
            raise XAttrMetadataError(e.errno, e.strerror)

        if compat_os_name == 'nt':
            # Write xattrs to NTFS Alternate Data Streams:
            # http://en.wikipedia.org/wiki/NTFS#Alternate_data_streams_.28ADS.29
            assert ':' not in key
            assert os.path.exists(path)

            # 'file:key' addresses the named stream on NTFS.
            ads_fn = path + ':' + key
                with open(ads_fn, 'wb') as f:
            except EnvironmentError as e:
                raise XAttrMetadataError(e.errno, e.strerror)
            # UNIX fallback: shell out to setfattr (GNU attr) or the xattr CLI.
            user_has_setfattr = check_executable('setfattr', ['--version'])
            user_has_xattr = check_executable('xattr', ['-h'])

            if user_has_setfattr or user_has_xattr:
                # CLI tools take the value as a text argument.
                value = value.decode('utf-8')
                if user_has_setfattr:
                    executable = 'setfattr'
                    opts = ['-n', key, '-v', value]
                elif user_has_xattr:
                    executable = 'xattr'
                    opts = ['-w', key, value]

                cmd = ([encodeFilename(executable, True)] +
                       [encodeArgument(o) for o in opts] +
                       [encodeFilename(path, True)])

                    p = subprocess.Popen(
                        cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
                except EnvironmentError as e:
                    raise XAttrMetadataError(e.errno, e.strerror)
                stdout, stderr = p.communicate()
                stderr = stderr.decode('utf-8', 'replace')
                if p.returncode != 0:
                    # Surface the tool's exit code and stderr to the caller.
                    raise XAttrMetadataError(p.returncode, stderr)
                # On Unix, and can't find pyxattr, setfattr, or xattr.
                if sys.platform.startswith('linux'):
                    raise XAttrUnavailableError(
                        "Couldn't find a tool to set the xattrs. "
                        "Install either the python 'pyxattr' or 'xattr' "
                        "modules, or the GNU 'attr' package "
                        "(which contains the 'setfattr' tool).")
                raise XAttrUnavailableError(
                    "Couldn't find a tool to set the xattrs. "
                    "Install either the python 'xattr' module, "
                    "or the 'xattr' binary.")