2 # -*- coding: utf-8 -*-
4 from __future__ import unicode_literals
33 import xml.etree.ElementTree
40 compat_etree_fromstring,
42 compat_html_entities_html5,
47 compat_socket_create_connection,
52 compat_urllib_parse_urlencode,
53 compat_urllib_parse_urlparse,
54 compat_urllib_parse_unquote_plus,
55 compat_urllib_request,
def register_socks_protocols():
    """Add the SOCKS schemes to urlparse's netloc-aware scheme list.

    In Python < 2.6.5, urlsplit() suffers from bug
    https://bugs.python.org/issue7904: URLs whose scheme is not listed in
    urlparse.uses_netloc are not handled correctly.
    """
    registered = compat_urlparse.uses_netloc
    for proto in ('socks', 'socks4', 'socks4a', 'socks5'):
        if proto not in registered:
            registered.append(proto)
75 # This is not clearly defined otherwise
76 compiled_regex_type = type(re.compile(''))
79 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20150101 Firefox/47.0 (Chrome)',
80 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
81 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
82 'Accept-Encoding': 'gzip, deflate',
83 'Accept-Language': 'en-us,en;q=0.5',
89 ENGLISH_MONTH_NAMES = [
90 'January', 'February', 'March', 'April', 'May', 'June',
91 'July', 'August', 'September', 'October', 'November', 'December']
94 'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'aac',
95 'flv', 'f4v', 'f4a', 'f4b',
96 'webm', 'ogg', 'ogv', 'oga', 'ogx', 'spx', 'opus',
106 'f4f', 'f4m', 'm3u8', 'smil')
108 # needed for sanitizing filenames in restricted mode
109 ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
110 itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUYP', ['ss'],
111 'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuypy')))
126 '%Y-%m-%d %H:%M:%S.%f',
129 '%Y-%m-%dT%H:%M:%SZ',
130 '%Y-%m-%dT%H:%M:%S.%fZ',
131 '%Y-%m-%dT%H:%M:%S.%f0Z',
133 '%Y-%m-%dT%H:%M:%S.%f',
137 DATE_FORMATS_DAY_FIRST = list(DATE_FORMATS)
138 DATE_FORMATS_DAY_FIRST.extend([
147 DATE_FORMATS_MONTH_FIRST = list(DATE_FORMATS)
148 DATE_FORMATS_MONTH_FIRST.extend([
157 def preferredencoding():
158 """Get preferred encoding.
160 Returns the best encoding scheme for the system, based on
161 locale.getpreferredencoding() and some further tweaks.
164 pref = locale.getpreferredencoding()
172 def write_json_file(obj, fn):
173 """ Encode obj as JSON and write it to fn, atomically if possible """
175 fn = encodeFilename(fn)
176 if sys.version_info < (3, 0) and sys.platform != 'win32':
177 encoding = get_filesystem_encoding()
178 # os.path.basename returns a bytes object, but NamedTemporaryFile
179 # will fail if the filename contains non ascii characters unless we
180 # use a unicode object
181 path_basename = lambda f: os.path.basename(fn).decode(encoding)
182 # the same for os.path.dirname
183 path_dirname = lambda f: os.path.dirname(fn).decode(encoding)
185 path_basename = os.path.basename
186 path_dirname = os.path.dirname
190 'prefix': path_basename(fn) + '.',
191 'dir': path_dirname(fn),
195 # In Python 2.x, json.dump expects a bytestream.
196 # In Python 3.x, it writes to a character stream
197 if sys.version_info < (3, 0):
205 tf = tempfile.NamedTemporaryFile(**compat_kwargs(args))
210 if sys.platform == 'win32':
211 # Need to remove existing file on Windows, else os.rename raises
212 # WindowsError or FileExistsError.
217 os.rename(tf.name, fn)
226 if sys.version_info >= (2, 7):
227 def find_xpath_attr(node, xpath, key, val=None):
228 """ Find the xpath xpath[@key=val] """
229 assert re.match(r'^[a-zA-Z_-]+$', key)
230 expr = xpath + ('[@%s]' % key if val is None else "[@%s='%s']" % (key, val))
231 return node.find(expr)
233 def find_xpath_attr(node, xpath, key, val=None):
234 for f in node.findall(compat_xpath(xpath)):
235 if key not in f.attrib:
237 if val is None or f.attrib.get(key) == val:
241 # On python2.6 the xml.etree.ElementTree.Element methods don't support
242 # the namespace parameter
245 def xpath_with_ns(path, ns_map):
246 components = [c.split(':') for c in path.split('/')]
250 replaced.append(c[0])
253 replaced.append('{%s}%s' % (ns_map[ns], tag))
254 return '/'.join(replaced)
257 def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
258 def _find_xpath(xpath):
259 return node.find(compat_xpath(xpath))
261 if isinstance(xpath, (str, compat_str)):
262 n = _find_xpath(xpath)
270 if default is not NO_DEFAULT:
273 name = xpath if name is None else name
274 raise ExtractorError('Could not find XML element %s' % name)
280 def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
281 n = xpath_element(node, xpath, name, fatal=fatal, default=default)
282 if n is None or n == default:
285 if default is not NO_DEFAULT:
288 name = xpath if name is None else name
289 raise ExtractorError('Could not find XML element\'s text %s' % name)
295 def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
296 n = find_xpath_attr(node, xpath, key)
298 if default is not NO_DEFAULT:
301 name = '%s[@%s]' % (xpath, key) if name is None else name
302 raise ExtractorError('Could not find XML attribute %s' % name)
def get_element_by_id(id, html):
    """Return the content of the tag with the specified ID in the passed HTML document"""
    # Thin wrapper: delegates to the generic attribute lookup with the
    # literal attribute name 'id'.
    return get_element_by_attribute('id', id, html)
def get_element_by_class(class_name, html):
    """Return the content of the first tag whose class attribute contains class_name."""
    # \b-delimited pattern so e.g. 'foo' does not match class="foobar";
    # the surrounding [^'"]* allows other classes in the same attribute.
    class_value = r'[^\'"]*\b%s\b[^\'"]*' % re.escape(class_name)
    return get_element_by_attribute('class', class_value, html, escape_value=False)
319 def get_element_by_attribute(attribute, value, html, escape_value=True):
320 """Return the content of the tag with the specified attribute in the passed HTML document"""
322 value = re.escape(value) if escape_value else value
324 m = re.search(r'''(?xs)
326 (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'))*?
328 (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'))*?
332 ''' % (re.escape(attribute), value), html)
336 res = m.group('content')
338 if res.startswith('"') or res.startswith("'"):
341 return unescapeHTML(res)
344 class HTMLAttributeParser(compat_HTMLParser):
345 """Trivial HTML parser to gather the attributes for a single element"""
348 compat_HTMLParser.__init__(self)
350 def handle_starttag(self, tag, attrs):
351 self.attrs = dict(attrs)
354 def extract_attributes(html_element):
355 """Given a string for an HTML element such as
357 a="foo" B="bar" c="&98;az" d=boz
358 empty= noval entity="&"
361 Decode and return a dictionary of attributes.
363 'a': 'foo', 'b': 'bar', c: 'baz', d: 'boz',
364 'empty': '', 'noval': None, 'entity': '&',
365 'sq': '"', 'dq': '\''
367 NB HTMLParser is stricter in Python 2.6 & 3.2 than in later versions,
368 but the cases in the unit test will work for all of 2.6, 2.7, 3.2-3.5.
370 parser = HTMLAttributeParser()
371 parser.feed(html_element)
376 def clean_html(html):
377 """Clean an HTML snippet into a readable string"""
379 if html is None: # Convenience for sanitizing descriptions etc.
383 html = html.replace('\n', ' ')
384 html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
385 html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
387 html = re.sub('<.*?>', '', html)
388 # Replace html entities
389 html = unescapeHTML(html)
393 def sanitize_open(filename, open_mode):
394 """Try to open the given filename, and slightly tweak it if this fails.
396 Attempts to open the given filename. If this fails, it tries to change
397 the filename slightly, step by step, until it's either able to open it
398 or it fails and raises a final exception, like the standard open()
401 It returns the tuple (stream, definitive_file_name).
405 if sys.platform == 'win32':
407 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
408 return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
409 stream = open(encodeFilename(filename), open_mode)
410 return (stream, filename)
411 except (IOError, OSError) as err:
412 if err.errno in (errno.EACCES,):
415 # In case of error, try to remove win32 forbidden chars
416 alt_filename = sanitize_path(filename)
417 if alt_filename == filename:
420 # An exception here should be caught in the caller
421 stream = open(encodeFilename(alt_filename), open_mode)
422 return (stream, alt_filename)
425 def timeconvert(timestr):
426 """Convert RFC 2822 defined time string into system timestamp"""
428 timetuple = email.utils.parsedate_tz(timestr)
429 if timetuple is not None:
430 timestamp = email.utils.mktime_tz(timetuple)
434 def sanitize_filename(s, restricted=False, is_id=False):
435 """Sanitizes a string so it could be used as part of a filename.
436 If restricted is set, use a stricter subset of allowed characters.
437 Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
439 def replace_insane(char):
440 if restricted and char in ACCENT_CHARS:
441 return ACCENT_CHARS[char]
442 if char == '?' or ord(char) < 32 or ord(char) == 127:
445 return '' if restricted else '\''
447 return '_-' if restricted else ' -'
448 elif char in '\\/|*<>':
450 if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
452 if restricted and ord(char) > 127:
457 s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)
458 result = ''.join(map(replace_insane, s))
460 while '__' in result:
461 result = result.replace('__', '_')
462 result = result.strip('_')
463 # Common case of "Foreign band name - English song title"
464 if restricted and result.startswith('-_'):
466 if result.startswith('-'):
467 result = '_' + result[len('-'):]
468 result = result.lstrip('.')
474 def sanitize_path(s):
475 """Sanitizes and normalizes path on Windows"""
476 if sys.platform != 'win32':
478 drive_or_unc, _ = os.path.splitdrive(s)
479 if sys.version_info < (2, 7) and not drive_or_unc:
480 drive_or_unc, _ = os.path.splitunc(s)
481 norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
485 path_part if path_part in ['.', '..'] else re.sub('(?:[/<>:"\\|\\\\?\\*]|[\s.]$)', '#', path_part)
486 for path_part in norm_path]
488 sanitized_path.insert(0, drive_or_unc + os.path.sep)
489 return os.path.join(*sanitized_path)
492 # Prepend protocol-less URLs with `http:` scheme in order to mitigate the number of
493 # unwanted failures due to missing protocol
def sanitize_url(url):
    """Prepend the `http:` scheme to protocol-relative URLs.

    Extracted URLs sometimes come without a protocol ('//host/path');
    defaulting them to HTTP mitigates unwanted failures. Any other URL is
    returned untouched.
    """
    if url.startswith('//'):
        return 'http:%s' % url
    return url
def sanitized_Request(url, *args, **kwargs):
    """Construct a compat_urllib_request.Request after normalizing
    protocol-relative URLs via sanitize_url()."""
    return compat_urllib_request.Request(sanitize_url(url), *args, **kwargs)
502 def orderedSet(iterable):
503 """ Remove all duplicates from the input iterable """
511 def _htmlentity_transform(entity_with_semicolon):
512 """Transforms an HTML entity to a character."""
513 entity = entity_with_semicolon[:-1]
515 # Known non-numeric HTML entity
516 if entity in compat_html_entities.name2codepoint:
517 return compat_chr(compat_html_entities.name2codepoint[entity])
519 # TODO: HTML5 allows entities without a semicolon. For example,
520 # 'Éric' should be decoded as 'Éric'.
521 if entity_with_semicolon in compat_html_entities_html5:
522 return compat_html_entities_html5[entity_with_semicolon]
524 mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
526 numstr = mobj.group(1)
527 if numstr.startswith('x'):
529 numstr = '0%s' % numstr
532 # See https://github.com/rg3/youtube-dl/issues/7518
534 return compat_chr(int(numstr, base))
538 # Unknown entity in name, return its literal representation
539 return '&%s;' % entity
545 assert type(s) == compat_str
548 r'&([^;]+;)', lambda m: _htmlentity_transform(m.group(1)), s)
551 def get_subprocess_encoding():
552 if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
553 # For subprocess calls, encode with locale encoding
554 # Refer to http://stackoverflow.com/a/9951851/35070
555 encoding = preferredencoding()
557 encoding = sys.getfilesystemencoding()
563 def encodeFilename(s, for_subprocess=False):
565 @param s The name of the file
568 assert type(s) == compat_str
570 # Python 3 has a Unicode API
571 if sys.version_info >= (3, 0):
574 # Pass '' directly to use Unicode APIs on Windows 2000 and up
575 # (Detecting Windows NT 4 is tricky because 'major >= 4' would
576 # match Windows 9x series as well. Besides, NT 4 is obsolete.)
577 if not for_subprocess and sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
580 # Jython assumes filenames are Unicode strings though reported as Python 2.x compatible
581 if sys.platform.startswith('java'):
584 return s.encode(get_subprocess_encoding(), 'ignore')
587 def decodeFilename(b, for_subprocess=False):
589 if sys.version_info >= (3, 0):
592 if not isinstance(b, bytes):
595 return b.decode(get_subprocess_encoding(), 'ignore')
598 def encodeArgument(s):
599 if not isinstance(s, compat_str):
600 # Legacy code that uses byte strings
601 # Uncomment the following line after fixing all post processors
602 # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
603 s = s.decode('ascii')
604 return encodeFilename(s, True)
607 def decodeArgument(b):
608 return decodeFilename(b, True)
611 def decodeOption(optval):
614 if isinstance(optval, bytes):
615 optval = optval.decode(preferredencoding())
617 assert isinstance(optval, compat_str)
621 def formatSeconds(secs):
623 return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
625 return '%d:%02d' % (secs // 60, secs % 60)
630 def make_HTTPS_handler(params, **kwargs):
631 opts_no_check_certificate = params.get('nocheckcertificate', False)
632 if hasattr(ssl, 'create_default_context'): # Python >= 3.4 or 2.7.9
633 context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH)
634 if opts_no_check_certificate:
635 context.check_hostname = False
636 context.verify_mode = ssl.CERT_NONE
638 return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
641 # (create_default_context present but HTTPSHandler has no context=)
644 if sys.version_info < (3, 2):
645 return YoutubeDLHTTPSHandler(params, **kwargs)
647 context = ssl.SSLContext(ssl.PROTOCOL_TLSv1)
648 context.verify_mode = (ssl.CERT_NONE
649 if opts_no_check_certificate
650 else ssl.CERT_REQUIRED)
651 context.set_default_verify_paths()
652 return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
655 def bug_reports_message():
656 if ytdl_is_updateable():
657 update_cmd = 'type youtube-dl -U to update'
659 update_cmd = 'see https://yt-dl.org/update on how to update'
660 msg = '; please report this issue on https://yt-dl.org/bug .'
661 msg += ' Make sure you are using the latest version; %s.' % update_cmd
662 msg += ' Be sure to call youtube-dl with the --verbose flag and include its complete output.'
666 class ExtractorError(Exception):
667 """Error during info extraction."""
669 def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
670 """ tb, if given, is the original traceback (so that it can be printed out).
671 If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
674 if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
676 if video_id is not None:
677 msg = video_id + ': ' + msg
679 msg += ' (caused by %r)' % cause
681 msg += bug_reports_message()
682 super(ExtractorError, self).__init__(msg)
685 self.exc_info = sys.exc_info() # preserve original exception
687 self.video_id = video_id
689 def format_traceback(self):
690 if self.traceback is None:
692 return ''.join(traceback.format_tb(self.traceback))
class UnsupportedError(ExtractorError):
    """Signals that no extractor recognises the given URL."""

    def __init__(self, url):
        message = 'Unsupported URL: %s' % url
        super(UnsupportedError, self).__init__(message, expected=True)
class RegexNotFoundError(ExtractorError):
    """Raised when a regular expression that was expected to match did not."""
707 class DownloadError(Exception):
708 """Download Error exception.
710 This exception may be thrown by FileDownloader objects if they are not
711 configured to continue on errors. They will contain the appropriate
715 def __init__(self, msg, exc_info=None):
716 """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
717 super(DownloadError, self).__init__(msg)
718 self.exc_info = exc_info
721 class SameFileError(Exception):
722 """Same File exception.
724 This exception will be thrown by FileDownloader objects if they detect
725 multiple files would have to be downloaded to the same file on disk.
730 class PostProcessingError(Exception):
731 """Post Processing exception.
733 This exception may be raised by PostProcessor's .run() method to
734 indicate an error in the postprocessing task.
737 def __init__(self, msg):
741 class MaxDownloadsReached(Exception):
742 """ --max-downloads limit has been reached. """
746 class UnavailableVideoError(Exception):
747 """Unavailable Format exception.
749 This exception will be thrown when a video is requested
750 in a format that is not available for that video.
755 class ContentTooShortError(Exception):
756 """Content Too Short exception.
758 This exception may be raised by FileDownloader objects when a file they
759 download is too small for what the server announced first, indicating
760 the connection was probably interrupted.
763 def __init__(self, downloaded, expected):
765 self.downloaded = downloaded
766 self.expected = expected
769 def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
770 # Working around python 2 bug (see http://bugs.python.org/issue17849) by limiting
771 # expected HTTP responses to meet HTTP/1.0 or later (see also
772 # https://github.com/rg3/youtube-dl/issues/6727)
773 if sys.version_info < (3, 0):
774 kwargs[b'strict'] = True
775 hc = http_class(*args, **kwargs)
776 source_address = ydl_handler._params.get('source_address')
777 if source_address is not None:
778 sa = (source_address, 0)
779 if hasattr(hc, 'source_address'): # Python 2.7+
780 hc.source_address = sa
782 def _hc_connect(self, *args, **kwargs):
783 sock = compat_socket_create_connection(
784 (self.host, self.port), self.timeout, sa)
786 self.sock = ssl.wrap_socket(
787 sock, self.key_file, self.cert_file,
788 ssl_version=ssl.PROTOCOL_TLSv1)
791 hc.connect = functools.partial(_hc_connect, hc)
def handle_youtubedl_headers(headers):
    """Strip youtube-dl internal pseudo-headers before the real request.

    When the 'Youtubedl-no-compression' marker is present, return a copy of
    the headers with the marker and any 'Accept-Encoding' header (matched
    case-insensitively) removed; otherwise return the mapping unmodified.
    """
    if 'Youtubedl-no-compression' not in headers:
        return headers
    cleaned = dict((name, value) for name, value in headers.items()
                   if name.lower() != 'accept-encoding')
    cleaned.pop('Youtubedl-no-compression')
    return cleaned
806 class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
807 """Handler for HTTP requests and responses.
809 This class, when installed with an OpenerDirector, automatically adds
810 the standard headers to every HTTP request and handles gzipped and
811 deflated responses from web servers. If compression is to be avoided in
812 a particular request, the original request in the program code only has
813 to include the HTTP header "Youtubedl-no-compression", which will be
814 removed before making the real request.
816 Part of this code was copied from:
818 http://techknack.net/python-urllib2-handlers/
820 Andrew Rowls, the author of that code, agreed to release it to the
824 def __init__(self, params, *args, **kwargs):
825 compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
826 self._params = params
828 def http_open(self, req):
829 conn_class = compat_http_client.HTTPConnection
831 socks_proxy = req.headers.get('Ytdl-socks-proxy')
833 conn_class = make_socks_conn_class(conn_class, socks_proxy)
834 del req.headers['Ytdl-socks-proxy']
836 return self.do_open(functools.partial(
837 _create_http_connection, self, conn_class, False),
843 return zlib.decompress(data, -zlib.MAX_WBITS)
845 return zlib.decompress(data)
848 def addinfourl_wrapper(stream, headers, url, code):
849 if hasattr(compat_urllib_request.addinfourl, 'getcode'):
850 return compat_urllib_request.addinfourl(stream, headers, url, code)
851 ret = compat_urllib_request.addinfourl(stream, headers, url)
855 def http_request(self, req):
856 # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
857 # always respected by websites, some tend to give out URLs with non percent-encoded
858 # non-ASCII characters (see telemb.py, ard.py [#3412])
859 # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
860 # To work around aforementioned issue we will replace request's original URL with
861 # percent-encoded one
862 # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
863 # the code of this workaround has been moved here from YoutubeDL.urlopen()
864 url = req.get_full_url()
865 url_escaped = escape_url(url)
867 # Substitute URL if any change after escaping
868 if url != url_escaped:
869 req = update_Request(req, url=url_escaped)
871 for h, v in std_headers.items():
872 # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
873 # The dict keys are capitalized because of this bug by urllib
874 if h.capitalize() not in req.headers:
877 req.headers = handle_youtubedl_headers(req.headers)
879 if sys.version_info < (2, 7) and '#' in req.get_full_url():
880 # Python 2.6 is brain-dead when it comes to fragments
881 req._Request__original = req._Request__original.partition('#')[0]
882 req._Request__r_type = req._Request__r_type.partition('#')[0]
886 def http_response(self, req, resp):
889 if resp.headers.get('Content-encoding', '') == 'gzip':
890 content = resp.read()
891 gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
893 uncompressed = io.BytesIO(gz.read())
894 except IOError as original_ioerror:
895 # There may be junk add the end of the file
896 # See http://stackoverflow.com/q/4928560/35070 for details
897 for i in range(1, 1024):
899 gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
900 uncompressed = io.BytesIO(gz.read())
905 raise original_ioerror
906 resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
907 resp.msg = old_resp.msg
908 del resp.headers['Content-encoding']
910 if resp.headers.get('Content-encoding', '') == 'deflate':
911 gz = io.BytesIO(self.deflate(resp.read()))
912 resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
913 resp.msg = old_resp.msg
914 del resp.headers['Content-encoding']
915 # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
916 # https://github.com/rg3/youtube-dl/issues/6457).
917 if 300 <= resp.code < 400:
918 location = resp.headers.get('Location')
920 # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
921 if sys.version_info >= (3, 0):
922 location = location.encode('iso-8859-1').decode('utf-8')
924 location = location.decode('utf-8')
925 location_escaped = escape_url(location)
926 if location != location_escaped:
927 del resp.headers['Location']
928 if sys.version_info < (3, 0):
929 location_escaped = location_escaped.encode('utf-8')
930 resp.headers['Location'] = location_escaped
933 https_request = http_request
934 https_response = http_response
937 def make_socks_conn_class(base_class, socks_proxy):
938 assert issubclass(base_class, (
939 compat_http_client.HTTPConnection, compat_http_client.HTTPSConnection))
941 url_components = compat_urlparse.urlparse(socks_proxy)
942 if url_components.scheme.lower() == 'socks5':
943 socks_type = ProxyType.SOCKS5
944 elif url_components.scheme.lower() in ('socks', 'socks4'):
945 socks_type = ProxyType.SOCKS4
946 elif url_components.scheme.lower() == 'socks4a':
947 socks_type = ProxyType.SOCKS4A
949 def unquote_if_non_empty(s):
952 return compat_urllib_parse_unquote_plus(s)
956 url_components.hostname, url_components.port or 1080,
958 unquote_if_non_empty(url_components.username),
959 unquote_if_non_empty(url_components.password),
962 class SocksConnection(base_class):
964 self.sock = sockssocket()
965 self.sock.setproxy(*proxy_args)
966 if type(self.timeout) in (int, float):
967 self.sock.settimeout(self.timeout)
968 self.sock.connect((self.host, self.port))
970 if isinstance(self, compat_http_client.HTTPSConnection):
971 if hasattr(self, '_context'): # Python > 2.6
972 self.sock = self._context.wrap_socket(
973 self.sock, server_hostname=self.host)
975 self.sock = ssl.wrap_socket(self.sock)
977 return SocksConnection
980 class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
981 def __init__(self, params, https_conn_class=None, *args, **kwargs):
982 compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
983 self._https_conn_class = https_conn_class or compat_http_client.HTTPSConnection
984 self._params = params
986 def https_open(self, req):
988 conn_class = self._https_conn_class
990 if hasattr(self, '_context'): # python > 2.6
991 kwargs['context'] = self._context
992 if hasattr(self, '_check_hostname'): # python 3.x
993 kwargs['check_hostname'] = self._check_hostname
995 socks_proxy = req.headers.get('Ytdl-socks-proxy')
997 conn_class = make_socks_conn_class(conn_class, socks_proxy)
998 del req.headers['Ytdl-socks-proxy']
1000 return self.do_open(functools.partial(
1001 _create_http_connection, self, conn_class, True),
class YoutubeDLCookieProcessor(compat_urllib_request.HTTPCookieProcessor):
    """Cookie processor that applies the same request/response hooks to
    HTTPS traffic as to plain HTTP (see the aliases at the bottom)."""

    def __init__(self, cookiejar=None):
        # Explicit base-class __init__ call, consistent with the other
        # handler classes in this file.
        compat_urllib_request.HTTPCookieProcessor.__init__(self, cookiejar)

    def http_response(self, request, response):
        # Python 2 will choke on next HTTP request in row if there are non-ASCII
        # characters in Set-Cookie HTTP header of last response (see
        # https://github.com/rg3/youtube-dl/issues/6769).
        # In order to at least prevent crashing we will percent encode Set-Cookie
        # header before HTTPCookieProcessor starts processing it.
        # if sys.version_info < (3, 0) and response.headers:
        #     for set_cookie_header in ('Set-Cookie', 'Set-Cookie2'):
        #         set_cookie = response.headers.get(set_cookie_header)
        #         set_cookie_escaped = compat_urllib_parse.quote(set_cookie, b"%/;:@&=+$,!~*'()?#[] ")
        #         if set_cookie != set_cookie_escaped:
        #             del response.headers[set_cookie_header]
        #             response.headers[set_cookie_header] = set_cookie_escaped
        return compat_urllib_request.HTTPCookieProcessor.http_response(self, request, response)

    # HTTPS requests/responses get exactly the same cookie handling as HTTP.
    https_request = compat_urllib_request.HTTPCookieProcessor.http_request
    https_response = http_response
1029 def extract_timezone(date_str):
1031 r'^.{8,}?(?P<tz>Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
1034 timezone = datetime.timedelta()
1036 date_str = date_str[:-len(m.group('tz'))]
1037 if not m.group('sign'):
1038 timezone = datetime.timedelta()
1040 sign = 1 if m.group('sign') == '+' else -1
1041 timezone = datetime.timedelta(
1042 hours=sign * int(m.group('hours')),
1043 minutes=sign * int(m.group('minutes')))
1044 return timezone, date_str
1047 def parse_iso8601(date_str, delimiter='T', timezone=None):
1048 """ Return a UNIX timestamp from the given date """
1050 if date_str is None:
1053 date_str = re.sub(r'\.[0-9]+', '', date_str)
1055 if timezone is None:
1056 timezone, date_str = extract_timezone(date_str)
1059 date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
1060 dt = datetime.datetime.strptime(date_str, date_format) - timezone
1061 return calendar.timegm(dt.timetuple())
def date_formats(day_first=True):
    """Return the strptime format table matching the requested day/month order."""
    if day_first:
        return DATE_FORMATS_DAY_FIRST
    return DATE_FORMATS_MONTH_FIRST
1070 def unified_strdate(date_str, day_first=True):
1071 """Return a string with the date in the format YYYYMMDD"""
1073 if date_str is None:
1077 date_str = date_str.replace(',', ' ')
1078 # Remove AM/PM + timezone
1079 date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
1080 _, date_str = extract_timezone(date_str)
1082 for expression in date_formats(day_first):
1084 upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
1087 if upload_date is None:
1088 timetuple = email.utils.parsedate_tz(date_str)
1091 upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
1094 if upload_date is not None:
1095 return compat_str(upload_date)
1098 def unified_timestamp(date_str, day_first=True):
1099 if date_str is None:
1102 date_str = date_str.replace(',', ' ')
1104 pm_delta = datetime.timedelta(hours=12 if re.search(r'(?i)PM', date_str) else 0)
1105 timezone, date_str = extract_timezone(date_str)
1107 # Remove AM/PM + timezone
1108 date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
1110 for expression in date_formats(day_first):
1112 dt = datetime.datetime.strptime(date_str, expression) - timezone + pm_delta
1113 return calendar.timegm(dt.timetuple())
1116 timetuple = email.utils.parsedate_tz(date_str)
1118 return calendar.timegm(timetuple.timetuple())
1121 def determine_ext(url, default_ext='unknown_video'):
1124 guess = url.partition('?')[0].rpartition('.')[2]
1125 if re.match(r'^[A-Za-z0-9]+$', guess):
1127 # Try extract ext from URLs like http://example.com/foo/bar.mp4/?download
1128 elif guess.rstrip('/') in KNOWN_EXTENSIONS:
1129 return guess.rstrip('/')
def subtitles_filename(filename, sub_lang, sub_format):
    """Build the subtitle file name: media basename + '.<lang>.<format>'."""
    stem = filename.rsplit('.', 1)[0]
    return '.'.join((stem, sub_lang, sub_format))
1138 def date_from_str(date_str):
1140 Return a datetime object from a string in the format YYYYMMDD or
1141 (now|today)[+-][0-9](day|week|month|year)(s)?"""
1142 today = datetime.date.today()
1143 if date_str in ('now', 'today'):
1145 if date_str == 'yesterday':
1146 return today - datetime.timedelta(days=1)
1147 match = re.match('(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
1148 if match is not None:
1149 sign = match.group('sign')
1150 time = int(match.group('time'))
1153 unit = match.group('unit')
1154 # A bad approximation?
1158 elif unit == 'year':
1162 delta = datetime.timedelta(**{unit: time})
1163 return today + delta
1164 return datetime.datetime.strptime(date_str, '%Y%m%d').date()
1167 def hyphenate_date(date_str):
1169 Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
1170 match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
1171 if match is not None:
1172 return '-'.join(match.groups())
1177 class DateRange(object):
1178 """Represents a time interval between two dates"""
1180 def __init__(self, start=None, end=None):
1181 """start and end must be strings in the format accepted by date"""
1182 if start is not None:
1183 self.start = date_from_str(start)
1185 self.start = datetime.datetime.min.date()
1187 self.end = date_from_str(end)
1189 self.end = datetime.datetime.max.date()
1190 if self.start > self.end:
1191 raise ValueError('Date range: "%s" , the start date must be before the end date' % self)
1195 """Returns a range that only contains the given day"""
1196 return cls(day, day)
1198 def __contains__(self, date):
1199 """Check if the date is in the range"""
1200 if not isinstance(date, datetime.date):
1201 date = date_from_str(date)
1202 return self.start <= date <= self.end
1205 return '%s - %s' % (self.start.isoformat(), self.end.isoformat())
1208 def platform_name():
1209 """ Returns the platform name as a compat_str """
1210 res = platform.platform()
1211 if isinstance(res, bytes):
1212 res = res.decode(preferredencoding())
1214 assert isinstance(res, compat_str)
def _windows_write_string(s, out):
    """ Returns True if the string was written using special methods,
    False if it has yet to be written out."""
    # Adapted from http://stackoverflow.com/a/3259271/35070

    import ctypes.wintypes

    try:
        fileno = out.fileno()
    except AttributeError:
        # If the output stream doesn't have a fileno, it's virtual
    except io.UnsupportedOperation:
        # Some strange Windows pseudo files?
    if fileno not in WIN_OUTPUT_IDS:

    # Resolve the Win32 console handle for this stream.
    GetStdHandle = ctypes.WINFUNCTYPE(
        ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
        (b'GetStdHandle', ctypes.windll.kernel32))
    h = GetStdHandle(WIN_OUTPUT_IDS[fileno])

    WriteConsoleW = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
        ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
        ctypes.wintypes.LPVOID)((b'WriteConsoleW', ctypes.windll.kernel32))
    written = ctypes.wintypes.DWORD(0)

    GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)((b'GetFileType', ctypes.windll.kernel32))
    FILE_TYPE_CHAR = 0x0002
    FILE_TYPE_REMOTE = 0x8000
    GetConsoleMode = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
        ctypes.POINTER(ctypes.wintypes.DWORD))(
        (b'GetConsoleMode', ctypes.windll.kernel32))
    INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value

    def not_a_console(handle):
        # True when the handle is invalid or not a character-mode console.
        if handle == INVALID_HANDLE_VALUE or handle is None:
        return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR or
                GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)

    if not_a_console(h):

    def next_nonbmp_pos(s):
        # Index of the first character outside the Basic Multilingual Plane.
        try:
            return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
        except StopIteration:

        # WriteConsoleW counts UTF-16 units; write at most 1024 BMP chars.
        count = min(next_nonbmp_pos(s), 1024)

        ret = WriteConsoleW(
            h, s, count if count else 2, ctypes.byref(written), None)
            raise OSError('Failed to write string')
        if not count:  # We just wrote a non-BMP character
            assert written.value == 2
            assert written.value > 0
            s = s[written.value:]
def write_string(s, out=None, encoding=None):
    # Write text to `out` (default stderr), coping with Windows consoles
    # and byte-mode streams.
    assert type(s) == compat_str

    if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
        if _windows_write_string(s, out):

    if ('b' in getattr(out, 'mode', '') or
            sys.version_info[0] < 3):  # Python 2 lies about mode of sys.stderr
        byt = s.encode(encoding or preferredencoding(), 'ignore')
    elif hasattr(out, 'buffer'):
        # Text stream with an underlying binary buffer: encode explicitly.
        enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
        byt = s.encode(enc, 'ignore')
        out.buffer.write(byt)
def bytes_to_intlist(bs):
    # Convert a bytes/str object into a list of integer byte values.
    if isinstance(bs[0], int):  # Python 3
        # Python 2 path: elements are 1-char strings, need ord().
        return [ord(c) for c in bs]
def intlist_to_bytes(xs):
    # Pack a list of integer byte values back into a bytes object.
    return compat_struct_pack('%dB' % len(xs), *xs)
# Cross-platform file locking
if sys.platform == 'win32':
    import ctypes.wintypes

    class OVERLAPPED(ctypes.Structure):
        # Win32 OVERLAPPED struct used by LockFileEx/UnlockFileEx.
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),

    kernel32 = ctypes.windll.kernel32
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    # Lock the whole file: low/high halves of the byte-range length.
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive):
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)
        handle = msvcrt.get_osfhandle(f.fileno())
        # 0x2 = LOCKFILE_EXCLUSIVE_LOCK, 0x0 = shared lock.
        if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
                          whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Locking file failed: %r' % ctypes.FormatError())

    def _unlock_file(f):
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        if not UnlockFileEx(handle, 0,
                            whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())

    # Some platforms, such as Jython, is missing fcntl

        def _lock_file(f, exclusive):
            fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)

        def _unlock_file(f):
            fcntl.flock(f, fcntl.LOCK_UN)

        UNSUPPORTED_MSG = 'file locking is not supported on this platform'

        def _lock_file(f, exclusive):
            raise IOError(UNSUPPORTED_MSG)

        def _unlock_file(f):
            raise IOError(UNSUPPORTED_MSG)
class locked_file(object):
    # File wrapper that holds an OS-level lock for the life of the
    # context manager (shared for 'r', exclusive for 'a'/'w').
    def __init__(self, filename, mode, encoding=None):
        assert mode in ['r', 'a', 'w']
        self.f = io.open(filename, mode, encoding=encoding)

    def __enter__(self):
        # Writers need an exclusive lock; readers share.
        exclusive = self.mode != 'r'
            _lock_file(self.f, exclusive)

    def __exit__(self, etype, value, traceback):
            _unlock_file(self.f)

    def write(self, *args):
        return self.f.write(*args)

    def read(self, *args):
        return self.f.read(*args)
def get_filesystem_encoding():
    """Return the OS filesystem encoding, defaulting to UTF-8."""
    enc = sys.getfilesystemencoding()
    if enc is None:
        enc = 'utf-8'
    return enc
def shell_quote(args):
    # Quote a list of arguments for safe display as a shell command line.
    encoding = get_filesystem_encoding()
        if isinstance(a, bytes):
            # We may get a filename encoded with 'encodeFilename'
            a = a.decode(encoding)
        quoted_args.append(pipes.quote(a))
    return ' '.join(quoted_args)
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use. """

    # Merge any data already smuggled into the URL.
    url, idata = unsmuggle_url(url, {})
    sdata = compat_urllib_parse_urlencode(
        {'__youtubedl_smuggle': json.dumps(data)})
    return url + '#' + sdata
def unsmuggle_url(smug_url, default=None):
    # Inverse of smuggle_url(): extract the JSON payload from the fragment.
    if '#__youtubedl_smuggle' not in smug_url:
        return smug_url, default
    url, _, sdata = smug_url.rpartition('#')
    jsond = compat_parse_qs(sdata)['__youtubedl_smuggle'][0]
    data = json.loads(jsond)
def format_bytes(bytes):
    # Human-readable size with binary (1024-based) suffixes.
    if type(bytes) is str:
        bytes = float(bytes)
        exponent = int(math.log(bytes, 1024.0))
    suffix = ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB'][exponent]
    converted = float(bytes) / float(1024 ** exponent)
    return '%.2f%s' % (converted, suffix)
def lookup_unit_table(unit_table, s):
    # Parse "<number> <unit>" using the multipliers in unit_table.
    units_re = '|'.join(re.escape(u) for u in unit_table)
        r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)\b' % units_re, s)
    # Accept a comma as decimal separator.
    num_str = m.group('num').replace(',', '.')
    mult = unit_table[m.group('unit')]
    return int(float(num_str) * mult)
def parse_filesize(s):
    # Parse a human-readable file size (e.g. "1.2MiB") into bytes.
    # The lower-case forms are of course incorrect and unofficial,
    # but we support those too
    return lookup_unit_table(_UNIT_TABLE, s)
    # Plain digits (with separators) parse directly as an integer.
    if re.match(r'^[\d,.]+$', s):
        return str_to_int(s)
    return lookup_unit_table(_UNIT_TABLE, s)
def month_by_name(name):
    """ Return the number of a month by (locale-independently) English name """
        return ENGLISH_MONTH_NAMES.index(name) + 1
def month_by_abbreviation(abbrev):
    """ Return the number of a month by (locale-independently) English
        # Match against the first three letters of each month name.
        return [s[:3] for s in ENGLISH_MONTH_NAMES].index(abbrev) + 1
def fix_xml_ampersands(xml_str):
    """Replace all the '&' by '&amp;' in XML"""
        # Leave already-escaped entities and numeric references untouched.
        r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
def setproctitle(title):
    # Best-effort: set the process title via prctl(PR_SET_NAME) on Linux.
    assert isinstance(title, compat_str)

    # ctypes in Jython is not complete
    # http://bugs.jython.org/issue2148
    if sys.platform.startswith('java'):

        libc = ctypes.cdll.LoadLibrary('libc.so.6')
    title_bytes = title.encode('utf-8')
    buf = ctypes.create_string_buffer(len(title_bytes))
    buf.value = title_bytes
        # 15 = PR_SET_NAME
        libc.prctl(15, buf, 0, 0, 0)
    except AttributeError:
        return  # Strange libc, just skip this
def remove_start(s, start):
    """Return s with a leading `start` stripped, if present (None passes through)."""
    if s is None:
        return s
    if s.startswith(start):
        return s[len(start):]
    return s
def remove_end(s, end):
    """Return s with a trailing `end` removed, if present.

    Passes None through unchanged. An empty `end` leaves s untouched:
    the naive `s[:-len(end)]` would evaluate to s[:0] == '' in that case.
    """
    if s is None or not end or not s.endswith(end):
        return s
    return s[:-len(end)]
def remove_quotes(s):
    # Strip one matching pair of surrounding single or double quotes.
    if s is None or len(s) < 2:
    for quote in ('"', "'", ):
        if s[0] == quote and s[-1] == quote:
def url_basename(url):
    """Return the final component of the URL's path (no slashes)."""
    path = compat_urlparse.urlparse(url).path
    segments = path.strip('/').split('/')
    return segments[-1]
class HEADRequest(compat_urllib_request.Request):
    # urllib Request whose HTTP verb is forced to HEAD.
    def get_method(self):
class PUTRequest(compat_urllib_request.Request):
    # urllib Request whose HTTP verb is forced to PUT.
    def get_method(self):
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    # Lenient int conversion with optional attribute lookup and scaling;
    # returns `default` when conversion is impossible.
            v = getattr(v, get_attr, None)
        return int(v) * invscale // scale
def str_or_none(v, default=None):
    """Coerce v to compat_str, returning `default` when v is None."""
    if v is None:
        return default
    return compat_str(v)
def str_to_int(int_str):
    """ A more relaxed version of int_or_none """
    # Drop thousands separators and a leading '+' before converting.
    int_str = re.sub(r'[,\.\+]', '', int_str)
def float_or_none(v, scale=1, invscale=1, default=None):
    # Lenient float conversion with scaling; `default` on failure.
        return float(v) * invscale / scale
def strip_or_none(v):
    """Strip surrounding whitespace, passing None through unchanged."""
    if v is None:
        return None
    return v.strip()
def parse_duration(s):
    # Parse a duration string (clock-style or verbose) into seconds.
    if not isinstance(s, compat_basestring):

    days, hours, mins, secs, ms = [None] * 5
    # First try clock style: [[[DD:]HH:]MM:]SS[.ms]
    m = re.match(r'(?:(?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?(?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?$', s)
        days, hours, mins, secs, ms = m.groups()
                # Verbose style, e.g. "1d 2h 3min 4.5s" (ISO-8601-ish).
                    (?P<days>[0-9]+)\s*d(?:ays?)?\s*
                    (?P<hours>[0-9]+)\s*h(?:ours?)?\s*
                    (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?\s*
                    (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
            days, hours, mins, secs, ms = m.groups()
            # Last resort: fractional hours or minutes only.
            m = re.match(r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)$', s)
                hours, mins = m.groups()

        duration += float(secs)
        duration += float(mins) * 60
        duration += float(hours) * 60 * 60
        duration += float(days) * 24 * 60 * 60
        duration += float(ms)
def prepend_extension(filename, ext, expected_real_ext=None):
    # Insert `ext` before the real extension ("a.mp4" -> "a.temp.mp4"),
    # or append it when the real extension is not the expected one.
    name, real_ext = os.path.splitext(filename)
        '{0}.{1}{2}'.format(name, ext, real_ext)
        if not expected_real_ext or real_ext[1:] == expected_real_ext
        else '{0}.{1}'.format(filename, ext))
def replace_extension(filename, ext, expected_real_ext=None):
    # Swap the file extension for `ext`, but only drop the old one when it
    # matches `expected_real_ext` (if given).
    name, real_ext = os.path.splitext(filename)
    return '{0}.{1}'.format(
        name if not expected_real_ext or real_ext[1:] == expected_real_ext else filename,
def check_executable(exe, args=[]):
    """ Checks if the given binary is installed somewhere in PATH, and returns its name.
    args can be a list of arguments for a short output (like -version) """
        subprocess.Popen([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
def get_exe_version(exe, args=['--version'],
                    version_re=None, unrecognized='present'):
    """ Returns the version of the specified executable,
    or False if the executable is not present """
        out, _ = subprocess.Popen(
            [encodeArgument(exe)] + args,
            stdout=subprocess.PIPE, stderr=subprocess.STDOUT).communicate()
    if isinstance(out, bytes):  # Python 2.x
        out = out.decode('ascii', 'ignore')
    return detect_exe_version(out, version_re, unrecognized)
def detect_exe_version(output, version_re=None, unrecognized='present'):
    # Extract a version string from `output`; fall back to `unrecognized`.
    assert isinstance(output, compat_str)
    if version_re is None:
        version_re = r'version\s+([-0-9._a-zA-Z]+)'
    m = re.search(version_re, output)
class PagedList(object):
    # Abstract base for lazily-paged result lists.
        # This is only useful for tests
        return len(self.getslice())
class OnDemandPagedList(PagedList):
    # PagedList that fetches pages on demand via `pagefunc`, with an
    # optional per-page cache.
    def __init__(self, pagefunc, pagesize, use_cache=False):
        self._pagefunc = pagefunc
        self._pagesize = pagesize
        self._use_cache = use_cache

    def getslice(self, start=0, end=None):
        for pagenum in itertools.count(start // self._pagesize):
            firstid = pagenum * self._pagesize
            nextfirstid = pagenum * self._pagesize + self._pagesize
            if start >= nextfirstid:

                page_results = self._cache.get(pagenum)
            if page_results is None:
                page_results = list(self._pagefunc(pagenum))
                self._cache[pagenum] = page_results

                # Offset into this page where the requested slice starts.
                start % self._pagesize
                if firstid <= start < nextfirstid

                ((end - 1) % self._pagesize) + 1
                if (end is not None and firstid <= end <= nextfirstid)

            if startv != 0 or endv is not None:
                page_results = page_results[startv:endv]
            res.extend(page_results)

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            if len(page_results) + startv < self._pagesize:

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == nextfirstid:
class InAdvancePagedList(PagedList):
    # PagedList with a known page count, fetched sequentially up front.
    def __init__(self, pagefunc, pagecount, pagesize):
        self._pagefunc = pagefunc
        self._pagecount = pagecount
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        start_page = start // self._pagesize
            # Clamp the last page to the total page count.
            self._pagecount if end is None else (end // self._pagesize + 1))
        skip_elems = start - start_page * self._pagesize
        only_more = None if end is None else end - start
        for pagenum in range(start_page, end_page):
            page = list(self._pagefunc(pagenum))
                page = page[skip_elems:]
            if only_more is not None:
                if len(page) < only_more:
                    only_more -= len(page)
                    page = page[:only_more]
def uppercase_escape(s):
    # Decode \UXXXXXXXX escapes embedded in s.
    unicode_escape = codecs.getdecoder('unicode_escape')
        r'\\U[0-9a-fA-F]{8}',
        lambda m: unicode_escape(m.group(0))[0],
def lowercase_escape(s):
    # Decode \uXXXX escapes embedded in s.
    unicode_escape = codecs.getdecoder('unicode_escape')
        r'\\u[0-9a-fA-F]{4}',
        lambda m: unicode_escape(m.group(0))[0],
def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986"""
    # Python 2's quote() cannot handle unicode; pre-encode to UTF-8 bytes.
    if sys.version_info < (3, 0) and isinstance(s, compat_str):
        s = s.encode('utf-8')
    safe_chars = b"%/;:@&=+$,!~*'()?#[]"
    return compat_urllib_parse.quote(s, safe_chars)
def escape_url(url):
    """Escape URL as suggested by RFC 3986"""
    url_parsed = compat_urllib_parse_urlparse(url)
    return url_parsed._replace(
        # IDNA-encode the host; percent-escape every other component.
        netloc=url_parsed.netloc.encode('idna').decode('ascii'),
        path=escape_rfc3986(url_parsed.path),
        params=escape_rfc3986(url_parsed.params),
        query=escape_rfc3986(url_parsed.query),
        fragment=escape_rfc3986(url_parsed.fragment)
def read_batch_urls(batch_fd):
    # Read a batch file of URLs, decoding bytes, stripping a UTF-8 BOM and
    # skipping comment lines.
        if not isinstance(url, compat_str):
            url = url.decode('utf-8', 'replace')
        BOM_UTF8 = '\xef\xbb\xbf'
        if url.startswith(BOM_UTF8):
            url = url[len(BOM_UTF8):]
        # Lines starting with these characters are comments.
        if url.startswith(('#', ';', ']')):

    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]
def urlencode_postdata(*args, **kargs):
    """URL-encode POST data and return it as ASCII bytes."""
    encoded = compat_urllib_parse_urlencode(*args, **kargs)
    return encoded.encode('ascii')
def update_url_query(url, query):
    # Merge `query` into the URL's existing query string.
    parsed_url = compat_urlparse.urlparse(url)
    qs = compat_parse_qs(parsed_url.query)
    return compat_urlparse.urlunparse(parsed_url._replace(
        query=compat_urllib_parse_urlencode(qs, True)))
def update_Request(req, url=None, data=None, headers={}, query={}):
    # Clone a urllib Request, optionally overriding URL, data, headers and
    # query parameters while preserving the HTTP method.
    req_headers = req.headers.copy()
    req_headers.update(headers)
    req_data = data or req.data
    req_url = update_url_query(url or req.get_full_url(), query)
    req_get_method = req.get_method()
    if req_get_method == 'HEAD':
        req_type = HEADRequest
    elif req_get_method == 'PUT':
        req_type = PUTRequest
        req_type = compat_urllib_request.Request
        req_url, data=req_data, headers=req_headers,
        origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
    if hasattr(req, 'timeout'):
        new_req.timeout = req.timeout
def dict_get(d, key_or_keys, default=None, skip_false_values=True):
    # Look up the first usable value for one key or a sequence of keys.
    if isinstance(key_or_keys, (list, tuple)):
        for key in key_or_keys:
            # Skip missing, None and (optionally) falsy values.
            if key not in d or d[key] is None or skip_false_values and not d[key]:
    return d.get(key_or_keys, default)
def try_get(src, getter, expected_type=None):
    # Apply `getter` to `src`, swallowing common lookup errors and
    # filtering the result by `expected_type`.
    except (AttributeError, KeyError, TypeError, IndexError):
        if expected_type is None or isinstance(v, expected_type):
def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
    """Return `string` as compat_str, decoding byte strings with `encoding`."""
    if isinstance(string, compat_str):
        return string
    return compat_str(string, encoding, errors)
def parse_age_limit(s):
    # Parse "18", "18+" or a US rating string into an age limit.
    m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
    return int(m.group('age')) if m else US_RATINGS.get(s)
def strip_jsonp(code):
    # Unwrap a JSONP callback, keeping only the JSON argument.
        r'(?s)^[a-zA-Z0-9_.$]+\s*\(\s*(.*)\);?\s*?(?://[^\n]*)*$', r'\1', code)
def js_to_json(code):
    # Convert JavaScript-ish object literals into strict JSON.
        if v in ('true', 'false', 'null'):
        # Comments and dangling commas are dropped.
        elif v.startswith('/*') or v == ',':

        if v[0] in ("'", '"'):
            v = re.sub(r'(?s)\\.|"', lambda m: {
            }.get(m.group(0), m.group(0)), v[1:-1])

            # Rewrite hex literals as decimal.
            (r'^0[xX][0-9a-fA-F]+', 16),

        for regex, base in INTEGER_TABLE:
            im = re.match(regex, v)
                i = int(im.group(0), base)
                # Integer object keys must become quoted strings in JSON.
                return '"%d":' % i if v.endswith(':') else '%d' % i

    return re.sub(r'''(?sx)
        "(?:[^"\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^"\\]*"|
        '(?:[^'\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^'\\]*'|
        /\*.*?\*/|,(?=\s*[\]}])|
        [a-zA-Z_][.a-zA-Z_0-9]*|
        \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:\s*:)?|
def qualities(quality_ids):
    """ Get a numeric quality value out of a list of possible values """
            # Rank equals the position in the given preference list.
            return quality_ids.index(qid)
# Default output filename template: "<title>-<id>.<ext>".
DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
def limit_length(s, length):
    """ Add ellipses to overly long strings """
        # Truncate so the result (including the ellipsis) fits in `length`.
        return s[:length - len(ELLIPSES)] + ELLIPSES
def version_tuple(v):
    """Split a dotted/dashed version string into a tuple of ints."""
    parts = re.split(r'[-.]', v)
    return tuple(int(part) for part in parts)
def is_outdated_version(version, limit, assume_new=True):
    # Compare version strings; on parse failure fall back to `assume_new`.
        return not assume_new
        return version_tuple(version) < version_tuple(limit)
        return not assume_new
def ytdl_is_updateable():
    """ Returns if youtube-dl can be updated with -U """
    from zipimport import zipimporter

    loader = globals().get('__loader__')
    return isinstance(loader, zipimporter) or hasattr(sys, 'frozen')
def args_to_str(args):
    # Get a short string representation for a subprocess command
    quoted = [compat_shlex_quote(a) for a in args]
    return ' '.join(quoted)
def error_to_compat_str(err):
    # Stringify an exception safely across Python 2/3.
    # On python 2 error byte string must be decoded with proper
    # encoding rather than ascii
    if sys.version_info[0] < 3:
        err_str = err_str.decode(preferredencoding())
def mimetype2ext(mt):
    # Map a MIME type to a file extension; unknown subtypes pass through.
        # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3. Here use .mp3 as
        # it's the most popular one
        'audio/mpeg': 'mp3',

    # Keep only the subtype (after the slash).
    _, _, res = mt.rpartition('/')

        'smptett+xml': 'tt',
        'x-mp4-fragmented': 'mp4',
        'x-mpegurl': 'm3u8',
        'vnd.apple.mpegurl': 'm3u8',
def urlhandle_detect_ext(url_handle):
    # Guess the file extension from response headers: prefer the
    # Content-Disposition filename, fall back to Content-Type.
    getheader = url_handle.headers.get

    cd = getheader('Content-Disposition')
        m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
            e = determine_ext(m.group('filename'), default_ext=None)

    return mimetype2ext(getheader('Content-Type'))
def encode_data_uri(data, mime_type):
    """Build an RFC 2397 data: URI embedding `data` as base64."""
    payload = base64.b64encode(data).decode('ascii')
    return 'data:{0};base64,{1}'.format(mime_type, payload)
def age_restricted(content_limit, age_limit):
    """ Returns True iff the content should be blocked """

    if age_limit is None:  # No limit set
    if content_limit is None:
        return False  # Content available for everyone
    return age_limit < content_limit
def is_html(first_bytes):
    """ Detect whether a file contains HTML by examining its first bytes. """

        # Known byte-order marks and their encodings.
        (b'\xef\xbb\xbf', 'utf-8'),
        (b'\x00\x00\xfe\xff', 'utf-32-be'),
        (b'\xff\xfe\x00\x00', 'utf-32-le'),
        (b'\xff\xfe', 'utf-16-le'),
        (b'\xfe\xff', 'utf-16-be'),
    for bom, enc in BOMS:
        if first_bytes.startswith(bom):
            s = first_bytes[len(bom):].decode(enc, 'replace')
        # No BOM found: assume UTF-8.
        s = first_bytes.decode('utf-8', 'replace')

    return re.match(r'^\s*<', s)
def determine_protocol(info_dict):
    # Derive the download protocol from explicit metadata, the URL scheme
    # prefix, or the file extension.
    protocol = info_dict.get('protocol')
    if protocol is not None:

    url = info_dict['url']
    if url.startswith('rtmp'):
    elif url.startswith('mms'):
    elif url.startswith('rtsp'):

    ext = determine_ext(url)

    return compat_urllib_parse_urlparse(url).scheme
def render_table(header_row, data):
    """ Render a list of rows, each as a list of values """
    rows = [header_row] + data
    # Width of each column = longest cell in that column.
    widths = []
    for column in zip(*rows):
        widths.append(max(len(compat_str(cell)) for cell in column))
    # Left-align every column but the last, with one space of padding.
    fmt = ' '.join('%-' + compat_str(w + 1) + 's' for w in widths[:-1]) + '%s'
    return '\n'.join(fmt % tuple(row) for row in rows)
def _match_one(filter_part, dct):
    # Evaluate one "key OP value" or unary "!key"/"key" filter against dct.
    COMPARISON_OPERATORS = {
    operator_rex = re.compile(r'''(?x)\s*
        \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
            (?P<intval>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)|
            (?P<strval>(?![0-9.])[a-z0-9A-Z]*)
        ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
    m = operator_rex.search(filter_part)
        op = COMPARISON_OPERATORS[m.group('op')]
        if m.group('strval') is not None:
            # Only (in)equality is defined for string comparisons.
            if m.group('op') not in ('=', '!='):
                    'Operator %s does not support string values!' % m.group('op'))
            comparison_value = m.group('strval')
                comparison_value = int(m.group('intval'))
                # Not a plain int: try a size suffix, with and without 'B'.
                comparison_value = parse_filesize(m.group('intval'))
                if comparison_value is None:
                    comparison_value = parse_filesize(m.group('intval') + 'B')
                if comparison_value is None:
                        'Invalid integer value %r in filter part %r' % (
                            m.group('intval'), filter_part))
        actual_value = dct.get(m.group('key'))
        if actual_value is None:
            # '?' after the operator makes a missing key match.
            return m.group('none_inclusive')
        return op(actual_value, comparison_value)

        '': lambda v: v is not None,
        '!': lambda v: v is None,
    operator_rex = re.compile(r'''(?x)\s*
        (?P<op>%s)\s*(?P<key>[a-z_]+)
        ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
    m = operator_rex.search(filter_part)
        op = UNARY_OPERATORS[m.group('op')]
        actual_value = dct.get(m.group('key'))
        return op(actual_value)

    raise ValueError('Invalid filter part %r' % filter_part)
def match_str(filter_str, dct):
    """ Filter a dictionary with a simple string syntax. Returns True (=passes filter) or false """
        # '&' joins sub-filters; all of them must match.
        _match_one(filter_part, dct) for filter_part in filter_str.split('&'))
def match_filter_func(filter_str):
    # Build a match_filter callable: returns None on match, or a skip
    # message string otherwise.
    def _match_func(info_dict):
        if match_str(filter_str, info_dict):
            video_title = info_dict.get('title', info_dict.get('id', 'video'))
            return '%s does not pass filter %s, skipping ..' % (video_title, filter_str)
def parse_dfxp_time_expr(time_expr):
    # Parse a TTML time expression ("12.3s" or "HH:MM:SS[.f]") to seconds.
    mobj = re.match(r'^(?P<time_offset>\d+(?:\.\d+)?)s?$', time_expr)
        return float(mobj.group('time_offset'))

    mobj = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
        # Some feeds use ':' instead of '.' before the fraction.
        return 3600 * int(mobj.group(1)) + 60 * int(mobj.group(2)) + float(mobj.group(3).replace(':', '.'))
def srt_subtitles_timecode(seconds):
    """Format a second count as an SRT timecode (HH:MM:SS,mmm)."""
    hrs = seconds / 3600
    mins = (seconds % 3600) / 60
    secs = seconds % 60
    msecs = (seconds % 1) * 1000
    return '%02d:%02d:%02d,%03d' % (hrs, mins, secs, msecs)
def dfxp2srt(dfxp_data):
    # Convert DFXP/TTML subtitles to SRT format.
    _x = functools.partial(xpath_with_ns, ns_map={
        'ttml': 'http://www.w3.org/ns/ttml',
        'ttaf1': 'http://www.w3.org/2006/10/ttaf1',
        'ttaf1_0604': 'http://www.w3.org/2006/04/ttaf1',

    class TTMLPElementParser(object):
        # Streaming parser target that flattens a <p> element into text.

        def start(self, tag, attrib):
            if tag in (_x('ttml:br'), _x('ttaf1:br'), 'br'):

        def data(self, data):

            return self.out.strip()

    def parse_node(node):
        target = TTMLPElementParser()
        parser = xml.etree.ElementTree.XMLParser(target=target)
        parser.feed(xml.etree.ElementTree.tostring(node))
        return parser.close()

    dfxp = compat_etree_fromstring(dfxp_data.encode('utf-8'))
    # Try each known TTML namespace before falling back to no namespace.
    paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall(_x('.//ttaf1:p')) or dfxp.findall(_x('.//ttaf1_0604:p')) or dfxp.findall('.//p')
        raise ValueError('Invalid dfxp/TTML subtitle')

    for para, index in zip(paras, itertools.count(1)):
        begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
        end_time = parse_dfxp_time_expr(para.attrib.get('end'))
        dur = parse_dfxp_time_expr(para.attrib.get('dur'))
        if begin_time is None:
                # No explicit end: derive it from the duration.
                end_time = begin_time + dur
        out.append('%d\n%s --> %s\n%s\n\n' % (
            srt_subtitles_timecode(begin_time),
            srt_subtitles_timecode(end_time),
def cli_option(params, command_option, param):
    """Translate params[param] into ``[command_option, value]`` (or ``[]``)."""
    value = params.get(param)
    if value is None:
        return []
    return [command_option, value]
def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
    # Render a boolean param as either "--opt=value" (with separator) or
    # ["--opt", "value"].
    param = params.get(param)
    assert isinstance(param, bool)
        return [command_option + separator + (true_value if param else false_value)]
    return [command_option, true_value if param else false_value]
def cli_valueless_option(params, command_option, param, expected_value=True):
    """Emit ``[command_option]`` when params[param] equals expected_value."""
    if params.get(param) == expected_value:
        return [command_option]
    return []
def cli_configuration_args(params, param, default=[]):
    # Fetch extra command-line args from params; must be a list if present.
    ex_args = params.get(param)
    assert isinstance(ex_args, list)
class ISO639Utils(object):
    # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
    def short2long(cls, code):
        """Convert language code from ISO 639-1 to ISO 639-2/T"""
        return cls._lang_map.get(code[:2])

    def long2short(cls, code):
        """Convert language code from ISO 639-2/T to ISO 639-1"""
        # Reverse lookup over the two-to-three letter mapping.
        for short_name, long_name in cls._lang_map.items():
            if long_name == code:
class ISO3166Utils(object):
    # From http://data.okfn.org/data/core/country-list
        'AF': 'Afghanistan',
        'AX': 'Åland Islands',
        'AS': 'American Samoa',
        'AG': 'Antigua and Barbuda',
        'BO': 'Bolivia, Plurinational State of',
        'BQ': 'Bonaire, Sint Eustatius and Saba',
        'BA': 'Bosnia and Herzegovina',
        'BV': 'Bouvet Island',
        'IO': 'British Indian Ocean Territory',
        'BN': 'Brunei Darussalam',
        'BF': 'Burkina Faso',
        'KY': 'Cayman Islands',
        'CF': 'Central African Republic',
        'CX': 'Christmas Island',
        'CC': 'Cocos (Keeling) Islands',
        'CD': 'Congo, the Democratic Republic of the',
        'CK': 'Cook Islands',
        'CI': 'Côte d\'Ivoire',
        'CZ': 'Czech Republic',
        'DO': 'Dominican Republic',
        'SV': 'El Salvador',
        'GQ': 'Equatorial Guinea',
        'FK': 'Falkland Islands (Malvinas)',
        'FO': 'Faroe Islands',
        'GF': 'French Guiana',
        'PF': 'French Polynesia',
        'TF': 'French Southern Territories',
        'GW': 'Guinea-Bissau',
        'HM': 'Heard Island and McDonald Islands',
        'VA': 'Holy See (Vatican City State)',
        'IR': 'Iran, Islamic Republic of',
        'IM': 'Isle of Man',
        'KP': 'Korea, Democratic People\'s Republic of',
        'KR': 'Korea, Republic of',
        'LA': 'Lao People\'s Democratic Republic',
        'LI': 'Liechtenstein',
        'MK': 'Macedonia, the Former Yugoslav Republic of',
        'MH': 'Marshall Islands',
        'FM': 'Micronesia, Federated States of',
        'MD': 'Moldova, Republic of',
        'NL': 'Netherlands',
        'NC': 'New Caledonia',
        'NZ': 'New Zealand',
        'NF': 'Norfolk Island',
        'MP': 'Northern Mariana Islands',
        'PS': 'Palestine, State of',
        'PG': 'Papua New Guinea',
        'PH': 'Philippines',
        'PR': 'Puerto Rico',
        'RU': 'Russian Federation',
        'BL': 'Saint Barthélemy',
        'SH': 'Saint Helena, Ascension and Tristan da Cunha',
        'KN': 'Saint Kitts and Nevis',
        'LC': 'Saint Lucia',
        'MF': 'Saint Martin (French part)',
        'PM': 'Saint Pierre and Miquelon',
        'VC': 'Saint Vincent and the Grenadines',
        'ST': 'Sao Tome and Principe',
        'SA': 'Saudi Arabia',
        'SL': 'Sierra Leone',
        'SX': 'Sint Maarten (Dutch part)',
        'SB': 'Solomon Islands',
        'ZA': 'South Africa',
        'GS': 'South Georgia and the South Sandwich Islands',
        'SS': 'South Sudan',
        'SJ': 'Svalbard and Jan Mayen',
        'CH': 'Switzerland',
        'SY': 'Syrian Arab Republic',
        'TW': 'Taiwan, Province of China',
        'TZ': 'Tanzania, United Republic of',
        'TL': 'Timor-Leste',
        'TT': 'Trinidad and Tobago',
        'TM': 'Turkmenistan',
        'TC': 'Turks and Caicos Islands',
        'AE': 'United Arab Emirates',
        'GB': 'United Kingdom',
        'US': 'United States',
        'UM': 'United States Minor Outlying Islands',
        'VE': 'Venezuela, Bolivarian Republic of',
        'VG': 'Virgin Islands, British',
        'VI': 'Virgin Islands, U.S.',
        'WF': 'Wallis and Futuna',
        'EH': 'Western Sahara',

    def short2full(cls, code):
        """Convert an ISO 3166-2 country code to the corresponding full name"""
        return cls._country_map.get(code.upper())
class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
    # ProxyHandler that honors a per-request 'Ytdl-request-proxy' header.
    def __init__(self, proxies=None):
        # Set default handlers
        for type in ('http', 'https'):
            setattr(self, '%s_open' % type,
                    lambda r, proxy='__noproxy__', type=type, meth=self.proxy_open:
                        meth(r, proxy, type))
        return compat_urllib_request.ProxyHandler.__init__(self, proxies)

    def proxy_open(self, req, proxy, type):
        req_proxy = req.headers.get('Ytdl-request-proxy')
        if req_proxy is not None:
            del req.headers['Ytdl-request-proxy']

        if proxy == '__noproxy__':
            return None  # No Proxy
        if compat_urlparse.urlparse(proxy).scheme.lower() in ('socks', 'socks4', 'socks4a', 'socks5'):
            req.add_header('Ytdl-socks-proxy', proxy)
            # youtube-dl's http/https handlers do wrapping the socket with socks
        return compat_urllib_request.ProxyHandler.proxy_open(
            self, req, proxy, type)
def ohdave_rsa_encrypt(data, exponent, modulus):
    """
    Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/

    Input:
        data: data to encrypt, bytes-like object
        exponent, modulus: parameter e and N of RSA algorithm, both integer
    Output: hex string of encrypted data

    Limitation: supports one block encryption only
    """
    # Interpret the reversed bytes as a little-endian big integer.
    payload = int(binascii.hexlify(data[::-1]), 16)
    ciphertext = pow(payload, exponent, modulus)
    return '%x' % ciphertext
def encode_base_n(num, n, table=None):
    # Encode an integer in base `n` using `table` as the digit alphabet.
    FULL_TABLE = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
        table = FULL_TABLE[:n]

        raise ValueError('base %d exceeds table length %d' % (n, len(table)))

        # Prepend digits from least to most significant.
        ret = table[num % n] + ret
def decode_packed_codes(code):
    # Unpack JavaScript obfuscated with the p.a.c.k.e.r. scheme.
        r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)",
    obfucasted_code, base, count, symbols = mobj.groups()
    symbols = symbols.split('|')

        # Map each base-n token back to its original symbol.
        base_n_count = encode_base_n(count, base)
        symbol_table[base_n_count] = symbols[count] or base_n_count

        r'\b(\w+)\b', lambda mobj: symbol_table[mobj.group(0)],
def parse_m3u8_attributes(attrib):
    # Parse an M3U8 attribute list (KEY=value pairs, values may be quoted).
    for (key, val) in re.findall(r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)', attrib):
        if val.startswith('"'):
def urshift(val, n):
    """Unsigned 32-bit right shift of `val` by `n` bits."""
    if val < 0:
        # Reinterpret the negative value as its 32-bit two's complement.
        val += 0x100000000
    return val >> n