2 # -*- coding: utf-8 -*-
4 from __future__ import unicode_literals
34 import xml.etree.ElementTree
41 compat_etree_fromstring,
46 compat_socket_create_connection,
50 compat_urllib_parse_urlencode,
51 compat_urllib_parse_urlparse,
52 compat_urllib_request,
59 # This is not clearly defined otherwise
60 compiled_regex_type = type(re.compile(''))
63 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20150101 Firefox/44.0 (Chrome)',
64 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
65 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
66 'Accept-Encoding': 'gzip, deflate',
67 'Accept-Language': 'en-us,en;q=0.5',
73 ENGLISH_MONTH_NAMES = [
74 'January', 'February', 'March', 'April', 'May', 'June',
75 'July', 'August', 'September', 'October', 'November', 'December']
78 'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'aac',
79 'flv', 'f4v', 'f4a', 'f4b',
80 'webm', 'ogg', 'ogv', 'oga', 'ogx', 'spx', 'opus',
90 'f4f', 'f4m', 'm3u8', 'smil')
def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks.
    """
    try:
        pref = locale.getpreferredencoding()
        # Make sure the reported encoding is actually usable before trusting it.
        'TEST'.encode(pref)
    except Exception:
        pref = 'UTF-8'

    return pref
def write_json_file(obj, fn):
    """ Encode obj as JSON and write it to fn, atomically if possible """

    fn = encodeFilename(fn)
    if sys.version_info < (3, 0) and sys.platform != 'win32':
        encoding = get_filesystem_encoding()
        # os.path.basename returns a bytes object, but NamedTemporaryFile
        # will fail if the filename contains non ascii characters unless we
        # use a unicode object
        path_basename = lambda f: os.path.basename(fn).decode(encoding)
        # the same for os.path.dirname
        path_dirname = lambda f: os.path.dirname(fn).decode(encoding)
    else:
        path_basename = os.path.basename
        path_dirname = os.path.dirname

    args = {
        'suffix': '.tmp',
        # Write next to the target so os.rename stays on one filesystem.
        'prefix': path_basename(fn) + '.',
        'dir': path_dirname(fn),
        'delete': False,
    }

    # In Python 2.x, json.dump expects a bytestream.
    # In Python 3.x, it writes to a character stream
    if sys.version_info < (3, 0):
        args['mode'] = 'wb'
    else:
        args.update({
            'mode': 'w',
            'encoding': 'utf-8',
        })

    tf = tempfile.NamedTemporaryFile(**compat_kwargs(args))

    try:
        with tf:
            json.dump(obj, tf)
        if sys.platform == 'win32':
            # Need to remove existing file on Windows, else os.rename raises
            # WindowsError or FileExistsError.
            try:
                os.unlink(fn)
            except OSError:
                pass
        os.rename(tf.name, fn)
    except Exception:
        # Best-effort cleanup of the temp file; re-raise the original error.
        try:
            os.remove(tf.name)
        except OSError:
            pass
        raise
if sys.version_info >= (2, 7):
    def find_xpath_attr(node, xpath, key, val=None):
        """ Find the xpath xpath[@key=val] """
        assert re.match(r'^[a-zA-Z_-]+$', key)
        expr = xpath + ('[@%s]' % key if val is None else "[@%s='%s']" % (key, val))
        return node.find(expr)
else:
    # ElementTree before 2.7 does not support attribute predicates;
    # emulate the lookup manually.
    def find_xpath_attr(node, xpath, key, val=None):
        for f in node.findall(compat_xpath(xpath)):
            if key not in f.attrib:
                continue
            if val is None or f.attrib.get(key) == val:
                return f
        return None
177 # On python2.6 the xml.etree.ElementTree.Element methods don't support
178 # the namespace parameter
def xpath_with_ns(path, ns_map):
    """Expand 'prefix:tag' steps of an xpath into '{uri}tag' using ns_map."""
    components = [c.split(':') for c in path.split('/')]
    replaced = []
    for c in components:
        if len(c) == 1:
            # No namespace prefix on this step.
            replaced.append(c[0])
        else:
            ns, tag = c
            replaced.append('{%s}%s' % (ns_map[ns], tag))
    return '/'.join(replaced)
def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    """Find an XML element; on failure return default, raise (fatal) or None."""
    def _find_xpath(xpath):
        return node.find(compat_xpath(xpath))

    if isinstance(xpath, (str, compat_str)):
        n = _find_xpath(xpath)
    else:
        # A sequence of xpaths: use the first one that matches.
        for xp in xpath:
            n = _find_xpath(xp)
            if n is not None:
                break

    if n is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = xpath if name is None else name
            raise ExtractorError('Could not find XML element %s' % name)
        else:
            return None
    return n
def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    """Return the text of an XML element found via xpath_element."""
    n = xpath_element(node, xpath, name, fatal=fatal, default=default)
    if n is None or n == default:
        # Element missing: xpath_element already applied default/fatal policy.
        return n
    if n.text is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = xpath if name is None else name
            raise ExtractorError('Could not find XML element\'s text %s' % name)
        else:
            return None
    return n.text
def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
    """Return attribute `key` of the element matching xpath, with default/fatal policy."""
    n = find_xpath_attr(node, xpath, key)
    if n is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = '%s[@%s]' % (xpath, key) if name is None else name
            raise ExtractorError('Could not find XML attribute %s' % name)
        else:
            return None
    return n.attrib[key]
def get_element_by_id(id, html):
    """Return the content of the tag with the specified ID in the passed HTML document"""
    # Thin convenience wrapper: an id lookup is just an attribute lookup.
    return get_element_by_attribute('id', id, html)
def get_element_by_attribute(attribute, value, html):
    """Return the content of the tag with the specified attribute in the passed HTML document"""

    m = re.search(r'''(?xs)
        <([a-zA-Z0-9:._-]+)
         (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
         \s+%s=['"]?%s['"]?
         (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
        \s*>
        (?P<content>.*?)
        </\1>
    ''' % (re.escape(attribute), re.escape(value)), html)

    if not m:
        return None
    res = m.group('content')

    # Strip surrounding quotes if the content itself was quoted.
    if res.startswith('"') or res.startswith("'"):
        res = res[1:-1]

    return unescapeHTML(res)
class HTMLAttributeParser(compat_HTMLParser):
    """Trivial HTML parser to gather the attributes for a single element"""
    def __init__(self):
        # Filled in by handle_starttag; empty until an element is seen.
        self.attrs = {}
        compat_HTMLParser.__init__(self)

    def handle_starttag(self, tag, attrs):
        self.attrs = dict(attrs)
def extract_attributes(html_element):
    """Given a string for an HTML element such as
    <el
         a="foo" B="bar" c="&98;az" d=boz
         empty= noval entity="&amp;"
         sq='"' dq="'"
    >
    Decode and return a dictionary of attributes.
    {
        'a': 'foo', 'b': 'bar', c: 'baz', d: 'boz',
        'empty': '', 'noval': None, 'entity': '&',
        'sq': '"', 'dq': '\''
    }.
    NB HTMLParser is stricter in Python 2.6 & 3.2 than in later versions,
    but the cases in the unit test will work for all of 2.6, 2.7, 3.2-3.5.
    """
    parser = HTMLAttributeParser()
    parser.feed(html_element)
    parser.close()
    return parser.attrs
def clean_html(html):
    """Clean an HTML snippet into a readable string"""

    if html is None:  # Convenience for sanitizing descriptions etc.
        return html

    # Newline vs <br />
    html = html.replace('\n', ' ')
    html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
    html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
    # Strip html tags
    html = re.sub('<.*?>', '', html)
    # Replace html entities
    html = unescapeHTML(html)
    return html.strip()
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    It returns the tuple (stream, definitive_file_name).
    """
    try:
        if filename == '-':
            # '-' means stdout; switch it to binary mode on Windows.
            if sys.platform == 'win32':
                import msvcrt
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
            return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
        stream = open(encodeFilename(filename), open_mode)
        return (stream, filename)
    except (IOError, OSError) as err:
        if err.errno in (errno.EACCES,):
            # Permission problems will not be fixed by renaming.
            raise

        # In case of error, try to remove win32 forbidden chars
        alt_filename = sanitize_path(filename)
        if alt_filename == filename:
            raise
        else:
            # An exception here should be caught in the caller
            stream = open(encodeFilename(alt_filename), open_mode)
            return (stream, alt_filename)
def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp"""
    timestamp = None
    timetuple = email.utils.parsedate_tz(timestr)
    if timetuple is not None:
        timestamp = email.utils.mktime_tz(timetuple)
    # None is returned for unparseable input.
    return timestamp
def sanitize_filename(s, restricted=False, is_id=False):
    """Sanitizes a string so it could be used as part of a filename.
    If restricted is set, use a stricter subset of allowed characters.
    Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
    """
    def replace_insane(char):
        accents = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõöøùúûüýþÿ',
                           itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOUUUUYP', ['ss'],
                                           'aaaaaa', ['ae'], 'ceeeeiiiionoooooouuuuypy')))
        if restricted and char in accents:
            return accents[char]
        if char == '?' or ord(char) < 32 or ord(char) == 127:
            return ''
        elif char == '"':
            return '' if restricted else '\''
        elif char == ':':
            return '_-' if restricted else ' -'
        elif char in '\\/|*<>':
            return '_'
        if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
            return '_'
        if restricted and ord(char) > 127:
            return '_'
        return char

    # Handle timestamps: keep '12:34' readable as '12_34'.
    s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)
    result = ''.join(map(replace_insane, s))
    if not is_id:
        while '__' in result:
            result = result.replace('__', '_')
        result = result.strip('_')
        # Common case of "Foreign band name - English song title"
        if restricted and result.startswith('-_'):
            result = result[2:]
        if result.startswith('-'):
            result = '_' + result[len('-'):]
        result = result.lstrip('.')
        if not result:
            result = '_'
    return result
def sanitize_path(s):
    """Sanitizes and normalizes path on Windows"""
    if sys.platform != 'win32':
        # Only Windows needs path-component sanitization.
        return s
    drive_or_unc, _ = os.path.splitdrive(s)
    if sys.version_info < (2, 7) and not drive_or_unc:
        drive_or_unc, _ = os.path.splitunc(s)
    norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
    if drive_or_unc:
        norm_path.pop(0)
    sanitized_path = [
        path_part if path_part in ['.', '..'] else re.sub('(?:[/<>:"\\|\\\\?\\*]|[\s.]$)', '#', path_part)
        for path_part in norm_path]
    if drive_or_unc:
        sanitized_path.insert(0, drive_or_unc + os.path.sep)
    return os.path.join(*sanitized_path)
423 # Prepend protocol-less URLs with `http:` scheme in order to mitigate the number of
424 # unwanted failures due to missing protocol
def sanitize_url(url):
    """Give scheme-relative URLs ('//host/...') an explicit http: scheme."""
    if url.startswith('//'):
        return 'http:%s' % url
    return url
def sanitized_Request(url, *args, **kwargs):
    # Build a urllib Request after normalizing the URL: protocol-less
    # '//host/...' URLs get an explicit 'http:' scheme via sanitize_url.
    return compat_urllib_request.Request(sanitize_url(url), *args, **kwargs)
def orderedSet(iterable):
    """ Remove all duplicates from the input iterable """
    # Preserves first-seen order, unlike set().
    res = []
    for el in iterable:
        if el not in res:
            res.append(el)
    return res
def _htmlentity_transform(entity):
    """Transforms an HTML entity to a character."""
    # Known non-numeric HTML entity
    if entity in compat_html_entities.name2codepoint:
        return compat_chr(compat_html_entities.name2codepoint[entity])

    mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
    if mobj is not None:
        numstr = mobj.group(1)
        if numstr.startswith('x'):
            base = 16
            numstr = '0%s' % numstr
        else:
            base = 10
        # See https://github.com/rg3/youtube-dl/issues/7518
        return compat_chr(int(numstr, base))

    # Unknown entity in name, return its literal representation
    return '&%s;' % entity
def unescapeHTML(s):
    """Replace all HTML entities in s with the characters they represent."""
    if s is None:
        return None
    assert type(s) == compat_str

    return re.sub(
        r'&([^;]+);', lambda m: _htmlentity_transform(m.group(1)), s)
def get_subprocess_encoding():
    """Encoding to use when exchanging data with subprocesses."""
    if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
        # For subprocess calls, encode with locale encoding
        # Refer to http://stackoverflow.com/a/9951851/35070
        encoding = preferredencoding()
    else:
        encoding = sys.getfilesystemencoding()
    if encoding is None:
        encoding = 'utf-8'
    return encoding
def encodeFilename(s, for_subprocess=False):
    """
    @param s The name of the file
    """

    assert type(s) == compat_str

    # Python 3 has a Unicode API
    if sys.version_info >= (3, 0):
        return s

    # Pass '' directly to use Unicode APIs on Windows 2000 and up
    # (Detecting Windows NT 4 is tricky because 'major >= 4' would
    # match Windows 9x series as well. Besides, NT 4 is obsolete.)
    if not for_subprocess and sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
        return s

    # Jython assumes filenames are Unicode strings though reported as Python 2.x compatible
    if sys.platform.startswith('java'):
        return s

    return s.encode(get_subprocess_encoding(), 'ignore')
def decodeFilename(b, for_subprocess=False):
    """Inverse of encodeFilename: turn a byte filename back into text."""
    # Python 3 filenames are already text.
    if sys.version_info >= (3, 0):
        return b

    if not isinstance(b, bytes):
        return b

    return b.decode(get_subprocess_encoding(), 'ignore')
def encodeArgument(s):
    # Encode a command-line argument the same way a filename destined for a
    # subprocess would be encoded (for_subprocess=True).
    if not isinstance(s, compat_str):
        # Legacy code that uses byte strings
        # Uncomment the following line after fixing all post processors
        # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
        s = s.decode('ascii')
    return encodeFilename(s, True)
def decodeArgument(b):
    # Inverse of encodeArgument: decode subprocess output/arguments to text.
    return decodeFilename(b, True)
def decodeOption(optval):
    """Decode a command-line option value to text, passing None through."""
    if optval is None:
        return optval
    if isinstance(optval, bytes):
        optval = optval.decode(preferredencoding())

    assert isinstance(optval, compat_str)
    return optval
def formatSeconds(secs):
    """Format a duration in seconds as H:MM:SS, M:SS or plain seconds."""
    if secs > 3600:
        return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
    elif secs > 60:
        return '%d:%02d' % (secs // 60, secs % 60)
    else:
        return '%d' % secs
def make_HTTPS_handler(params, **kwargs):
    """Build a YoutubeDLHTTPSHandler honouring the 'nocheckcertificate' option."""
    opts_no_check_certificate = params.get('nocheckcertificate', False)
    if hasattr(ssl, 'create_default_context'):  # Python >= 3.4 or 2.7.9
        context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH)
        if opts_no_check_certificate:
            context.check_hostname = False
            context.verify_mode = ssl.CERT_NONE
        try:
            return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
        except TypeError:
            # Python 2.7.8
            # (create_default_context present but HTTPSHandler has no context=)
            pass

    if sys.version_info < (3, 2):
        return YoutubeDLHTTPSHandler(params, **kwargs)
    else:  # Python < 3.4
        context = ssl.SSLContext(ssl.PROTOCOL_TLSv1)
        context.verify_mode = (ssl.CERT_NONE
                               if opts_no_check_certificate
                               else ssl.CERT_REQUIRED)
        context.set_default_verify_paths()
        return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
def bug_reports_message():
    """Return the standard bug-report footer appended to unexpected errors."""
    if ytdl_is_updateable():
        update_cmd = 'type youtube-dl -U to update'
    else:
        update_cmd = 'see https://yt-dl.org/update on how to update'
    msg = '; please report this issue on https://yt-dl.org/bug .'
    msg += ' Make sure you are using the latest version; %s.' % update_cmd
    msg += ' Be sure to call youtube-dl with the --verbose flag and include its complete output.'
    return msg
class ExtractorError(Exception):
    """Error during info extraction."""

    def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
        """

        # Network-level failures are always "expected" (not youtube-dl bugs).
        if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
            expected = True
        if video_id is not None:
            msg = video_id + ': ' + msg
        if cause:
            msg += ' (caused by %r)' % cause
        if not expected:
            msg += bug_reports_message()
        super(ExtractorError, self).__init__(msg)

        self.traceback = tb
        self.exc_info = sys.exc_info()  # preserve original exception
        self.cause = cause
        self.video_id = video_id

    def format_traceback(self):
        if self.traceback is None:
            return None
        return ''.join(traceback.format_tb(self.traceback))
class UnsupportedError(ExtractorError):
    # Raised when no extractor (including the generic one) handles a URL.
    # expected=True: this is user input, not a youtube-dl bug.
    def __init__(self, url):
        super(UnsupportedError, self).__init__(
            'Unsupported URL: %s' % url, expected=True)
class RegexNotFoundError(ExtractorError):
    """Error when a regex didn't match"""
    # Behaviour (message formatting, bug-report hint) comes from ExtractorError.
class DownloadError(Exception):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """

    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        super(DownloadError, self).__init__(msg)
        self.exc_info = exc_info
class SameFileError(Exception):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
class PostProcessingError(Exception):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """

    def __init__(self, msg):
        self.msg = msg
class MaxDownloadsReached(Exception):
    """ --max-downloads limit has been reached. """
class UnavailableVideoError(Exception):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """
class ContentTooShortError(Exception):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """

    def __init__(self, downloaded, expected):
        # Both in bytes
        self.downloaded = downloaded
        self.expected = expected
693 def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
694 # Working around python 2 bug (see http://bugs.python.org/issue17849) by limiting
695 # expected HTTP responses to meet HTTP/1.0 or later (see also
696 # https://github.com/rg3/youtube-dl/issues/6727)
697 if sys.version_info < (3, 0):
698 kwargs[b'strict'] = True
699 hc = http_class(*args, **kwargs)
700 source_address = ydl_handler._params.get('source_address')
701 if source_address is not None:
702 sa = (source_address, 0)
703 if hasattr(hc, 'source_address'): # Python 2.7+
704 hc.source_address = sa
706 def _hc_connect(self, *args, **kwargs):
707 sock = compat_socket_create_connection(
708 (self.host, self.port), self.timeout, sa)
710 self.sock = ssl.wrap_socket(
711 sock, self.key_file, self.cert_file,
712 ssl_version=ssl.PROTOCOL_TLSv1)
715 hc.connect = functools.partial(_hc_connect, hc)
def handle_youtubedl_headers(headers):
    """Honour and strip internal pseudo-headers before a real request.

    'Youtubedl-no-compression' is not a real HTTP header: it signals that
    Accept-Encoding must not be sent. Remove both from the header dict.
    """
    if 'Youtubedl-no-compression' not in headers:
        return headers

    # Drop Accept-Encoding case-insensitively along with the marker itself.
    stripped = dict(
        (key, value) for key, value in headers.items()
        if key.lower() != 'accept-encoding' and key != 'Youtubedl-no-compression')
    return stripped
class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-no-compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    def __init__(self, params, *args, **kwargs):
        compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
        self._params = params

    def http_open(self, req):
        return self.do_open(functools.partial(
            _create_http_connection, self, compat_http_client.HTTPConnection, False),
            req)

    @staticmethod
    def deflate(data):
        # Some servers send raw deflate streams without zlib headers;
        # try that first and fall back to a regular zlib stream.
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def addinfourl_wrapper(stream, headers, url, code):
        if hasattr(compat_urllib_request.addinfourl, 'getcode'):
            return compat_urllib_request.addinfourl(stream, headers, url, code)
        ret = compat_urllib_request.addinfourl(stream, headers, url)
        ret.code = code
        return ret

    def http_request(self, req):
        # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
        # always respected by websites, some tend to give out URLs with non percent-encoded
        # non-ASCII characters (see telemb.py, ard.py [#3412])
        # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
        # To work around aforementioned issue we will replace request's original URL with
        # percent-encoded one
        # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
        # the code of this workaround has been moved here from YoutubeDL.urlopen()
        url = req.get_full_url()
        url_escaped = escape_url(url)

        # Substitute URL if any change after escaping
        if url != url_escaped:
            req = update_Request(req, url=url_escaped)

        for h, v in std_headers.items():
            # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
            # The dict keys are capitalized because of this bug by urllib
            if h.capitalize() not in req.headers:
                req.add_header(h, v)

        req.headers = handle_youtubedl_headers(req.headers)

        if sys.version_info < (2, 7) and '#' in req.get_full_url():
            # Python 2.6 is brain-dead when it comes to fragments
            req._Request__original = req._Request__original.partition('#')[0]
            req._Request__r_type = req._Request__r_type.partition('#')[0]

        return req

    def http_response(self, req, resp):
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
            try:
                uncompressed = io.BytesIO(gz.read())
            except IOError as original_ioerror:
                # There may be junk add the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                for i in range(1, 1024):
                    try:
                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
                        uncompressed = io.BytesIO(gz.read())
                    except IOError:
                        continue
                    break
                else:
                    raise original_ioerror
            resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = io.BytesIO(self.deflate(resp.read()))
            resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
        # https://github.com/rg3/youtube-dl/issues/6457).
        if 300 <= resp.code < 400:
            location = resp.headers.get('Location')
            if location:
                # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
                if sys.version_info >= (3, 0):
                    location = location.encode('iso-8859-1').decode('utf-8')
                location_escaped = escape_url(location)
                if location != location_escaped:
                    del resp.headers['Location']
                    resp.headers['Location'] = location_escaped
        return resp

    https_request = http_request
    https_response = http_response
class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
    """HTTPS handler that threads the ydl params and a custom connection class through."""

    def __init__(self, params, https_conn_class=None, *args, **kwargs):
        compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
        self._https_conn_class = https_conn_class or compat_http_client.HTTPSConnection
        self._params = params

    def https_open(self, req):
        kwargs = {}
        if hasattr(self, '_context'):  # python > 2.6
            kwargs['context'] = self._context
        if hasattr(self, '_check_hostname'):  # python 3.x
            kwargs['check_hostname'] = self._check_hostname
        return self.do_open(functools.partial(
            _create_http_connection, self, self._https_conn_class, True),
            req, **kwargs)
class YoutubeDLCookieProcessor(compat_urllib_request.HTTPCookieProcessor):
    """Cookie processor; kept as a dedicated subclass for the (currently
    disabled) Set-Cookie percent-encoding workaround below."""

    def __init__(self, cookiejar=None):
        compat_urllib_request.HTTPCookieProcessor.__init__(self, cookiejar)

    def http_response(self, request, response):
        # Python 2 will choke on next HTTP request in row if there are non-ASCII
        # characters in Set-Cookie HTTP header of last response (see
        # https://github.com/rg3/youtube-dl/issues/6769).
        # In order to at least prevent crashing we will percent encode Set-Cookie
        # header before HTTPCookieProcessor starts processing it.
        # if sys.version_info < (3, 0) and response.headers:
        #     for set_cookie_header in ('Set-Cookie', 'Set-Cookie2'):
        #         set_cookie = response.headers.get(set_cookie_header)
        #         if set_cookie:
        #             set_cookie_escaped = compat_urllib_parse.quote(set_cookie, b"%/;:@&=+$,!~*'()?#[] ")
        #             if set_cookie != set_cookie_escaped:
        #                 del response.headers[set_cookie_header]
        #                 response.headers[set_cookie_header] = set_cookie_escaped
        return compat_urllib_request.HTTPCookieProcessor.http_response(self, request, response)

    https_request = compat_urllib_request.HTTPCookieProcessor.http_request
    https_response = http_response
def parse_iso8601(date_str, delimiter='T', timezone=None):
    """ Return a UNIX timestamp from the given date """

    if date_str is None:
        return None

    # Fractional seconds are not handled by strptime here; drop them.
    date_str = re.sub(r'\.[0-9]+', '', date_str)

    if timezone is None:
        m = re.search(
            r'(?:Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
            date_str)
        if not m:
            timezone = datetime.timedelta()
        else:
            date_str = date_str[:-len(m.group(0))]
            if not m.group('sign'):
                # 'Z' suffix: already UTC.
                timezone = datetime.timedelta()
            else:
                sign = 1 if m.group('sign') == '+' else -1
                timezone = datetime.timedelta(
                    hours=sign * int(m.group('hours')),
                    minutes=sign * int(m.group('minutes')))
    date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
    dt = datetime.datetime.strptime(date_str, date_format) - timezone
    return calendar.timegm(dt.timetuple())
def unified_strdate(date_str, day_first=True):
    """Return a string with the date in the format YYYYMMDD"""

    if date_str is None:
        return None
    upload_date = None
    # Replace commas
    date_str = date_str.replace(',', ' ')
    # %z (UTC offset) is only supported in python>=3.2
    if not re.match(r'^[0-9]{1,2}-[0-9]{1,2}-[0-9]{4}$', date_str):
        date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str)
    # Remove AM/PM + timezone
    date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)

    format_expressions = [
        '%d %B %Y',
        '%d %b %Y',
        '%B %d %Y',
        '%b %d %Y',
        '%b %dst %Y %I:%M',
        '%b %dnd %Y %I:%M',
        '%b %dth %Y %I:%M',
        '%Y %m %d',
        '%Y-%m-%d',
        '%Y/%m/%d',
        '%Y/%m/%d %H:%M:%S',
        '%Y-%m-%d %H:%M:%S',
        '%Y-%m-%d %H:%M:%S.%f',
        '%d.%m.%Y %H:%M',
        '%d.%m.%Y %H.%M',
        '%Y-%m-%dT%H:%M:%SZ',
        '%Y-%m-%dT%H:%M:%S.%fZ',
        '%Y-%m-%dT%H:%M:%S.%f0Z',
        '%Y-%m-%dT%H:%M:%S',
        '%Y-%m-%dT%H:%M:%S.%f',
        '%Y-%m-%dT%H:%M',
    ]
    if day_first:
        format_expressions.extend([
            '%d-%m-%Y',
            '%d.%m.%Y',
            '%d/%m/%Y',
            '%d/%m/%y',
            '%d/%m/%Y %H:%M:%S',
        ])
    else:
        format_expressions.extend([
            '%m-%d-%Y',
            '%m.%d.%Y',
            '%m/%d/%Y',
            '%m/%d/%y',
            '%m/%d/%Y %H:%M:%S',
        ])
    for expression in format_expressions:
        try:
            upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
        except ValueError:
            pass
    if upload_date is None:
        # Last resort: RFC 2822-style dates.
        timetuple = email.utils.parsedate_tz(date_str)
        if timetuple:
            upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
    if upload_date is not None:
        return compat_str(upload_date)
def determine_ext(url, default_ext='unknown_video'):
    """Guess the file extension from a URL, falling back to default_ext."""
    if url is None:
        return default_ext
    guess = url.partition('?')[0].rpartition('.')[2]
    if re.match(r'^[A-Za-z0-9]+$', guess):
        return guess
    # Try extract ext from URLs like http://example.com/foo/bar.mp4/?download
    elif guess.rstrip('/') in KNOWN_EXTENSIONS:
        return guess.rstrip('/')
    else:
        return default_ext
def subtitles_filename(filename, sub_lang, sub_format):
    """Build a subtitle filename: <media base name>.<language>.<format>."""
    base = filename.rsplit('.', 1)[0]
    return '.'.join((base, sub_lang, sub_format))
def date_from_str(date_str):
    """
    Return a datetime object from a string in the format YYYYMMDD or
    (now|today)[+-][0-9](day|week|month|year)(s)?"""
    today = datetime.date.today()
    if date_str in ('now', 'today'):
        return today
    if date_str == 'yesterday':
        return today - datetime.timedelta(days=1)
    match = re.match('(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
    if match is not None:
        sign = match.group('sign')
        time = int(match.group('time'))
        if sign == '-':
            time = -time
        unit = match.group('unit')
        # A bad approximation?
        if unit == 'month':
            unit = 'day'
            time *= 30
        elif unit == 'year':
            unit = 'day'
            time *= 365
        # timedelta wants plural keyword arguments (days=, weeks=).
        unit += 's'
        delta = datetime.timedelta(**{unit: time})
        return today + delta
    return datetime.datetime.strptime(date_str, '%Y%m%d').date()
def hyphenate_date(date_str):
    """
    Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
    match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
    if match is not None:
        return '-'.join(match.groups())
    else:
        # Not in the expected format; pass through unchanged.
        return date_str
class DateRange(object):
    """Represents a time interval between two dates"""

    def __init__(self, start=None, end=None):
        """start and end must be strings in the format accepted by date"""
        if start is not None:
            self.start = date_from_str(start)
        else:
            self.start = datetime.datetime.min.date()
        if end is not None:
            self.end = date_from_str(end)
        else:
            self.end = datetime.datetime.max.date()
        if self.start > self.end:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)

    @classmethod
    def day(cls, day):
        """Returns a range that only contains the given day"""
        return cls(day, day)

    def __contains__(self, date):
        """Check if the date is in the range"""
        if not isinstance(date, datetime.date):
            date = date_from_str(date)
        return self.start <= date <= self.end

    def __str__(self):
        return '%s - %s' % (self.start.isoformat(), self.end.isoformat())
def platform_name():
    """ Returns the platform name as a compat_str """
    res = platform.platform()
    if isinstance(res, bytes):
        res = res.decode(preferredencoding())

    assert isinstance(res, compat_str)
    return res
def _windows_write_string(s, out):
    """ Returns True if the string was written using special methods,
    False if it has yet to be written out."""
    # Adapted from http://stackoverflow.com/a/3259271/35070

    import ctypes
    import ctypes.wintypes

    # Map C-level file descriptors to the corresponding std handle IDs.
    WIN_OUTPUT_IDS = {
        1: -11,
        2: -12,
    }

    try:
        fileno = out.fileno()
    except AttributeError:
        # If the output stream doesn't have a fileno, it's virtual
        return False
    except io.UnsupportedOperation:
        # Some strange Windows pseudo files?
        return False
    if fileno not in WIN_OUTPUT_IDS:
        return False

    GetStdHandle = ctypes.WINFUNCTYPE(
        ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
        (b'GetStdHandle', ctypes.windll.kernel32))
    h = GetStdHandle(WIN_OUTPUT_IDS[fileno])

    WriteConsoleW = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
        ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
        ctypes.wintypes.LPVOID)((b'WriteConsoleW', ctypes.windll.kernel32))
    written = ctypes.wintypes.DWORD(0)

    GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)((b'GetFileType', ctypes.windll.kernel32))
    FILE_TYPE_CHAR = 0x0002
    FILE_TYPE_REMOTE = 0x8000
    GetConsoleMode = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
        ctypes.POINTER(ctypes.wintypes.DWORD))(
        (b'GetConsoleMode', ctypes.windll.kernel32))
    INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value

    def not_a_console(handle):
        if handle == INVALID_HANDLE_VALUE or handle is None:
            return True
        return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR or
                GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)

    if not_a_console(h):
        return False

    def next_nonbmp_pos(s):
        try:
            return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
        except StopIteration:
            return len(s)

    while s:
        # WriteConsoleW counts UTF-16 code units; write BMP runs in chunks.
        count = min(next_nonbmp_pos(s), 1024)

        ret = WriteConsoleW(
            h, s, count if count else 2, ctypes.byref(written), None)
        if ret == 0:
            raise OSError('Failed to write string')
        if not count:  # We just wrote a non-BMP character
            assert written.value == 2
            s = s[1:]
        else:
            assert written.value > 0
            s = s[written.value:]
    return True
def write_string(s, out=None, encoding=None):
    """Write text to `out` (default stderr), handling console/byte-stream quirks."""
    if out is None:
        out = sys.stderr
    assert type(s) == compat_str

    if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
        if _windows_write_string(s, out):
            return

    if ('b' in getattr(out, 'mode', '') or
            sys.version_info[0] < 3):  # Python 2 lies about mode of sys.stderr
        byt = s.encode(encoding or preferredencoding(), 'ignore')
        out.write(byt)
    elif hasattr(out, 'buffer'):
        # Text stream with an underlying binary buffer: encode ourselves so
        # a requested encoding is honoured.
        enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
        byt = s.encode(enc, 'ignore')
        out.buffer.write(byt)
    else:
        out.write(s)
    out.flush()
def bytes_to_intlist(bs):
    """Convert a byte string to a list of integer byte values."""
    if not bs:
        return []
    if isinstance(bs[0], int):  # Python 3
        return list(bs)
    else:
        return [ord(c) for c in bs]
def intlist_to_bytes(xs):
    """Convert a list of integer byte values back to a byte string."""
    if not xs:
        return b''
    return struct_pack('%dB' % len(xs), *xs)
1197 # Cross-platform file locking
1198 if sys.platform == 'win32':
1199 import ctypes.wintypes
1202 class OVERLAPPED(ctypes.Structure):
1204 ('Internal', ctypes.wintypes.LPVOID),
1205 ('InternalHigh', ctypes.wintypes.LPVOID),
1206 ('Offset', ctypes.wintypes.DWORD),
1207 ('OffsetHigh', ctypes.wintypes.DWORD),
1208 ('hEvent', ctypes.wintypes.HANDLE),
1211 kernel32 = ctypes.windll.kernel32
1212 LockFileEx = kernel32.LockFileEx
1213 LockFileEx.argtypes = [
1214 ctypes.wintypes.HANDLE, # hFile
1215 ctypes.wintypes.DWORD, # dwFlags
1216 ctypes.wintypes.DWORD, # dwReserved
1217 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
1218 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
1219 ctypes.POINTER(OVERLAPPED) # Overlapped
1221 LockFileEx.restype = ctypes.wintypes.BOOL
1222 UnlockFileEx = kernel32.UnlockFileEx
1223 UnlockFileEx.argtypes = [
1224 ctypes.wintypes.HANDLE, # hFile
1225 ctypes.wintypes.DWORD, # dwReserved
1226 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
1227 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
1228 ctypes.POINTER(OVERLAPPED) # Overlapped
1230 UnlockFileEx.restype = ctypes.wintypes.BOOL
1231 whole_low = 0xffffffff
1232 whole_high = 0x7fffffff
1234 def _lock_file(f, exclusive):
1235 overlapped = OVERLAPPED()
1236 overlapped.Offset = 0
1237 overlapped.OffsetHigh = 0
1238 overlapped.hEvent = 0
1239 f._lock_file_overlapped_p = ctypes.pointer(overlapped)
1240 handle = msvcrt.get_osfhandle(f.fileno())
1241 if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
1242 whole_low, whole_high, f._lock_file_overlapped_p):
1243 raise OSError('Locking file failed: %r' % ctypes.FormatError())
    def _unlock_file(f):
        """Release a lock previously taken by _lock_file()."""
        # _lock_file must have stored the OVERLAPPED pointer on the file object.
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        if not UnlockFileEx(handle, 0,
                            whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())
    # Some platforms, such as Jython, is missing fcntl
        # POSIX path: advisory locks via fcntl.flock().
        def _lock_file(f, exclusive):
            fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)

        def _unlock_file(f):
            fcntl.flock(f, fcntl.LOCK_UN)

        UNSUPPORTED_MSG = 'file locking is not supported on this platform'

        # Fallback stubs used when fcntl is unavailable: any attempt to
        # lock or unlock fails loudly instead of silently doing nothing.
        def _lock_file(f, exclusive):
            raise IOError(UNSUPPORTED_MSG)

        def _unlock_file(f):
            raise IOError(UNSUPPORTED_MSG)
class locked_file(object):
    """File wrapper that holds an advisory lock for the span of a 'with' block."""

    def __init__(self, filename, mode, encoding=None):
        # Only plain read/append/write text modes are supported.
        assert mode in ['r', 'a', 'w']
        self.f = io.open(filename, mode, encoding=encoding)

    def __enter__(self):
        # Writers need an exclusive lock; readers can share.
        # NOTE(review): relies on self.mode being set in __init__ — confirm.
        exclusive = self.mode != 'r'
        _lock_file(self.f, exclusive)

    def __exit__(self, etype, value, traceback):
        _unlock_file(self.f)

    def write(self, *args):
        # Delegate to the underlying file object.
        return self.f.write(*args)

    def read(self, *args):
        return self.f.read(*args)
def get_filesystem_encoding():
    """Return the filesystem encoding, falling back to 'utf-8' when unset."""
    fs_encoding = sys.getfilesystemencoding()
    if fs_encoding is None:
        return 'utf-8'
    return fs_encoding
def shell_quote(args):
    """Quote a list of arguments for safe display as a shell command line."""
    encoding = get_filesystem_encoding()
        if isinstance(a, bytes):
            # We may get a filename encoded with 'encodeFilename'
            a = a.decode(encoding)
        quoted_args.append(pipes.quote(a))
    return ' '.join(quoted_args)
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use. """

    payload = compat_urllib_parse_urlencode(
        {'__youtubedl_smuggle': json.dumps(data)})
    # The payload rides in the fragment, which servers never see.
    return url + '#' + payload
def unsmuggle_url(smug_url, default=None):
    """Extract data embedded by smuggle_url(); yields the clean URL and data."""
    if '#__youtubedl_smuggle' not in smug_url:
        # Nothing smuggled: hand back the URL with the caller's default.
        return smug_url, default
    url, _, sdata = smug_url.rpartition('#')
    jsond = compat_parse_qs(sdata)['__youtubedl_smuggle'][0]
    data = json.loads(jsond)
def format_bytes(bytes):
    """Format a byte count as a human-readable string, e.g. '1.00MiB'."""
    if type(bytes) is str:
        bytes = float(bytes)
        # Pick the largest power of 1024 that fits.
        exponent = int(math.log(bytes, 1024.0))
    suffix = ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB'][exponent]
    converted = float(bytes) / float(1024 ** exponent)
    return '%.2f%s' % (converted, suffix)
def lookup_unit_table(unit_table, s):
    """Parse 'NUMBER UNIT' from *s* using *unit_table* and return the scaled int."""
    units_re = '|'.join(re.escape(u) for u in unit_table)
        r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)\b' % units_re, s)
    # Treat a comma as a decimal separator (e.g. '1,5' -> '1.5').
    num_str = m.group('num').replace(',', '.')
    mult = unit_table[m.group('unit')]
    return int(float(num_str) * mult)
def parse_filesize(s):
    """Parse a human-readable file size (e.g. '5 MiB') into a byte count."""
    # The lower-case forms are of course incorrect and unofficial,
    # but we support those too
    # NOTE(review): _UNIT_TABLE is the local unit->multiplier map defined above.
    return lookup_unit_table(_UNIT_TABLE, s)
1413 if re.match(r'^[\d,.]+$', s):
1414 return str_to_int(s)
1425 return lookup_unit_table(_UNIT_TABLE, s)
def month_by_name(name):
    """ Return the number of a month by (locale-independently) English name """
    # 1-based: 'January' -> 1.
    return ENGLISH_MONTH_NAMES.index(name) + 1
def month_by_abbreviation(abbrev):
    """ Return the number of a month by (locale-independently) English
        three-letter abbreviation, e.g. 'Jan' -> 1 """
    return [s[:3] for s in ENGLISH_MONTH_NAMES].index(abbrev) + 1
def fix_xml_ampersands(xml_str):
    """Replace all the '&' by '&amp;' in XML"""
    # Leave already-escaped entities and character references untouched.
        r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
def setproctitle(title):
    """Set the process title shown by ps/top via libc prctl (no-op elsewhere)."""
    assert isinstance(title, compat_str)

    # ctypes in Jython is not complete
    # http://bugs.jython.org/issue2148
    if sys.platform.startswith('java'):

        libc = ctypes.cdll.LoadLibrary('libc.so.6')
    title_bytes = title.encode('utf-8')
    buf = ctypes.create_string_buffer(len(title_bytes))
    buf.value = title_bytes
        # 15 = PR_SET_NAME (see prctl(2)).
        libc.prctl(15, buf, 0, 0, 0)
    except AttributeError:
        return  # Strange libc, just skip this
def remove_start(s, start):
    """Strip *start* from the beginning of *s* if present."""
    if s.startswith(start):
        return s[len(start):]
def remove_end(s, end):
    """Strip *end* from the end of *s* if present."""
        return s[:-len(end)]
def remove_quotes(s):
    """Strip one matching pair of surrounding single or double quotes."""
    # Too short to carry a quote pair (or nothing at all).
    if s is None or len(s) < 2:
    for quote in ('"', "'", ):
        if s[0] == quote and s[-1] == quote:
def url_basename(url):
    """Return the last path component of *url* ('' for a bare host)."""
    parsed_path = compat_urlparse.urlparse(url).path
    segments = parsed_path.strip('/').split('/')
    return segments[-1]
class HEADRequest(compat_urllib_request.Request):
    # Request subclass that issues HEAD instead of the default method.
    def get_method(self):
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    """Coerce *v* (optionally via attribute *get_attr*) to int, scaled by
    invscale/scale; *default* is used when conversion is impossible."""
        v = getattr(v, get_attr, None)
    # Integer scaling: multiply first, then floor-divide.
    return int(v) * invscale // scale
def str_or_none(v, default=None):
    """Stringify *v* via compat_str, or return *default* when v is None."""
    if v is None:
        return default
    return compat_str(v)
def str_to_int(int_str):
    """ A more relaxed version of int_or_none """
    # Drop thousands separators ('.'/',') and '+' before conversion.
    int_str = re.sub(r'[,\.\+]', '', int_str)
def float_or_none(v, scale=1, invscale=1, default=None):
    """Coerce *v* to float scaled by invscale/scale, or *default* on failure."""
    return float(v) * invscale / scale
def parse_duration(s):
    """Parse a duration string ('1:23:45', '2h 3min', '5.5s', ...) into seconds."""
    if not isinstance(s, compat_basestring):

    days, hours, mins, secs, ms = [None] * 5
    # Colon-separated form: [[[DD:]HH:]MM:]SS[.ms]
    m = re.match(r'(?:(?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?(?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?$', s)
        days, hours, mins, secs, ms = m.groups()
            (?P<days>[0-9]+)\s*d(?:ays?)?\s*
            (?P<hours>[0-9]+)\s*h(?:ours?)?\s*
            (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?\s*
            (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
        days, hours, mins, secs, ms = m.groups()
        # Fractional '2.5 hours' / '90 mins' form.
        m = re.match(r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)$', s)
            hours, mins = m.groups()
    # Sum whichever components matched.
        duration += float(secs)
        duration += float(mins) * 60
        duration += float(hours) * 60 * 60
        duration += float(days) * 24 * 60 * 60
        duration += float(ms)
def prepend_extension(filename, ext, expected_real_ext=None):
    """Insert *ext* before the real extension ('a.mp4' -> 'a.ext.mp4'); append
    it instead when the current extension is not the expected one."""
    name, real_ext = os.path.splitext(filename)
        '{0}.{1}{2}'.format(name, ext, real_ext)
        if not expected_real_ext or real_ext[1:] == expected_real_ext
        else '{0}.{1}'.format(filename, ext))
def replace_extension(filename, ext, expected_real_ext=None):
    """Replace the filename's extension with *ext* (only when the current
    extension matches *expected_real_ext*, if given)."""
    name, real_ext = os.path.splitext(filename)
    return '{0}.{1}'.format(
        # Keep the old extension when it is not the expected one.
        name if not expected_real_ext or real_ext[1:] == expected_real_ext else filename,
def check_executable(exe, args=[]):
    """ Checks if the given binary is installed somewhere in PATH, and returns its name.
    args can be a list of arguments for a short output (like -version) """
    # NOTE: the mutable default args=[] is never mutated here, so it is safe.
    subprocess.Popen([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
def get_exe_version(exe, args=['--version'],
                    version_re=None, unrecognized='present'):
    """ Returns the version of the specified executable,
    or False if the executable is not present """
    out, _ = subprocess.Popen(
        [encodeArgument(exe)] + args,
        stdout=subprocess.PIPE, stderr=subprocess.STDOUT).communicate()
    if isinstance(out, bytes):  # Python 2.x
        out = out.decode('ascii', 'ignore')
    # Delegate pattern matching to detect_exe_version.
    return detect_exe_version(out, version_re, unrecognized)
def detect_exe_version(output, version_re=None, unrecognized='present'):
    """Extract a version string from *output* using *version_re* (defaults to
    a generic 'version X' pattern)."""
    assert isinstance(output, compat_str)
    if version_re is None:
        version_re = r'version\s+([-0-9._a-zA-Z]+)'
    m = re.search(version_re, output)
class PagedList(object):
    # Base class for lazily-paged result lists; subclasses provide getslice().
        # This is only useful for tests
        return len(self.getslice())
class OnDemandPagedList(PagedList):
    """PagedList that fetches pages on demand via pagefunc(pagenum)."""

    def __init__(self, pagefunc, pagesize, use_cache=False):
        self._pagefunc = pagefunc
        self._pagesize = pagesize
        self._use_cache = use_cache

    def getslice(self, start=0, end=None):
        for pagenum in itertools.count(start // self._pagesize):
            firstid = pagenum * self._pagesize
            nextfirstid = pagenum * self._pagesize + self._pagesize
            # Page entirely before the requested window: skip it.
            if start >= nextfirstid:

            page_results = self._cache.get(pagenum)
            if page_results is None:
                page_results = list(self._pagefunc(pagenum))
                self._cache[pagenum] = page_results
            # Trim the page to the requested [start, end) window.
                start % self._pagesize
                if firstid <= start < nextfirstid
                ((end - 1) % self._pagesize) + 1
                if (end is not None and firstid <= end <= nextfirstid)
            if startv != 0 or endv is not None:
                page_results = page_results[startv:endv]
            res.extend(page_results)

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            if len(page_results) + startv < self._pagesize:

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == nextfirstid:
class InAdvancePagedList(PagedList):
    """PagedList where the total page count is known up front."""

    def __init__(self, pagefunc, pagecount, pagesize):
        self._pagefunc = pagefunc
        self._pagecount = pagecount
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        start_page = start // self._pagesize
            # Clamp the final page to the known page count.
            self._pagecount if end is None else (end // self._pagesize + 1))
        skip_elems = start - start_page * self._pagesize
        only_more = None if end is None else end - start
        for pagenum in range(start_page, end_page):
            page = list(self._pagefunc(pagenum))
                # Drop leading elements before *start* on the first page.
                page = page[skip_elems:]
            if only_more is not None:
                if len(page) < only_more:
                    only_more -= len(page)
                    page = page[:only_more]
def uppercase_escape(s):
    """Decode \\UXXXXXXXX escape sequences in *s* into their characters."""
    unicode_escape = codecs.getdecoder('unicode_escape')
        r'\\U[0-9a-fA-F]{8}',
        lambda m: unicode_escape(m.group(0))[0],
def lowercase_escape(s):
    """Decode \\uXXXX escape sequences in *s* into their characters."""
    unicode_escape = codecs.getdecoder('unicode_escape')
        r'\\u[0-9a-fA-F]{4}',
        lambda m: unicode_escape(m.group(0))[0],
def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986"""
    # On Python 2, quote() needs a UTF-8 byte string rather than unicode.
    needs_encoding = sys.version_info < (3, 0) and isinstance(s, compat_str)
    if needs_encoding:
        s = s.encode('utf-8')
    return compat_urllib_parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")
def escape_url(url):
    """Escape URL as suggested by RFC 3986"""
    url_parsed = compat_urllib_parse_urlparse(url)
    return url_parsed._replace(
        # IDNA-encode the host; percent-escape every other component.
        netloc=url_parsed.netloc.encode('idna').decode('ascii'),
        path=escape_rfc3986(url_parsed.path),
        params=escape_rfc3986(url_parsed.params),
        query=escape_rfc3986(url_parsed.query),
        fragment=escape_rfc3986(url_parsed.fragment)
    # Probe whether struct.pack accepts a unicode format string.
    struct.pack('!I', 0)
    # In Python 2.6 and 2.7.x < 2.7.7, struct requires a bytes argument
    # See https://bugs.python.org/issue19099
    def struct_pack(spec, *args):
        # Encode a unicode format spec for old-struct compatibility.
        if isinstance(spec, compat_str):
            spec = spec.encode('ascii')
        return struct.pack(spec, *args)

    def struct_unpack(spec, *args):
        if isinstance(spec, compat_str):
            spec = spec.encode('ascii')
        return struct.unpack(spec, *args)

    # Modern interpreters: use struct directly, no wrapping needed.
    struct_pack = struct.pack
    struct_unpack = struct.unpack
def read_batch_urls(batch_fd):
    """Read URLs from a file-like object, skipping comment and empty lines."""
        if not isinstance(url, compat_str):
            url = url.decode('utf-8', 'replace')
        # Strip a UTF-8 byte-order mark that leaked through as text.
        BOM_UTF8 = '\xef\xbb\xbf'
        if url.startswith(BOM_UTF8):
            url = url[len(BOM_UTF8):]
        # Lines starting with '#', ';' or ']' are treated as comments.
        if url.startswith(('#', ';', ']')):

    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]
def urlencode_postdata(*args, **kargs):
    """URL-encode the arguments and return the result as ASCII bytes."""
    encoded = compat_urllib_parse_urlencode(*args, **kargs)
    return encoded.encode('ascii')
def update_url_query(url, query):
    """Merge *query* into the URL's existing query string and rebuild the URL."""
    parsed_url = compat_urlparse.urlparse(url)
    qs = compat_parse_qs(parsed_url.query)
    return compat_urlparse.urlunparse(parsed_url._replace(
        query=compat_urllib_parse_urlencode(qs, True)))
def update_Request(req, url=None, data=None, headers={}, query={}):
    """Clone a urllib Request, optionally overriding URL, body, headers, query."""
    req_headers = req.headers.copy()
    req_headers.update(headers)
    # Falsy data falls back to the original request body.
    req_data = data or req.data
    req_url = update_url_query(url or req.get_full_url(), query)
    # Preserve the HEAD method when the source request used it.
    req_type = HEADRequest if req.get_method() == 'HEAD' else compat_urllib_request.Request
        req_url, data=req_data, headers=req_headers,
        origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
    # Carry over the (non-standard) timeout attribute when present.
    if hasattr(req, 'timeout'):
        new_req.timeout = req.timeout
def dict_get(d, key_or_keys, default=None, skip_false_values=True):
    """Return d[key] for the first usable key; *key_or_keys* may be a single
    key or a list/tuple of keys tried in order."""
    if isinstance(key_or_keys, (list, tuple)):
        for key in key_or_keys:
            # Skip missing keys, None values and (optionally) falsy values.
            if key not in d or d[key] is None or skip_false_values and not d[key]:
    return d.get(key_or_keys, default)
def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
    """Return *string* unchanged if already a compat_str, else decode it.

    Note: the default *encoding* is evaluated once, at import time.
    """
    if isinstance(string, compat_str):
        return string
    return compat_str(string, encoding, errors)
def parse_age_limit(s):
    """Parse an age limit like '18' or '18+'; fall back to US_RATINGS lookup."""
    m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
    return int(m.group('age')) if m else US_RATINGS.get(s)
def strip_jsonp(code):
    """Strip a JSONP wrapper ('callback(...);') down to the bare payload."""
        r'(?s)^[a-zA-Z0-9_.]+\s*\(\s*(.*)\);?\s*?(?://[^\n]*)*$', r'\1', code)
def js_to_json(code):
    """Convert a JavaScript object literal into valid JSON text."""
        if v in ('true', 'false', 'null'):
        if v.startswith('"'):
            # Normalize \' inside double-quoted strings (invalid in JSON).
            v = re.sub(r"\\'", "'", v[1:-1])
        elif v.startswith("'"):
            v = re.sub(r"\\\\|\\'|\"", lambda m: {

    # Match string literals and bare identifiers, fixing each up via fix_kv.
    res = re.sub(r'''(?x)
        "(?:[^"\\]*(?:\\\\|\\['"nu]))*[^"\\]*"|
        '(?:[^'\\]*(?:\\\\|\\['"nu]))*[^'\\]*'|
        [a-zA-Z_][.a-zA-Z_0-9]*
    res = re.sub(r',(\s*[\]}])', lambda m: m.group(1), res)
def qualities(quality_ids):
    """ Get a numeric quality value out of a list of possible values """
        # The index within the list serves as the quality rank.
        return quality_ids.index(qid)
# Default output filename template: "<title>-<id>.<ext>".
DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
def limit_length(s, length):
    """ Add ellipses to overly long strings """
        # Truncate so the result, ellipses included, fits within *length*.
        return s[:length - len(ELLIPSES)] + ELLIPSES
def version_tuple(v):
    """Split a version string on '.' and '-' into a tuple of ints."""
    components = re.split(r'[-.]', v)
    return tuple(map(int, components))
def is_outdated_version(version, limit, assume_new=True):
    """Return True when *version* < *limit*; unparseable input yields
    `not assume_new`."""
        return not assume_new
    return version_tuple(version) < version_tuple(limit)
    # Comparison failed to parse: fall back to the caller's assumption.
    return not assume_new
def ytdl_is_updateable():
    """ Returns if youtube-dl can be updated with -U """
    from zipimport import zipimporter

    # Updateable when running from a zip bundle or a frozen executable.
    loaded_from_zip = isinstance(globals().get('__loader__'), zipimporter)
    return loaded_from_zip or hasattr(sys, 'frozen')
def args_to_str(args):
    # Get a short string representation for a subprocess command
    quoted = [shlex_quote(a) for a in args]
    return ' '.join(quoted)
def error_to_compat_str(err):
    """Stringify an exception, decoding Python 2 byte messages properly."""
    # On python 2 error byte string must be decoded with proper
    # encoding rather than ascii
    if sys.version_info[0] < 3:
        err_str = err_str.decode(preferredencoding())
def mimetype2ext(mt):
    """Map a MIME type to a conventional file extension."""
    # Use the subtype (text after the slash) as the lookup key.
    _, _, res = mt.rpartition('/')
        'smptett+xml': 'tt',
        'x-mp4-fragmented': 'mp4',
def urlhandle_detect_ext(url_handle):
    """Guess a file extension from a URL response's headers."""
        getheader = lambda h: url_handle.headers[h]
    except AttributeError:  # Python < 3
        getheader = url_handle.info().getheader

    # Prefer the filename given in Content-Disposition, when present.
    cd = getheader('Content-Disposition')
    m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
    e = determine_ext(m.group('filename'), default_ext=None)

    # Fall back to mapping the Content-Type MIME type.
    return mimetype2ext(getheader('Content-Type'))
def encode_data_uri(data, mime_type):
    """Build an RFC 2397 'data:' URI embedding *data* as base64."""
    b64_payload = base64.b64encode(data).decode('ascii')
    return 'data:%s;base64,%s' % (mime_type, b64_payload)
def age_restricted(content_limit, age_limit):
    """ Returns True iff the content should be blocked """
    if age_limit is None:  # No limit set
    if content_limit is None:
        return False  # Content available for everyone
    return age_limit < content_limit
def is_html(first_bytes):
    """ Detect whether a file contains HTML by examining its first bytes. """
    # Known byte-order marks paired with their encodings; the 4-byte UTF-32
    # BOMs are listed before the 2-byte UTF-16 ones so they match first.
        (b'\xef\xbb\xbf', 'utf-8'),
        (b'\x00\x00\xfe\xff', 'utf-32-be'),
        (b'\xff\xfe\x00\x00', 'utf-32-le'),
        (b'\xff\xfe', 'utf-16-le'),
        (b'\xfe\xff', 'utf-16-be'),

    for bom, enc in BOMS:
        if first_bytes.startswith(bom):
            # Decode past the BOM with the matching encoding.
            s = first_bytes[len(bom):].decode(enc, 'replace')
        s = first_bytes.decode('utf-8', 'replace')

    # HTML if the text starts with '<' (after optional whitespace).
    return re.match(r'^\s*<', s)
def determine_protocol(info_dict):
    """Infer the download protocol for an info dict from its URL/extension."""
    protocol = info_dict.get('protocol')
    if protocol is not None:

    url = info_dict['url']
    if url.startswith('rtmp'):
    elif url.startswith('mms'):
    elif url.startswith('rtsp'):

    ext = determine_ext(url)

    # Fall back to the URL scheme.
    return compat_urllib_parse_urlparse(url).scheme
def render_table(header_row, data):
    """ Render a list of rows, each as a list of values """
    rows = [header_row] + data
    widths = []
    for col in zip(*rows):
        widths.append(max(len(compat_str(cell)) for cell in col))
    # Left-pad every column but the last to its widest cell plus one space.
    fmt = ' '.join('%-' + compat_str(w + 1) + 's' for w in widths[:-1]) + '%s'
    return '\n'.join(fmt % tuple(row) for row in rows)
def _match_one(filter_part, dct):
    """Evaluate one filter expression (e.g. 'duration > 60') against *dct*."""
    COMPARISON_OPERATORS = {
    # Binary form: key <op> value, where value is an int, a size (e.g. 10M)
    # or a bare string.
    operator_rex = re.compile(r'''(?x)\s*
        \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
        (?P<intval>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)|
        (?P<strval>(?![0-9.])[a-z0-9A-Z]*)
        ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
    m = operator_rex.search(filter_part)
        op = COMPARISON_OPERATORS[m.group('op')]
        if m.group('strval') is not None:
            if m.group('op') not in ('=', '!='):
                'Operator %s does not support string values!' % m.group('op'))
            comparison_value = m.group('strval')
            comparison_value = int(m.group('intval'))
            # Not a plain integer: try parsing as a file size (e.g. '500K'),
            # then with an implicit 'B' suffix.
            comparison_value = parse_filesize(m.group('intval'))
            if comparison_value is None:
                comparison_value = parse_filesize(m.group('intval') + 'B')
                if comparison_value is None:
                'Invalid integer value %r in filter part %r' % (
                    m.group('intval'), filter_part))
        actual_value = dct.get(m.group('key'))
        if actual_value is None:
            # Missing values only match when the '?' suffix was given.
            return m.group('none_inclusive')
        return op(actual_value, comparison_value)

    # Unary existence operators: 'key' / '!key'.
        '': lambda v: v is not None,
        '!': lambda v: v is None,
    operator_rex = re.compile(r'''(?x)\s*
        (?P<op>%s)\s*(?P<key>[a-z_]+)
        ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
    m = operator_rex.search(filter_part)
        op = UNARY_OPERATORS[m.group('op')]
        actual_value = dct.get(m.group('key'))
        return op(actual_value)

    raise ValueError('Invalid filter part %r' % filter_part)
def match_str(filter_str, dct):
    """ Filter a dictionary with a simple string syntax. Returns True (=passes filter) or false """
    # All '&'-separated parts must match.
        _match_one(filter_part, dct) for filter_part in filter_str.split('&'))
def match_filter_func(filter_str):
    """Build a match-filter callback: returns None to accept a video, or a
    human-readable skip message when the filter rejects it."""
    def _match_func(info_dict):
        if match_str(filter_str, info_dict):
            video_title = info_dict.get('title', info_dict.get('id', 'video'))
            return '%s does not pass filter %s, skipping ..' % (video_title, filter_str)
def parse_dfxp_time_expr(time_expr):
    """Parse a DFXP/TTML time expression into seconds (float)."""
    # Plain offset form, e.g. '12.345s' or '12.345'.
    mobj = re.match(r'^(?P<time_offset>\d+(?:\.\d+)?)s?$', time_expr)
        return float(mobj.group('time_offset'))

    # Clock form HH:MM:SS with '.'- or ':'-separated fraction.
    mobj = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
        return 3600 * int(mobj.group(1)) + 60 * int(mobj.group(2)) + float(mobj.group(3).replace(':', '.'))
def srt_subtitles_timecode(seconds):
    """Format a second count as an SRT timecode, HH:MM:SS,mmm."""
    hrs = seconds / 3600
    mins = (seconds % 3600) / 60
    secs = seconds % 60
    millis = (seconds % 1) * 1000
    return '%02d:%02d:%02d,%03d' % (hrs, mins, secs, millis)
def dfxp2srt(dfxp_data):
    """Convert DFXP/TTML subtitle markup into SRT text."""
    _x = functools.partial(xpath_with_ns, ns_map={
        'ttml': 'http://www.w3.org/ns/ttml',
        'ttaf1': 'http://www.w3.org/2006/10/ttaf1',
        'ttaf1_0604': 'http://www.w3.org/2006/04/ttaf1',

    # Streaming parser target that flattens a <p> element into plain text.
    class TTMLPElementParser(object):
        def start(self, tag, attrib):
            # <br> in any supported namespace becomes a newline.
            if tag in (_x('ttml:br'), _x('ttaf1:br'), 'br'):

        def data(self, data):

            return self.out.strip()

    def parse_node(node):
        target = TTMLPElementParser()
        parser = xml.etree.ElementTree.XMLParser(target=target)
        parser.feed(xml.etree.ElementTree.tostring(node))
        return parser.close()

    dfxp = compat_etree_fromstring(dfxp_data.encode('utf-8'))
    # Paragraphs may live in any of the supported TTML namespaces.
    paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall(_x('.//ttaf1:p')) or dfxp.findall(_x('.//ttaf1_0604:p')) or dfxp.findall('.//p')
        raise ValueError('Invalid dfxp/TTML subtitle')

    for para, index in zip(paras, itertools.count(1)):
        begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
        end_time = parse_dfxp_time_expr(para.attrib.get('end'))
        dur = parse_dfxp_time_expr(para.attrib.get('dur'))
        if begin_time is None:
            # Derive the end from begin + dur when no explicit end is given.
            end_time = begin_time + dur
        # SRT block: index, time range, then the subtitle text.
        out.append('%d\n%s --> %s\n%s\n\n' % (
            srt_subtitles_timecode(begin_time),
            srt_subtitles_timecode(end_time),
def cli_option(params, command_option, param):
    """Return [command_option, value] when params[param] is set, else []."""
    value = params.get(param)
    if value is None:
        return []
    return [command_option, value]
def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
    """Render a boolean param as CLI arguments, joined by *separator* if given."""
    param = params.get(param)
    assert isinstance(param, bool)
        # Single 'option<separator>value' style argument.
        return [command_option + separator + (true_value if param else false_value)]
    return [command_option, true_value if param else false_value]
def cli_valueless_option(params, command_option, param, expected_value=True):
    """Return [command_option] when params[param] equals expected_value, else []."""
    if params.get(param) == expected_value:
        return [command_option]
    return []
def cli_configuration_args(params, param, default=[]):
    """Fetch a list-valued extra-arguments entry from *params*."""
    ex_args = params.get(param)
    # Configuration arguments must already be a list of strings.
    assert isinstance(ex_args, list)
class ISO639Utils(object):
    # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
    def short2long(cls, code):
        """Convert language code from ISO 639-1 to ISO 639-2/T"""
        # Only the first two characters are significant for 639-1.
        return cls._lang_map.get(code[:2])

    def long2short(cls, code):
        """Convert language code from ISO 639-2/T to ISO 639-1"""
        # Reverse lookup over the short->long mapping.
        for short_name, long_name in cls._lang_map.items():
            if long_name == code:
class ISO3166Utils(object):
    # From http://data.okfn.org/data/core/country-list
    # Mapping of ISO 3166-1 alpha-2 codes to official English country names.
        'AF': 'Afghanistan',
        'AX': 'Åland Islands',
        'AS': 'American Samoa',
        'AG': 'Antigua and Barbuda',
        'BO': 'Bolivia, Plurinational State of',
        'BQ': 'Bonaire, Sint Eustatius and Saba',
        'BA': 'Bosnia and Herzegovina',
        'BV': 'Bouvet Island',
        'IO': 'British Indian Ocean Territory',
        'BN': 'Brunei Darussalam',
        'BF': 'Burkina Faso',
        'KY': 'Cayman Islands',
        'CF': 'Central African Republic',
        'CX': 'Christmas Island',
        'CC': 'Cocos (Keeling) Islands',
        'CD': 'Congo, the Democratic Republic of the',
        'CK': 'Cook Islands',
        'CI': 'Côte d\'Ivoire',
        'CZ': 'Czech Republic',
        'DO': 'Dominican Republic',
        'SV': 'El Salvador',
        'GQ': 'Equatorial Guinea',
        'FK': 'Falkland Islands (Malvinas)',
        'FO': 'Faroe Islands',
        'GF': 'French Guiana',
        'PF': 'French Polynesia',
        'TF': 'French Southern Territories',
        'GW': 'Guinea-Bissau',
        'HM': 'Heard Island and McDonald Islands',
        'VA': 'Holy See (Vatican City State)',
        'IR': 'Iran, Islamic Republic of',
        'IM': 'Isle of Man',
        'KP': 'Korea, Democratic People\'s Republic of',
        'KR': 'Korea, Republic of',
        'LA': 'Lao People\'s Democratic Republic',
        'LI': 'Liechtenstein',
        'MK': 'Macedonia, the Former Yugoslav Republic of',
        'MH': 'Marshall Islands',
        'FM': 'Micronesia, Federated States of',
        'MD': 'Moldova, Republic of',
        'NL': 'Netherlands',
        'NC': 'New Caledonia',
        'NZ': 'New Zealand',
        'NF': 'Norfolk Island',
        'MP': 'Northern Mariana Islands',
        'PS': 'Palestine, State of',
        'PG': 'Papua New Guinea',
        'PH': 'Philippines',
        'PR': 'Puerto Rico',
        'RU': 'Russian Federation',
        'BL': 'Saint Barthélemy',
        'SH': 'Saint Helena, Ascension and Tristan da Cunha',
        'KN': 'Saint Kitts and Nevis',
        'LC': 'Saint Lucia',
        'MF': 'Saint Martin (French part)',
        'PM': 'Saint Pierre and Miquelon',
        'VC': 'Saint Vincent and the Grenadines',
        'ST': 'Sao Tome and Principe',
        'SA': 'Saudi Arabia',
        'SL': 'Sierra Leone',
        'SX': 'Sint Maarten (Dutch part)',
        'SB': 'Solomon Islands',
        'ZA': 'South Africa',
        'GS': 'South Georgia and the South Sandwich Islands',
        'SS': 'South Sudan',
        'SJ': 'Svalbard and Jan Mayen',
        'CH': 'Switzerland',
        'SY': 'Syrian Arab Republic',
        'TW': 'Taiwan, Province of China',
        'TZ': 'Tanzania, United Republic of',
        'TL': 'Timor-Leste',
        'TT': 'Trinidad and Tobago',
        'TM': 'Turkmenistan',
        'TC': 'Turks and Caicos Islands',
        'AE': 'United Arab Emirates',
        'GB': 'United Kingdom',
        'US': 'United States',
        'UM': 'United States Minor Outlying Islands',
        'VE': 'Venezuela, Bolivarian Republic of',
        'VG': 'Virgin Islands, British',
        'VI': 'Virgin Islands, U.S.',
        'WF': 'Wallis and Futuna',
        'EH': 'Western Sahara',

    def short2full(cls, code):
        """Convert an ISO 3166-2 country code to the corresponding full name"""
        # Codes are stored upper-case, so normalise the input first.
        return cls._country_map.get(code.upper())
class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
    """ProxyHandler honoring a per-request 'Ytdl-request-proxy' header."""

    def __init__(self, proxies=None):
        # Set default handlers
        for type in ('http', 'https'):
            setattr(self, '%s_open' % type,
                    # Bind proxy/type/meth as defaults to avoid the
                    # late-binding-closure pitfall inside the loop.
                    lambda r, proxy='__noproxy__', type=type, meth=self.proxy_open:
                        meth(r, proxy, type))
        return compat_urllib_request.ProxyHandler.__init__(self, proxies)

    def proxy_open(self, req, proxy, type):
        req_proxy = req.headers.get('Ytdl-request-proxy')
        if req_proxy is not None:
            # Internal header: consume it so it is never sent on the wire.
            del req.headers['Ytdl-request-proxy']

        if proxy == '__noproxy__':
            return None  # No Proxy
        return compat_urllib_request.ProxyHandler.proxy_open(
            self, req, proxy, type)
def ohdave_rsa_encrypt(data, exponent, modulus):
    '''
    Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/

    Input:
        data: data to encrypt, bytes-like object
        exponent, modulus: parameter e and N of RSA algorithm, both integer
    Output: hex string of encrypted data

    Limitation: supports one block encryption only
    '''
    # Interpret the reversed bytes as a big integer, then modular-exponentiate.
    payload = int(binascii.hexlify(data[::-1]), 16)
    encrypted = pow(payload, exponent, modulus)
    return '%x' % encrypted
def encode_base_n(num, n, table=None):
    """Encode integer *num* in base *n* using digit *table* (default 0-9a-zA-Z)."""
    FULL_TABLE = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
        table = FULL_TABLE[:n]

        raise ValueError('base %d exceeds table length %d' % (n, len(table)))

        # Prepend digits, least-significant first.
        ret = table[num % n] + ret
2741 def decode_packed_codes(code):
2743 r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)",
2745 obfucasted_code, base, count, symbols = mobj.groups()
2748 symbols = symbols.split('|')
2753 base_n_count = encode_base_n(count, base)
2754 symbol_table[base_n_count] = symbols[count] or base_n_count
2757 r'\b(\w+)\b', lambda mobj: symbol_table[mobj.group(0)],