2 # -*- coding: utf-8 -*-
4 from __future__ import unicode_literals
34 import xml.etree.ElementTree
41 compat_etree_fromstring,
46 compat_socket_create_connection,
50 compat_urllib_parse_urlencode,
51 compat_urllib_parse_urlparse,
52 compat_urllib_request,
# This is not clearly defined otherwise
compiled_regex_type = type(re.compile(''))

# Default HTTP request headers sent with every request unless overridden.
# NOTE(review): the opening "std_headers = {" assignment line appears to be
# missing from this copy — confirm against upstream.
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20150101 Firefox/44.0 (Chrome)',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',

# English month names, used by the date parsing helpers below.
ENGLISH_MONTH_NAMES = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December']

# Known media/manifest file extensions.
# NOTE(review): the "KNOWN_EXTENSIONS = (" opener (and some entries) appear
# to be missing from this copy.
    'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'aac',
    'flv', 'f4v', 'f4a', 'f4b',
    'webm', 'ogg', 'ogv', 'oga', 'ogx', 'spx', 'opus',
    'f4f', 'f4m', 'm3u8', 'smil')
def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks.
    """
    # NOTE(review): upstream wraps this in try/except with a fallback; the
    # surrounding lines (and the return statement) appear missing here.
    pref = locale.getpreferredencoding()
def write_json_file(obj, fn):
    """ Encode obj as JSON and write it to fn, atomically if possible """
    # NOTE(review): multiple lines of this function (the else: marker, the
    # tempfile kwargs dict opener, the json.dump call, error handling and
    # chmod logic) appear to be missing from this copy.

    fn = encodeFilename(fn)
    if sys.version_info < (3, 0) and sys.platform != 'win32':
        encoding = get_filesystem_encoding()
        # os.path.basename returns a bytes object, but NamedTemporaryFile
        # will fail if the filename contains non ascii characters unless we
        # use a unicode object
        path_basename = lambda f: os.path.basename(fn).decode(encoding)
        # the same for os.path.dirname
        path_dirname = lambda f: os.path.dirname(fn).decode(encoding)
        # (these plain rebindings presumably belong to a missing else branch)
        path_basename = os.path.basename
        path_dirname = os.path.dirname

        # kwargs for NamedTemporaryFile (dict opener missing above)
        'prefix': path_basename(fn) + '.',
        'dir': path_dirname(fn),

    # In Python 2.x, json.dump expects a bytestream.
    # In Python 3.x, it writes to a character stream
    if sys.version_info < (3, 0):

    tf = tempfile.NamedTemporaryFile(**compat_kwargs(args))

    if sys.platform == 'win32':
        # Need to remove existing file on Windows, else os.rename raises
        # WindowsError or FileExistsError.
    os.rename(tf.name, fn)
if sys.version_info >= (2, 7):
    def find_xpath_attr(node, xpath, key, val=None):
        """ Find the xpath xpath[@key=val] """
        assert re.match(r'^[a-zA-Z_-]+$', key)
        expr = xpath + ('[@%s]' % key if val is None else "[@%s='%s']" % (key, val))
        return node.find(expr)
    # NOTE(review): the "else:" introducing this 2.6 fallback, plus its
    # continue/return statements, appear to be missing from this copy.
    def find_xpath_attr(node, xpath, key, val=None):
        # Manual scan: 2.6 ElementTree lacks attribute predicates in xpath.
        for f in node.findall(compat_xpath(xpath)):
            if key not in f.attrib:
            if val is None or f.attrib.get(key) == val:
# On python2.6 the xml.etree.ElementTree.Element methods don't support
# the namespace parameter


def xpath_with_ns(path, ns_map):
    """Expand 'ns:tag' steps of an xpath via ns_map into '{uri}tag' form."""
    components = [c.split(':') for c in path.split('/')]
    # NOTE(review): the accumulator init, loop header and the branch that
    # unpacks "ns, tag" appear to be missing from this copy.
            replaced.append(c[0])
            replaced.append('{%s}%s' % (ns_map[ns], tag))
    return '/'.join(replaced)
def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    """Find an XML element; honour default/fatal when it is absent."""
    def _find_xpath(xpath):
        return node.find(compat_xpath(xpath))

    if isinstance(xpath, (str, compat_str)):
        n = _find_xpath(xpath)
    # NOTE(review): the multi-xpath branch, the "if n is None" test and the
    # "return default" / "return n" statements appear missing from this copy.
        if default is not NO_DEFAULT:
        name = xpath if name is None else name
        raise ExtractorError('Could not find XML element %s' % name)
def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    """Like xpath_element but return the element's text."""
    n = xpath_element(node, xpath, name, fatal=fatal, default=default)
    if n is None or n == default:
        # NOTE(review): the "return n" for this branch and the fatal check
        # appear to be missing from this copy.
    if default is not NO_DEFAULT:
        name = xpath if name is None else name
        raise ExtractorError('Could not find XML element\'s text %s' % name)
def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
    """Find xpath and return attribute `key`; honour default/fatal if absent."""
    n = find_xpath_attr(node, xpath, key)
    # NOTE(review): the "if n is None" test, returns and fatal check appear
    # to be missing from this copy.
    if default is not NO_DEFAULT:
        name = '%s[@%s]' % (xpath, key) if name is None else name
        raise ExtractorError('Could not find XML attribute %s' % name)
def get_element_by_id(id, html):
    """Return the inner content of the element whose id attribute equals `id`
    in the given HTML document."""
    # An id lookup is just an attribute lookup with attribute name 'id'.
    return get_element_by_attribute('id', id, html)
def get_element_by_attribute(attribute, value, html):
    """Return the content of the tag with the specified attribute in the passed HTML document"""
    # NOTE(review): the opening-tag pattern, the (?P<content>...) group, the
    # closing-tag pattern and the None-check on the match appear to be
    # missing from this copy of the regex/function.
    m = re.search(r'''(?xs)
        (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
        (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
        ''' % (re.escape(attribute), re.escape(value)), html)

    res = m.group('content')

    if res.startswith('"') or res.startswith("'"):
        # (quote-stripping line appears to be missing here)
    return unescapeHTML(res)
class HTMLAttributeParser(compat_HTMLParser):
    """Trivial HTML parser to gather the attributes for a single element"""
    # NOTE(review): the __init__ header and the self.attrs initialisation
    # appear to be missing from this copy.
        compat_HTMLParser.__init__(self)

    def handle_starttag(self, tag, attrs):
        # Record the attributes of the first (only) start tag seen.
        self.attrs = dict(attrs)
def extract_attributes(html_element):
    """Given a string for an HTML element such as
    <el
         a="foo" B="bar" c="&98;az" d=boz
         empty= noval entity="&amp;"
         ...>
    Decode and return a dictionary of attributes.
    {
        'a': 'foo', 'b': 'bar', c: 'baz', d: 'boz',
        'empty': '', 'noval': None, 'entity': '&',
        'sq': '"', 'dq': '\''
    }.
    NB HTMLParser is stricter in Python 2.6 & 3.2 than in later versions,
    but the cases in the unit test will work for all of 2.6, 2.7, 3.2-3.5.
    """
    # NOTE(review): the docstring example lines above were reconstructed;
    # the final "return parser.attrs" also appears missing from this copy.
    parser = HTMLAttributeParser()
    parser.feed(html_element)
def clean_html(html):
    """Clean an HTML snippet into a readable string"""
    # NOTE(review): the None early-return body, newline normalisation lines
    # and the final strip/return appear to be missing from this copy.

    if html is None:  # Convenience for sanitizing descriptions etc.

    html = html.replace('\n', ' ')
    html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
    html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
    # Strip remaining tags
    html = re.sub('<.*?>', '', html)
    # Replace html entities
    html = unescapeHTML(html)
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    It returns the tuple (stream, definitive_file_name).
    """
    # NOTE(review): the enclosing try:, the '-' (stdout) special-case test
    # and the re-raise path appear to be missing from this copy.
    if sys.platform == 'win32':
        # Put stdout into binary mode so written media bytes are not mangled.
        msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
        return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
    stream = open(encodeFilename(filename), open_mode)
    return (stream, filename)
    except (IOError, OSError) as err:
        if err.errno in (errno.EACCES,):
            # (re-raise appears to be missing here)
        # In case of error, try to remove win32 forbidden chars
        alt_filename = sanitize_path(filename)
        if alt_filename == filename:
            # (re-raise appears to be missing here)
        # An exception here should be caught in the caller
        stream = open(encodeFilename(alt_filename), open_mode)
        return (stream, alt_filename)
def timeconvert(timestr):
    """Convert an RFC 2822 time string into a Unix timestamp.

    Returns the integer timestamp, or None if the string cannot be parsed.
    """
    # Defect fixed: the visible code never initialised nor returned
    # `timestamp`, so the function always returned None.
    timestamp = None
    timetuple = email.utils.parsedate_tz(timestr)
    if timetuple is not None:
        timestamp = email.utils.mktime_tz(timetuple)
    return timestamp
def sanitize_filename(s, restricted=False, is_id=False):
    """Sanitizes a string so it could be used as part of a filename.
    If restricted is set, use a stricter subset of allowed characters.
    Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
    """
    # NOTE(review): several branches of replace_insane (the returns for the
    # control-char / slash cases, accent stripping) and parts of the tail
    # (is_id guard, empty-result fallback, return) appear missing here.
    def replace_insane(char):
        if char == '?' or ord(char) < 32 or ord(char) == 127:
            # (return for this case appears missing)
            return '' if restricted else '\''
            return '_-' if restricted else ' -'
        elif char in '\\/|*<>':
        if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
        if restricted and ord(char) > 127:

    # Replace look-alike ':' inside timestamps (e.g. 12:34:56 -> 12_34_56).
    s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)
    result = ''.join(map(replace_insane, s))
    # Collapse runs of underscores introduced by replacement.
    while '__' in result:
        result = result.replace('__', '_')
    result = result.strip('_')
    # Common case of "Foreign band name - English song title"
    if restricted and result.startswith('-_'):
        # (slice-off appears missing here)
    if result.startswith('-'):
        result = '_' + result[len('-'):]
    result = result.lstrip('.')
def sanitize_path(s):
    """Sanitizes and normalizes path on Windows"""
    # NOTE(review): the non-win32 early return body, the sanitized_path list
    # comprehension opener and the drive_or_unc pop appear missing here.
    if sys.platform != 'win32':
    drive_or_unc, _ = os.path.splitdrive(s)
    if sys.version_info < (2, 7) and not drive_or_unc:
        drive_or_unc, _ = os.path.splitunc(s)
    norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
        # Keep '.'/'..' verbatim; replace forbidden win32 chars with '#'.
        path_part if path_part in ['.', '..'] else re.sub('(?:[/<>:"\\|\\\\?\\*]|[\s.]$)', '#', path_part)
        for path_part in norm_path]
    sanitized_path.insert(0, drive_or_unc + os.path.sep)
    return os.path.join(*sanitized_path)
418 # Prepend protocol-less URLs with `http:` scheme in order to mitigate the number of
419 # unwanted failures due to missing protocol
def sanitized_Request(url, *args, **kwargs):
    """Build a urllib Request, defaulting scheme-relative '//' URLs to http:
    to mitigate failures caused by a missing protocol."""
    if url.startswith('//'):
        url = 'http:%s' % url
    return compat_urllib_request.Request(url, *args, **kwargs)
def orderedSet(iterable):
    """ Remove all duplicates from the input iterable, preserving order. """
    # Defect fixed: the visible function had no body beyond its docstring,
    # so it returned None instead of the de-duplicated sequence.
    res = []
    for el in iterable:
        if el not in res:
            res.append(el)
    return res
def _htmlentity_transform(entity):
    """Transforms an HTML entity to a character."""
    # Known non-numeric HTML entity
    if entity in compat_html_entities.name2codepoint:
        return compat_chr(compat_html_entities.name2codepoint[entity])

    # Numeric character reference (&#123; or &#x1F;)
    mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
    # NOTE(review): the "if mobj is not None:" guard, base selection and the
    # surrounding try/except appear to be missing from this copy.
        numstr = mobj.group(1)
        if numstr.startswith('x'):
            numstr = '0%s' % numstr
        # See https://github.com/rg3/youtube-dl/issues/7518
        return compat_chr(int(numstr, base))

    # Unknown entity in name, return its literal representation
    return '&%s;' % entity
461 assert type(s) == compat_str
464 r'&([^;]+);', lambda m: _htmlentity_transform(m.group(1)), s)
def get_subprocess_encoding():
    """Pick the byte encoding to use when talking to subprocesses."""
    if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
        # For subprocess calls, encode with locale encoding
        # Refer to http://stackoverflow.com/a/9951851/35070
        encoding = preferredencoding()
        # NOTE(review): the "else:" branch marker and the final fallback /
        # return statements appear to be missing from this copy.
        encoding = sys.getfilesystemencoding()
def encodeFilename(s, for_subprocess=False):
    """Encode a unicode filename for the OS / subprocess layer.

    @param s The name of the file
    """
    # NOTE(review): the docstring close, the "return s" statements for the
    # py3/win32/Jython branches, appear to be missing from this copy.
    assert type(s) == compat_str

    # Python 3 has a Unicode API
    if sys.version_info >= (3, 0):

    # Pass '' directly to use Unicode APIs on Windows 2000 and up
    # (Detecting Windows NT 4 is tricky because 'major >= 4' would
    # match Windows 9x series as well. Besides, NT 4 is obsolete.)
    if not for_subprocess and sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:

    # Jython assumes filenames are Unicode strings though reported as Python 2.x compatible
    if sys.platform.startswith('java'):

    return s.encode(get_subprocess_encoding(), 'ignore')
def decodeFilename(b, for_subprocess=False):
    """Inverse of encodeFilename: decode a filename from the OS layer."""
    # NOTE(review): the "return b" bodies of the first two guards appear to
    # be missing from this copy.
    if sys.version_info >= (3, 0):

    if not isinstance(b, bytes):

    return b.decode(get_subprocess_encoding(), 'ignore')
def encodeArgument(s):
    """Encode a command-line argument for subprocess use (via encodeFilename)."""
    if isinstance(s, compat_str):
        arg = s
    else:
        # Legacy code still hands us byte strings; they must be plain ASCII.
        # Once all post processors are fixed this should become:
        # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
        arg = s.decode('ascii')
    return encodeFilename(arg, True)
def decodeArgument(b):
    """Inverse of encodeArgument: decode a subprocess argument to unicode."""
    return decodeFilename(b, for_subprocess=True)
def decodeOption(optval):
    """Decode a command-line option value to a unicode string."""
    # NOTE(review): the None early-return and the final "return optval"
    # appear to be missing from this copy.
    if isinstance(optval, bytes):
        optval = optval.decode(preferredencoding())

    assert isinstance(optval, compat_str)
def formatSeconds(secs):
    """Format a duration in seconds as H:MM:SS, M:SS or S."""
    # NOTE(review): the "if secs > 3600:" / "elif secs > 60:" / "else:"
    # branch headers appear to be missing from this copy.
        return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
        return '%d:%02d' % (secs // 60, secs % 60)
def make_HTTPS_handler(params, **kwargs):
    """Build a YoutubeDLHTTPSHandler honouring the 'nocheckcertificate' option."""
    opts_no_check_certificate = params.get('nocheckcertificate', False)
    if hasattr(ssl, 'create_default_context'):  # Python >= 3.4 or 2.7.9
        context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH)
        if opts_no_check_certificate:
            context.check_hostname = False
            context.verify_mode = ssl.CERT_NONE
        # NOTE(review): the try/except TypeError fallback surrounding this
        # return appears to be missing from this copy.
            return YoutubeDLHTTPSHandler(params, context=context, **kwargs)

    # (create_default_context present but HTTPSHandler has no context=)

    if sys.version_info < (3, 2):
        return YoutubeDLHTTPSHandler(params, **kwargs)
    # NOTE(review): the "else:  # Python < 3.4" branch marker appears to be
    # missing from this copy.
        context = ssl.SSLContext(ssl.PROTOCOL_TLSv1)
        context.verify_mode = (ssl.CERT_NONE
                               if opts_no_check_certificate
                               else ssl.CERT_REQUIRED)
        context.set_default_verify_paths()
        return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
def bug_reports_message():
    """Build the standard 'please report this issue' message suffix."""
    # NOTE(review): the "else:" branch marker and the final "return msg"
    # appear to be missing from this copy.
    if ytdl_is_updateable():
        update_cmd = 'type  youtube-dl -U  to update'
        update_cmd = 'see  https://yt-dl.org/update  on how to update'
    msg = '; please report this issue on https://yt-dl.org/bug .'
    msg += ' Make sure you are using the latest version; %s.' % update_cmd
    msg += ' Be sure to call youtube-dl with the --verbose flag and include its complete output.'
class ExtractorError(Exception):
    """Error during info extraction."""

    def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
        """
        # NOTE(review): several statements (the expected=True assignment,
        # the "if cause:" / "if not expected:" guards, and most attribute
        # assignments) appear to be missing from this copy.
        if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
        if video_id is not None:
            msg = video_id + ': ' + msg
            msg += ' (caused by %r)' % cause
            msg += bug_reports_message()
        super(ExtractorError, self).__init__(msg)

        self.exc_info = sys.exc_info()  # preserve original exception
        self.video_id = video_id

    def format_traceback(self):
        # Render the stored traceback, if any, as a string.
        if self.traceback is None:
        return ''.join(traceback.format_tb(self.traceback))
class UnsupportedError(ExtractorError):
    """Raised for URLs that no extractor supports; always an 'expected' error."""

    def __init__(self, url):
        message = 'Unsupported URL: %s' % url
        super(UnsupportedError, self).__init__(message, expected=True)
class RegexNotFoundError(ExtractorError):
    """Raised when a required regular expression fails to match."""
    pass
class DownloadError(Exception):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """

    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        super(DownloadError, self).__init__(msg)
        self.exc_info = exc_info
class SameFileError(Exception):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
class PostProcessingError(Exception):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """

    def __init__(self, msg):
        # NOTE(review): the super().__init__ call / self.msg assignment
        # appears to be missing from this copy.
class MaxDownloadsReached(Exception):
    """Raised once the --max-downloads limit has been reached."""
    pass
class UnavailableVideoError(Exception):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """
class ContentTooShortError(Exception):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """

    def __init__(self, downloaded, expected):
        # Both values are byte counts.
        self.downloaded = downloaded
        self.expected = expected
def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
    """Create an HTTP(S) connection, optionally bound to a source address."""
    # Working around python 2 bug (see http://bugs.python.org/issue17849) by limiting
    # expected HTTP responses to meet HTTP/1.0 or later (see also
    # https://github.com/rg3/youtube-dl/issues/6727)
    if sys.version_info < (3, 0):
        kwargs[b'strict'] = True
    hc = http_class(*args, **kwargs)
    source_address = ydl_handler._params.get('source_address')
    if source_address is not None:
        sa = (source_address, 0)
        if hasattr(hc, 'source_address'):  # Python 2.7+
            hc.source_address = sa
        # NOTE(review): the "else:  # Python 2.6" branch marker and parts of
        # the is_https/plain fallback appear to be missing from this copy.
            def _hc_connect(self, *args, **kwargs):
                sock = compat_socket_create_connection(
                    (self.host, self.port), self.timeout, sa)
                # Wrap in TLS when this is an https connection.
                self.sock = ssl.wrap_socket(
                    sock, self.key_file, self.cert_file,
                    ssl_version=ssl.PROTOCOL_TLSv1)
            hc.connect = functools.partial(_hc_connect, hc)
    # NOTE(review): the final "return hc" appears to be missing from this copy.
def handle_youtubedl_headers(headers):
    """Translate internal pseudo headers into real header behaviour.

    'Youtubedl-no-compression' is removed and, when present, suppresses any
    Accept-Encoding header.  The input mapping is returned untouched when the
    pseudo header is absent.
    """
    if 'Youtubedl-no-compression' not in headers:
        return headers
    stripped = dict((key, value) for key, value in headers.items()
                    if key.lower() != 'accept-encoding')
    stripped.pop('Youtubedl-no-compression')
    return stripped
class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-no-compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """
    # NOTE(review): several lines of this class (the deflate() staticmethod
    # header and its try/except, parts of http_request/http_response) appear
    # to be missing from this copy; review notes are placed inline.

    def __init__(self, params, *args, **kwargs):
        compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
        self._params = params

    def http_open(self, req):
        # Route connection creation through _create_http_connection so the
        # source_address option is honoured.
        return self.do_open(functools.partial(
            _create_http_connection, self, compat_http_client.HTTPConnection, False),
            # NOTE(review): the trailing "req)" argument line appears missing.

    # NOTE(review): the "@staticmethod def deflate(data):" header and its
    # try/except appear to be missing around these two lines.
        return zlib.decompress(data, -zlib.MAX_WBITS)
            return zlib.decompress(data)

    @staticmethod
    def addinfourl_wrapper(stream, headers, url, code):
        # Older urllibs lack getcode(); emulate it on the fallback path.
        if hasattr(compat_urllib_request.addinfourl, 'getcode'):
            return compat_urllib_request.addinfourl(stream, headers, url, code)
        ret = compat_urllib_request.addinfourl(stream, headers, url)
        # NOTE(review): "ret.code = code" / "return ret" appear missing here.

    def http_request(self, req):
        # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
        # always respected by websites, some tend to give out URLs with non percent-encoded
        # non-ASCII characters (see telemb.py, ard.py [#3412])
        # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
        # To work around aforementioned issue we will replace request's original URL with
        # percent-encoded one
        # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
        # the code of this workaround has been moved here from YoutubeDL.urlopen()
        url = req.get_full_url()
        url_escaped = escape_url(url)

        # Substitute URL if any change after escaping
        if url != url_escaped:
            req_type = HEADRequest if req.get_method() == 'HEAD' else compat_urllib_request.Request
            # NOTE(review): the "new_req = req_type(" opener appears missing.
                url_escaped, data=req.data, headers=req.headers,
                origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
            new_req.timeout = req.timeout
            # NOTE(review): "req = new_req" appears missing here.

        for h, v in std_headers.items():
            # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
            # The dict keys are capitalized because of this bug by urllib
            if h.capitalize() not in req.headers:
                # NOTE(review): "req.add_header(h, v)" appears missing here.

        req.headers = handle_youtubedl_headers(req.headers)

        if sys.version_info < (2, 7) and '#' in req.get_full_url():
            # Python 2.6 is brain-dead when it comes to fragments
            req._Request__original = req._Request__original.partition('#')[0]
            req._Request__r_type = req._Request__r_type.partition('#')[0]
        # NOTE(review): the final "return req" appears missing here.

    def http_response(self, req, resp):
        # NOTE(review): "old_resp = resp" and the gzip comment appear missing.
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
            # NOTE(review): the try: around this read appears missing.
                uncompressed = io.BytesIO(gz.read())
            except IOError as original_ioerror:
                # There may be junk add the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                for i in range(1, 1024):
                    # (inner try: appears missing)
                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
                        uncompressed = io.BytesIO(gz.read())
                    # (except IOError: continue / break / else: appear missing)
                    raise original_ioerror
            resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = io.BytesIO(self.deflate(resp.read()))
            resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
        # https://github.com/rg3/youtube-dl/issues/6457).
        if 300 <= resp.code < 400:
            location = resp.headers.get('Location')
            # NOTE(review): the "if location:" guard appears missing here.
                # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
                if sys.version_info >= (3, 0):
                    location = location.encode('iso-8859-1').decode('utf-8')
                location_escaped = escape_url(location)
                if location != location_escaped:
                    del resp.headers['Location']
                    resp.headers['Location'] = location_escaped
        # NOTE(review): the final "return resp" appears missing here.

    https_request = http_request
    https_response = http_response
class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
    """HTTPS handler that supports an injected connection class / SSL context."""

    def __init__(self, params, https_conn_class=None, *args, **kwargs):
        compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
        self._https_conn_class = https_conn_class or compat_http_client.HTTPSConnection
        self._params = params

    def https_open(self, req):
        # NOTE(review): the "kwargs = {}" initialisation and the trailing
        # "req)" argument line appear to be missing from this copy.
        if hasattr(self, '_context'):  # python > 2.6
            kwargs['context'] = self._context
        if hasattr(self, '_check_hostname'):  # python 3.x
            kwargs['check_hostname'] = self._check_hostname
        return self.do_open(functools.partial(
            _create_http_connection, self, self._https_conn_class, True),
class YoutubeDLCookieProcessor(compat_urllib_request.HTTPCookieProcessor):
    """Cookie processor; see the (disabled) Set-Cookie escaping note below."""

    def __init__(self, cookiejar=None):
        compat_urllib_request.HTTPCookieProcessor.__init__(self, cookiejar)

    def http_response(self, request, response):
        # Python 2 will choke on next HTTP request in row if there are non-ASCII
        # characters in Set-Cookie HTTP header of last response (see
        # https://github.com/rg3/youtube-dl/issues/6769).
        # In order to at least prevent crashing we will percent encode Set-Cookie
        # header before HTTPCookieProcessor starts processing it.
        # if sys.version_info < (3, 0) and response.headers:
        #     for set_cookie_header in ('Set-Cookie', 'Set-Cookie2'):
        #         set_cookie = response.headers.get(set_cookie_header)
        #         set_cookie_escaped = compat_urllib_parse.quote(set_cookie, b"%/;:@&=+$,!~*'()?#[] ")
        #         if set_cookie != set_cookie_escaped:
        #             del response.headers[set_cookie_header]
        #             response.headers[set_cookie_header] = set_cookie_escaped
        return compat_urllib_request.HTTPCookieProcessor.http_response(self, request, response)

    https_request = compat_urllib_request.HTTPCookieProcessor.http_request
    https_response = http_response
def parse_iso8601(date_str, delimiter='T', timezone=None):
    """ Return a UNIX timestamp from the given date """
    # NOTE(review): the None guard, the fractional-seconds comment, the
    # "if timezone is None:" branch and the "m = re.search(" opener appear
    # to be missing from this copy.

    # Drop fractional seconds; strptime below has no %f slot for them.
    date_str = re.sub(r'\.[0-9]+', '', date_str)

        r'(?:Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
        # (match target / "if not m:" appear missing)
            timezone = datetime.timedelta()
            date_str = date_str[:-len(m.group(0))]
            if not m.group('sign'):
                timezone = datetime.timedelta()
                # (else: marker appears missing)
                sign = 1 if m.group('sign') == '+' else -1
                timezone = datetime.timedelta(
                    hours=sign * int(m.group('hours')),
                    minutes=sign * int(m.group('minutes')))

    date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
    dt = datetime.datetime.strptime(date_str, date_format) - timezone
    return calendar.timegm(dt.timetuple())
def unified_strdate(date_str, day_first=True):
    """Return a string with the date in the format YYYYMMDD"""
    # NOTE(review): the None guard, upload_date initialisation, most of the
    # format_expressions list and the day_first branch contents appear to be
    # missing from this copy.

    date_str = date_str.replace(',', ' ')
    # %z (UTC offset) is only supported in python>=3.2
    if not re.match(r'^[0-9]{1,2}-[0-9]{1,2}-[0-9]{4}$', date_str):
        date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str)
    # Remove AM/PM + timezone
    date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)

    format_expressions = [
        '%Y-%m-%d %H:%M:%S.%f',
        '%Y-%m-%dT%H:%M:%SZ',
        '%Y-%m-%dT%H:%M:%S.%fZ',
        '%Y-%m-%dT%H:%M:%S.%f0Z',
        '%Y-%m-%dT%H:%M:%S.%f',
    # (day_first branch — list contents appear missing)
        format_expressions.extend([
    # (non-day_first branch — list contents appear missing)
        format_expressions.extend([

    for expression in format_expressions:
        # (try: marker appears missing)
            upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
        # (except ValueError: pass appears missing)
    if upload_date is None:
        timetuple = email.utils.parsedate_tz(date_str)
        # ("if timetuple:" guard appears missing)
            upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
    if upload_date is not None:
        return compat_str(upload_date)
def determine_ext(url, default_ext='unknown_video'):
    """Guess a file extension from a URL."""
    # NOTE(review): the None guard, the "return guess" for the plain case
    # and the final default return appear to be missing from this copy.
    guess = url.partition('?')[0].rpartition('.')[2]
    if re.match(r'^[A-Za-z0-9]+$', guess):
    # Try extract ext from URLs like http://example.com/foo/bar.mp4/?download
    elif guess.rstrip('/') in KNOWN_EXTENSIONS:
        return guess.rstrip('/')
def subtitles_filename(filename, sub_lang, sub_format):
    """Derive the subtitle file name for a media file: base.<lang>.<format>."""
    base = filename.rsplit('.', 1)[0]
    return '.'.join((base, sub_lang, sub_format))
def date_from_str(date_str):
    """
    Return a datetime object from a string in the format YYYYMMDD or
    (now|today)[+-][0-9](day|week|month|year)(s)?"""
    today = datetime.date.today()
    if date_str in ('now', 'today'):
        # NOTE(review): the "return today" body appears missing here.
    if date_str == 'yesterday':
        return today - datetime.timedelta(days=1)
    match = re.match('(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
    if match is not None:
        sign = match.group('sign')
        time = int(match.group('time'))
        # NOTE(review): the sign negation lines appear missing here.
        unit = match.group('unit')
        # A bad approximation?
        # ("if unit == 'month': unit = 'day'; time *= 30" appears missing)
        elif unit == 'year':
            # (day-approximation lines appear missing)
        # timedelta needs the plural unit name ("unit += 's'" appears missing)
        delta = datetime.timedelta(**{unit: time})
        return today + delta
    return datetime.datetime.strptime(date_str, '%Y%m%d').date()
def hyphenate_date(date_str):
    """
    Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
    # NOTE(review): the "else: return date_str" fallback appears to be
    # missing from this copy.
    match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
    if match is not None:
        return '-'.join(match.groups())
class DateRange(object):
    """Represents a time interval between two dates"""

    def __init__(self, start=None, end=None):
        """start and end must be strings in the format accepted by date"""
        if start is not None:
            self.start = date_from_str(start)
        # NOTE(review): the "else:" markers for the open-ended defaults and
        # the "if end is not None:" guard appear missing from this copy.
            self.start = datetime.datetime.min.date()
            self.end = date_from_str(end)
            self.end = datetime.datetime.max.date()
        if self.start > self.end:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)

    # NOTE(review): the "@classmethod def day(cls, day):" header appears
    # missing above this docstring.
        """Returns a range that only contains the given day"""
        return cls(day, day)

    def __contains__(self, date):
        """Check if the date is in the range"""
        if not isinstance(date, datetime.date):
            date = date_from_str(date)
        return self.start <= date <= self.end

    # NOTE(review): the "def __str__(self):" header appears missing here.
        return '%s - %s' % (self.start.isoformat(), self.end.isoformat())
def platform_name():
    """ Returns the platform name as a compat_str """
    # NOTE(review): the final "return res" appears to be missing from this copy.
    res = platform.platform()
    if isinstance(res, bytes):
        res = res.decode(preferredencoding())

    assert isinstance(res, compat_str)
def _windows_write_string(s, out):
    """ Returns True if the string was written using special methods,
    False if it has yet to be written out."""
    # Adapted from http://stackoverflow.com/a/3259271/35070
    # NOTE(review): several statements (the ctypes import pair, the
    # WIN_OUTPUT_IDS dict, several return True/False lines, the while loop
    # header around the write calls) appear to be missing from this copy.

    import ctypes.wintypes

    # Map the stream's fileno to the matching STD_*_HANDLE id.
        fileno = out.fileno()
    except AttributeError:
        # If the output stream doesn't have a fileno, it's virtual
    except io.UnsupportedOperation:
        # Some strange Windows pseudo files?
    if fileno not in WIN_OUTPUT_IDS:

    GetStdHandle = ctypes.WINFUNCTYPE(
        ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
        (b'GetStdHandle', ctypes.windll.kernel32))
    h = GetStdHandle(WIN_OUTPUT_IDS[fileno])

    WriteConsoleW = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
        ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
        ctypes.wintypes.LPVOID)((b'WriteConsoleW', ctypes.windll.kernel32))
    written = ctypes.wintypes.DWORD(0)

    GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)((b'GetFileType', ctypes.windll.kernel32))
    FILE_TYPE_CHAR = 0x0002
    FILE_TYPE_REMOTE = 0x8000
    GetConsoleMode = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
        ctypes.POINTER(ctypes.wintypes.DWORD))(
        (b'GetConsoleMode', ctypes.windll.kernel32))
    INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value

    def not_a_console(handle):
        # A handle is "not a console" if invalid or if GetConsoleMode fails.
        if handle == INVALID_HANDLE_VALUE or handle is None:
        return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR or
                GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)

    if not_a_console(h):

    def next_nonbmp_pos(s):
        # Index of the first character outside the BMP (needs surrogates).
        # (the enclosing try: appears missing)
            return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
        except StopIteration:
            # (fallback "return len(s)" appears missing)

        count = min(next_nonbmp_pos(s), 1024)

        ret = WriteConsoleW(
            h, s, count if count else 2, ctypes.byref(written), None)
        # (the "if ret == 0:" guard appears missing)
            raise OSError('Failed to write string')
        if not count:  # We just wrote a non-BMP character
            assert written.value == 2
            # (the "s = s[1:]" advance / else: appear missing)
            assert written.value > 0
            s = s[written.value:]
def write_string(s, out=None, encoding=None):
    """Write a unicode string to `out`, handling Windows consoles and byte streams."""
    # NOTE(review): the default-out assignment, the return after the Windows
    # path, the byte-stream write/flush and the final fallback write appear
    # to be missing from this copy.
    assert type(s) == compat_str

    if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
        if _windows_write_string(s, out):

    if ('b' in getattr(out, 'mode', '') or
            sys.version_info[0] < 3):  # Python 2 lies about mode of sys.stderr
        byt = s.encode(encoding or preferredencoding(), 'ignore')
    elif hasattr(out, 'buffer'):
        enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
        byt = s.encode(enc, 'ignore')
        out.buffer.write(byt)
def bytes_to_intlist(bs):
    """Convert a byte sequence to a list of integer byte values.

    Works for both Python 3 (indexing bytes yields int) and Python 2
    (indexing str yields a 1-char str needing ord()).
    """
    # Defects fixed: the visible code crashed on empty input (bs[0]) and
    # applied ord() to ints on Python 3 (TypeError).
    if not bs:
        return []
    if isinstance(bs[0], int):  # Python 3
        return list(bs)
    else:
        return [ord(c) for c in bs]
def intlist_to_bytes(xs):
    """Pack a sequence of integer byte values (0-255) back into a bytes object."""
    fmt = '%dB' % len(xs)
    return struct_pack(fmt, *xs)
# Cross-platform file locking
# NOTE(review): several lines of this platform-dispatch region (the
# WIN_OUTPUT... / structure fields opener, try/except around the fcntl
# import, the else: markers) appear to be missing from this copy.
if sys.platform == 'win32':
    import ctypes.wintypes

    class OVERLAPPED(ctypes.Structure):
        # NOTE(review): the "_fields_ = [" opener appears missing here.
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),

    kernel32 = ctypes.windll.kernel32
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive):
        # Lock the whole file via LockFileEx (0x2 == exclusive lock flag).
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)
        handle = msvcrt.get_osfhandle(f.fileno())
        if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
                          whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Locking file failed: %r' % ctypes.FormatError())

    def _unlock_file(f):
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        if not UnlockFileEx(handle, 0,
                            whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())
# NOTE(review): an "else:" and "try: import fcntl" appear missing here.
    # Some platforms, such as Jython, is missing fcntl

        def _lock_file(f, exclusive):
            fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)

        def _unlock_file(f):
            fcntl.flock(f, fcntl.LOCK_UN)
    # NOTE(review): an "except ImportError:" marker appears missing here.
        UNSUPPORTED_MSG = 'file locking is not supported on this platform'

        def _lock_file(f, exclusive):
            raise IOError(UNSUPPORTED_MSG)

        def _unlock_file(f):
            raise IOError(UNSUPPORTED_MSG)
class locked_file(object):
    """File wrapper that holds an OS-level lock (via _lock_file /
    _unlock_file) while used as a context manager.

    NOTE(review): some lines of this class are not visible in this excerpt
    (e.g. where self.mode is assigned and what __enter__/__exit__ return);
    confirm against the full file.
    """

    def __init__(self, filename, mode, encoding=None):
        # Only read / append / write modes make sense for a lockable file.
        assert mode in ['r', 'a', 'w']
        self.f = io.open(filename, mode, encoding=encoding)

    def __enter__(self):
        # Writers take an exclusive lock; plain readers can share one.
        exclusive = self.mode != 'r'
        _lock_file(self.f, exclusive)

    def __exit__(self, etype, value, traceback):
        _unlock_file(self.f)

    def write(self, *args):
        # Delegate to the underlying file object.
        return self.f.write(*args)

    def read(self, *args):
        return self.f.read(*args)
def get_filesystem_encoding():
    """Return the filesystem encoding, falling back to UTF-8 when the
    interpreter reports none."""
    encoding = sys.getfilesystemencoding()
    if encoding is None:
        return 'utf-8'
    return encoding
def shell_quote(args):
    """Quote each argument for safe interpolation into a shell command line.

    NOTE(review): the accumulator initialisation and the `for a in args:`
    loop header are not visible in this excerpt — confirm against the
    full file.
    """
    encoding = get_filesystem_encoding()
    if isinstance(a, bytes):
        # We may get a filename encoded with 'encodeFilename'
        a = a.decode(encoding)
    quoted_args.append(pipes.quote(a))
    return ' '.join(quoted_args)
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use. """
    # The payload is JSON-encoded, then urlencoded under a reserved key
    # and appended as the URL fragment.
    payload = {'__youtubedl_smuggle': json.dumps(data)}
    sdata = compat_urllib_parse_urlencode(payload)
    return '%s#%s' % (url, sdata)
def unsmuggle_url(smug_url, default=None):
    """Extract data previously embedded by smuggle_url().

    NOTE(review): the final `return url, data` is not visible in this
    excerpt — confirm against the full file.
    """
    if '#__youtubedl_smuggle' not in smug_url:
        return smug_url, default
    url, _, sdata = smug_url.rpartition('#')
    # The smuggled payload is urlencoded JSON under this reserved key.
    jsond = compat_parse_qs(sdata)['__youtubedl_smuggle'][0]
    data = json.loads(jsond)
def format_bytes(bytes):
    # Format a byte count as a human-readable binary-unit string, e.g.
    # 1536 -> '1.50KiB'. NOTE: parameter shadows the builtin `bytes`
    # (kept as-is for interface stability).
    if type(bytes) is str:
        bytes = float(bytes)
    # NOTE(review): guards for None/zero input are not visible in this
    # excerpt (math.log(0) would raise here) — confirm against the full file.
    exponent = int(math.log(bytes, 1024.0))
    suffix = ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB'][exponent]
    converted = float(bytes) / float(1024 ** exponent)
    return '%.2f%s' % (converted, suffix)
def lookup_unit_table(unit_table, s):
    """Parse '<number> <unit>' from s using unit_table ({unit: multiplier})
    and return the scaled integer value.

    NOTE(review): the re.search/re.match call consuming this pattern and
    the no-match early return are not visible in this excerpt.
    """
    units_re = '|'.join(re.escape(u) for u in unit_table)
    r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)\b' % units_re, s)
    # European-style decimal comma is normalised to a dot before float().
    num_str = m.group('num').replace(',', '.')
    mult = unit_table[m.group('unit')]
    return int(float(num_str) * mult)
def parse_filesize(s):
    # Parse a human-readable file size ('5.5MiB', '1gb', ...) into bytes.
    # NOTE(review): the _UNIT_TABLE definition and the None-guard are not
    # visible in this excerpt.
    # The lower-case forms are of course incorrect and unofficial,
    # but we support those too
    return lookup_unit_table(_UNIT_TABLE, s)
    # NOTE(review): the lines below belong to a separate helper (parse_count
    # in the full file) whose `def` line is not visible in this excerpt.
    if re.match(r'^[\d,.]+$', s):
        return str_to_int(s)
    return lookup_unit_table(_UNIT_TABLE, s)
def month_by_name(name):
    """ Return the number of a month by (locale-independently) English name,
    or None when the name is not a full English month name. """
    try:
        return ENGLISH_MONTH_NAMES.index(name) + 1
    except ValueError:
        # Unknown month name: list.index raises instead of returning -1.
        return None
def month_by_abbreviation(abbrev):
    """ Return the number of a month by (locale-independently) English
        abbreviation (e.g. 'Jan' -> 1), or None when unknown. """
    try:
        return [s[:3] for s in ENGLISH_MONTH_NAMES].index(abbrev) + 1
    except ValueError:
        # Unknown abbreviation: list.index raises instead of returning -1.
        return None
def fix_xml_ampersands(xml_str):
    """Replace all unescaped '&' by '&amp;' in XML.

    Ampersands that already begin a recognised entity or a numeric
    character reference are left untouched.
    """
    return re.sub(
        r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
        '&amp;',
        xml_str)
def setproctitle(title):
    """Best-effort: set the process name via libc prctl (Linux only).

    NOTE(review): the try/except around LoadLibrary and the body of the
    Jython early-return branch are not visible in this excerpt.
    """
    assert isinstance(title, compat_str)
    # ctypes in Jython is not complete
    # http://bugs.jython.org/issue2148
    if sys.platform.startswith('java'):
    libc = ctypes.cdll.LoadLibrary('libc.so.6')
    title_bytes = title.encode('utf-8')
    buf = ctypes.create_string_buffer(len(title_bytes))
    buf.value = title_bytes
    # 15 is PR_SET_NAME on Linux.
    libc.prctl(15, buf, 0, 0, 0)
    except AttributeError:
        return  # Strange libc, just skip this
def remove_start(s, start):
    """Return s with the prefix `start` removed; s unchanged otherwise.

    The visible fragment fell through and returned None when the prefix
    was absent — restore the identity fall-through.
    """
    if s.startswith(start):
        return s[len(start):]
    return s
def remove_end(s, end):
    """Return s with the suffix `end` removed; s unchanged otherwise.

    The `end and` guard matters: for an empty suffix, s.endswith('') is
    True and s[:-0] == s[:0] would wrongly return ''.
    """
    if end and s.endswith(end):
        return s[:-len(end)]
    return s
def remove_quotes(s):
    """Strip one matching pair of surrounding single or double quotes.

    Returns s unchanged (including None) when there is no matching pair.
    """
    if s is None or len(s) < 2:
        return s
    for quote in ('"', "'", ):
        if s[0] == quote and s[-1] == quote:
            return s[1:-1]
    return s
def url_basename(url):
    """Return the last path component of a URL (query/fragment excluded)."""
    path = compat_urlparse.urlparse(url).path
    components = path.strip('/').split('/')
    return components[-1]
class HEADRequest(compat_urllib_request.Request):
    """A urllib Request that issues HEAD instead of GET/POST."""

    def get_method(self):
        # urllib chooses the HTTP verb via get_method(); force HEAD so
        # only headers are fetched. (The return was missing in the
        # visible fragment, leaving the method a no-op returning None.)
        return 'HEAD'
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    """Coerce v to int, applying invscale/scale, or return default.

    When get_attr is given, the named attribute of v is converted instead.
    Unconvertible values (None, bad strings) yield default rather than
    raising — the guards were missing in the visible fragment.
    """
    if get_attr:
        if v is not None:
            v = getattr(v, get_attr, None)
    if v is None:
        return default
    try:
        return int(v) * invscale // scale
    except (ValueError, TypeError):
        return default
def str_or_none(v, default=None):
    """Coerce v to a text string, or return default when v is None."""
    if v is None:
        return default
    return compat_str(v)
def str_to_int(int_str):
    """ A more relaxed version of int_or_none: strips thousands separators
    (commas/dots) and plus signs before converting; None passes through. """
    if int_str is None:
        return None
    # '1,234,567' / '1.234.567' / '+1234' all become plain digit runs.
    int_str = re.sub(r'[,\.\+]', '', int_str)
    return int(int_str)
def float_or_none(v, scale=1, invscale=1, default=None):
    """Coerce v to float, applying invscale/scale, or return default.

    None and unconvertible values yield default rather than raising —
    the guards were missing in the visible fragment.
    """
    if v is None:
        return default
    try:
        return float(v) * invscale / scale
    except (ValueError, TypeError):
        return default
def parse_duration(s):
    """Parse a human-readable duration ('1h30m', '01:30:00', '90 min', ...)
    into seconds, or None when unparsable.

    NOTE(review): the `m = re.match(r'''(?x)` opener/closer for the
    verbose pattern below and the res initialisation are not visible in
    this excerpt — confirm against the full file.
    """
    if not isinstance(s, compat_basestring):
    (?P<only_mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*|
    (?P<only_hours>[0-9.]+)\s*(?:hours?)|
    \s*(?P<hours_reversed>[0-9]+)\s*(?:[:h]|hours?)\s*(?P<mins_reversed>[0-9]+)\s*(?:[:m]|mins?\.?|minutes?)\s*|
    (?:(?P<days>[0-9]+)\s*(?:[:d]|days?)\s*)?
    (?P<hours>[0-9]+)\s*(?:[:h]|hours?)\s*
    (?P<mins>[0-9]+)\s*(?:[:m]|mins?|minutes?)\s*
    (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*(?:s|secs?|seconds?)?
    # Pure minutes-only / hours-only forms convert straight to seconds.
    if m.group('only_mins'):
        return float_or_none(m.group('only_mins'), invscale=60)
    if m.group('only_hours'):
        return float_or_none(m.group('only_hours'), invscale=60 * 60)
    # Otherwise accumulate days/hours/minutes/seconds into res.
    res += int(m.group('secs'))
    if m.group('mins_reversed'):
        res += int(m.group('mins_reversed')) * 60
    res += int(m.group('mins')) * 60
    if m.group('hours'):
        res += int(m.group('hours')) * 60 * 60
    if m.group('hours_reversed'):
        res += int(m.group('hours_reversed')) * 60 * 60
    res += int(m.group('days')) * 24 * 60 * 60
    res += float(m.group('ms'))
def prepend_extension(filename, ext, expected_real_ext=None):
    """Insert `ext` before the filename's real extension:
    'a.mp4' + 'temp' -> 'a.temp.mp4'.

    When expected_real_ext is given and does not match the actual
    extension, `ext` is appended instead ('a.mp4' -> 'a.mp4.temp').
    The `return (` opener was missing in the visible fragment.
    """
    name, real_ext = os.path.splitext(filename)
    return (
        '{0}.{1}{2}'.format(name, ext, real_ext)
        if not expected_real_ext or real_ext[1:] == expected_real_ext
        else '{0}.{1}'.format(filename, ext))
def replace_extension(filename, ext, expected_real_ext=None):
    """Replace the filename's extension with `ext`:
    'a.mp4' + 'webm' -> 'a.webm'.

    When expected_real_ext is given and does not match, `ext` is appended
    to the whole filename instead ('a.mp4' -> 'a.mp4.webm'). The trailing
    `ext)` argument was missing in the visible fragment.
    """
    name, real_ext = os.path.splitext(filename)
    return '{0}.{1}'.format(
        name if not expected_real_ext or real_ext[1:] == expected_real_ext else filename,
        ext)
def check_executable(exe, args=[]):
    """ Checks if the given binary is installed somewhere in PATH, and returns its name.
    args can be a list of arguments for a short output (like -version) """
    # NOTE: the mutable default is never mutated here; kept for interface
    # stability. Popen raises OSError when the binary cannot be found —
    # that handler was missing in the visible fragment.
    try:
        subprocess.Popen([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
    except OSError:
        return False
    return exe
def get_exe_version(exe, args=['--version'],
                    version_re=None, unrecognized='present'):
    """ Returns the version of the specified executable,
    or False if the executable is not present """
    # NOTE(review): the try/except OSError (returning False when the
    # binary is missing) around this Popen call is not visible in this
    # excerpt — confirm against the full file.
    out, _ = subprocess.Popen(
        [encodeArgument(exe)] + args,
        stdout=subprocess.PIPE, stderr=subprocess.STDOUT).communicate()
    if isinstance(out, bytes):  # Python 2.x
        out = out.decode('ascii', 'ignore')
    return detect_exe_version(out, version_re, unrecognized)
def detect_exe_version(output, version_re=None, unrecognized='present'):
    """Extract a version string from an executable's --version output.

    Returns the first group of version_re (default: 'version X.Y...'),
    or `unrecognized` when nothing matches. The match/return branches
    were missing in the visible fragment.
    """
    assert isinstance(output, compat_str)
    if version_re is None:
        version_re = r'version\s+([-0-9._a-zA-Z]+)'
    m = re.search(version_re, output)
    if m:
        return m.group(1)
    return unrecognized
class PagedList(object):
    # Abstract base for lazily-paged result lists; subclasses implement
    # getslice(). NOTE(review): the `def __len__(self):` header and the
    # abstract getslice() definition are not visible in this excerpt.
    # This is only useful for tests
    return len(self.getslice())
1642 class OnDemandPagedList(PagedList):
1643 def __init__(self, pagefunc, pagesize, use_cache=False):
1644 self._pagefunc = pagefunc
1645 self._pagesize = pagesize
1646 self._use_cache = use_cache
1650 def getslice(self, start=0, end=None):
1652 for pagenum in itertools.count(start // self._pagesize):
1653 firstid = pagenum * self._pagesize
1654 nextfirstid = pagenum * self._pagesize + self._pagesize
1655 if start >= nextfirstid:
1660 page_results = self._cache.get(pagenum)
1661 if page_results is None:
1662 page_results = list(self._pagefunc(pagenum))
1664 self._cache[pagenum] = page_results
1667 start % self._pagesize
1668 if firstid <= start < nextfirstid
1672 ((end - 1) % self._pagesize) + 1
1673 if (end is not None and firstid <= end <= nextfirstid)
1676 if startv != 0 or endv is not None:
1677 page_results = page_results[startv:endv]
1678 res.extend(page_results)
1680 # A little optimization - if current page is not "full", ie. does
1681 # not contain page_size videos then we can assume that this page
1682 # is the last one - there are no more ids on further pages -
1683 # i.e. no need to query again.
1684 if len(page_results) + startv < self._pagesize:
1687 # If we got the whole page, but the next page is not interesting,
1688 # break out early as well
1689 if end == nextfirstid:
1694 class InAdvancePagedList(PagedList):
1695 def __init__(self, pagefunc, pagecount, pagesize):
1696 self._pagefunc = pagefunc
1697 self._pagecount = pagecount
1698 self._pagesize = pagesize
1700 def getslice(self, start=0, end=None):
1702 start_page = start // self._pagesize
1704 self._pagecount if end is None else (end // self._pagesize + 1))
1705 skip_elems = start - start_page * self._pagesize
1706 only_more = None if end is None else end - start
1707 for pagenum in range(start_page, end_page):
1708 page = list(self._pagefunc(pagenum))
1710 page = page[skip_elems:]
1712 if only_more is not None:
1713 if len(page) < only_more:
1714 only_more -= len(page)
1716 page = page[:only_more]
def uppercase_escape(s):
    r"""Decode \UXXXXXXXX escape sequences embedded in s; other text is
    left untouched. The `return re.sub(` opener and the trailing `s)`
    argument were missing in the visible fragment.
    """
    unicode_escape = codecs.getdecoder('unicode_escape')
    return re.sub(
        r'\\U[0-9a-fA-F]{8}',
        lambda m: unicode_escape(m.group(0))[0],
        s)
def lowercase_escape(s):
    r"""Decode \uXXXX escape sequences embedded in s; other text is left
    untouched. The `return re.sub(` opener and the trailing `s)` argument
    were missing in the visible fragment.
    """
    unicode_escape = codecs.getdecoder('unicode_escape')
    return re.sub(
        r'\\u[0-9a-fA-F]{4}',
        lambda m: unicode_escape(m.group(0))[0],
        s)
def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986"""
    # Python 2 text strings must be encoded to UTF-8 bytes before quoting.
    needs_encoding = sys.version_info < (3, 0) and isinstance(s, compat_str)
    if needs_encoding:
        s = s.encode('utf-8')
    return compat_urllib_parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")
def escape_url(url):
    """Escape URL as suggested by RFC 3986"""
    url_parsed = compat_urllib_parse_urlparse(url)
    # Re-assemble the URL with each component percent-escaped; the host
    # is IDNA-encoded rather than percent-escaped.
    return url_parsed._replace(
        netloc=url_parsed.netloc.encode('idna').decode('ascii'),
        path=escape_rfc3986(url_parsed.path),
        params=escape_rfc3986(url_parsed.params),
        query=escape_rfc3986(url_parsed.query),
        fragment=escape_rfc3986(url_parsed.fragment)
        # NOTE(review): the closing `).geturl()` is not visible in this
        # excerpt — confirm against the full file.
# Feature-probe: does this interpreter's struct.pack accept a text
# (unicode) format string? NOTE(review): the try/except TypeError/else
# scaffolding around this probe is not visible in this excerpt.
struct.pack('!I', 0)
# In Python 2.6 and 2.7.x < 2.7.7, struct requires a bytes argument
# See https://bugs.python.org/issue19099
def struct_pack(spec, *args):
    # Encode the format string so old interpreters accept it.
    if isinstance(spec, compat_str):
        spec = spec.encode('ascii')
    return struct.pack(spec, *args)

def struct_unpack(spec, *args):
    if isinstance(spec, compat_str):
        spec = spec.encode('ascii')
    return struct.unpack(spec, *args)

# Modern interpreters: delegate straight to the stdlib functions.
struct_pack = struct.pack
struct_unpack = struct.unpack
def read_batch_urls(batch_fd):
    """Read a batch-file handle and return its non-comment, non-empty URLs.

    NOTE(review): the `def fixup(url):` header and the whitespace/empty-line
    handling are not visible in this excerpt — confirm against the full file.
    """
    if not isinstance(url, compat_str):
        url = url.decode('utf-8', 'replace')
    # Drop a UTF-8 byte-order mark that survived decoding.
    BOM_UTF8 = '\xef\xbb\xbf'
    if url.startswith(BOM_UTF8):
        url = url[len(BOM_UTF8):]
    # Lines starting with these characters are treated as comments.
    if url.startswith(('#', ';', ']')):
    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]
def urlencode_postdata(*args, **kargs):
    """URL-encode POST data and return it as ASCII bytes."""
    encoded = compat_urllib_parse_urlencode(*args, **kargs)
    return encoded.encode('ascii')
def update_url_query(url, query):
    """Return `url` with the key/value pairs in `query` merged into its
    query string (existing keys are overwritten).

    The `qs.update(query)` merge step was missing in the visible fragment,
    so the new parameters were never applied.
    """
    parsed_url = compat_urlparse.urlparse(url)
    qs = compat_parse_qs(parsed_url.query)
    qs.update(query)
    return compat_urlparse.urlunparse(parsed_url._replace(
        query=compat_urllib_parse_urlencode(qs, True)))
def dict_get(d, key_or_keys, default=None, skip_false_values=True):
    """Return the first usable value from d for key_or_keys.

    With a list/tuple of keys, returns the first value that is present,
    not None and (unless skip_false_values=False) truthy; otherwise
    default. With a single key, plain d.get(). The continue/return lines
    were missing in the visible fragment.
    """
    if isinstance(key_or_keys, (list, tuple)):
        for key in key_or_keys:
            if key not in d or d[key] is None or skip_false_values and not d[key]:
                continue
            return d[key]
        return default
    return d.get(key_or_keys, default)
def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
    # Return `string` unchanged if it is already text (compat_str);
    # otherwise decode it with the given encoding. NOTE: the default
    # encoding is computed once, at function-definition time.
    return string if isinstance(string, compat_str) else compat_str(string, encoding, errors)
def parse_age_limit(s):
    # Parse '18' / '18+' into an int age limit; otherwise fall back to the
    # US_RATINGS lookup (defined elsewhere in this file).
    # NOTE(review): a None-guard for s is not visible in this excerpt —
    # re.match(..., None) would raise TypeError here; confirm against the
    # full file.
    m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
    return int(m.group('age')) if m else US_RATINGS.get(s)
def strip_jsonp(code):
    """Strip a JSONP wrapper ('callback({...});') and return the bare JSON
    payload. The `return re.sub(` opener was missing in the visible
    fragment."""
    return re.sub(
        r'(?s)^[a-zA-Z0-9_.]+\s*\(\s*(.*)\);?\s*?(?://[^\n]*)*$', r'\1', code)
1839 def js_to_json(code):
1842 if v in ('true', 'false', 'null'):
1844 if v.startswith('"'):
1845 v = re.sub(r"\\'", "'", v[1:-1])
1846 elif v.startswith("'"):
1848 v = re.sub(r"\\\\|\\'|\"", lambda m: {
1855 res = re.sub(r'''(?x)
1856 "(?:[^"\\]*(?:\\\\|\\['"nu]))*[^"\\]*"|
1857 '(?:[^'\\]*(?:\\\\|\\['"nu]))*[^'\\]*'|
1858 [a-zA-Z_][.a-zA-Z_0-9]*
1860 res = re.sub(r',(\s*[\]}])', lambda m: m.group(1), res)
def qualities(quality_ids):
    """ Get a numeric quality value out of a list of possible values """
    # Returns a closure mapping a quality id to its index in quality_ids,
    # or -1 for unknown ids. The inner def and except branch were missing
    # in the visible fragment.
    def q(qid):
        try:
            return quality_ids.index(qid)
        except ValueError:
            return -1
    return q
1874 DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
def limit_length(s, length):
    """ Add ellipses to overly long strings """
    # None passes through; strings within the limit are returned as-is.
    # The guards and the ELLIPSES constant were missing in the visible
    # fragment.
    if s is None:
        return None
    ELLIPSES = '...'
    if len(s) > length:
        return s[:length - len(ELLIPSES)] + ELLIPSES
    return s
def version_tuple(v):
    """Split a dotted/dashed version string into a tuple of ints."""
    parts = re.split(r'[-.]', v)
    return tuple(map(int, parts))
def is_outdated_version(version, limit, assume_new=True):
    """Return True when `version` is older than `limit`.

    Empty/unparsable versions yield `not assume_new` (i.e. treated as new
    by default). The guard and try/except scaffolding were missing in the
    visible fragment.
    """
    if not version:
        return not assume_new
    try:
        return version_tuple(version) < version_tuple(limit)
    except ValueError:
        return not assume_new
def ytdl_is_updateable():
    """ Returns if youtube-dl can be updated with -U """
    # -U works only for the zipped distribution or a frozen (py2exe) build.
    from zipimport import zipimporter
    loader = globals().get('__loader__')
    if isinstance(loader, zipimporter):
        return True
    return hasattr(sys, 'frozen')
def args_to_str(args):
    # Get a short string representation for a subprocess command
    quoted = [shlex_quote(a) for a in args]
    return ' '.join(quoted)
def error_to_compat_str(err):
    """Stringify an exception safely across Python 2 and 3.

    NOTE(review): the initial `err_str = str(err)` and the final
    `return err_str` are not visible in this excerpt — confirm against
    the full file.
    """
    # On python 2 error byte string must be decoded with proper
    # encoding rather than ascii
    if sys.version_info[0] < 3:
        err_str = err_str.decode(preferredencoding())
def mimetype2ext(mt):
    """Map a MIME type to a file extension.

    NOTE(review): the None-guard, most of the subtype->extension table and
    the final return are not visible in this excerpt.
    """
    # Only the subtype (after the '/') selects the extension.
    _, _, res = mt.rpartition('/')
    'smptett+xml': 'tt',
    'x-mp4-fragmented': 'mp4',
def urlhandle_detect_ext(url_handle):
    """Guess a file extension for an open urllib response: prefer the
    Content-Disposition filename, else fall back to the Content-Type
    MIME type.

    NOTE(review): the `try:` opener and the `if cd:` / `if m:` / `if e:`
    guards around these lines are not visible in this excerpt.
    """
    getheader = lambda h: url_handle.headers[h]
    except AttributeError:  # Python < 3
        getheader = url_handle.info().getheader
    cd = getheader('Content-Disposition')
    m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
    e = determine_ext(m.group('filename'), default_ext=None)
    return mimetype2ext(getheader('Content-Type'))
def encode_data_uri(data, mime_type):
    """Build an RFC 2397 data: URI from raw bytes and a MIME type."""
    b64 = base64.b64encode(data).decode('ascii')
    return 'data:%s;base64,%s' % (mime_type, b64)
def age_restricted(content_limit, age_limit):
    """ Returns True iff the content should be blocked """
    if age_limit is None:  # No limit set
        # The viewer declared no age limit — never block.
        # (This return was missing in the visible fragment.)
        return False
    if content_limit is None:
        return False  # Content available for everyone
    return age_limit < content_limit
def is_html(first_bytes):
    """ Detect whether a file contains HTML by examining its first bytes. """
    # BOM -> encoding pairs, checked in order.
    # NOTE(review): the `BOMS = [` opener, the list closer and the
    # early-return/else scaffolding around the decode fallback are not
    # visible in this excerpt — confirm against the full file.
    (b'\xef\xbb\xbf', 'utf-8'),
    (b'\x00\x00\xfe\xff', 'utf-32-be'),
    (b'\xff\xfe\x00\x00', 'utf-32-le'),
    (b'\xff\xfe', 'utf-16-le'),
    (b'\xfe\xff', 'utf-16-be'),
    for bom, enc in BOMS:
        if first_bytes.startswith(bom):
            s = first_bytes[len(bom):].decode(enc, 'replace')
    # No BOM matched: assume UTF-8.
    s = first_bytes.decode('utf-8', 'replace')
    # HTML if the decoded text starts (after whitespace) with '<'.
    return re.match(r'^\s*<', s)
def determine_protocol(info_dict):
    """Infer the download protocol for an info dict (explicit 'protocol'
    field, else the URL's scheme/prefix).

    NOTE(review): the bodies of every branch below (the returns for
    rtmp/mms/rtsp and the ext-based m3u8/f4m handling) are not visible in
    this excerpt — confirm against the full file.
    """
    protocol = info_dict.get('protocol')
    if protocol is not None:
    url = info_dict['url']
    if url.startswith('rtmp'):
    elif url.startswith('mms'):
    elif url.startswith('rtsp'):
    ext = determine_ext(url)
    # Fallback: use the URL scheme itself (http, https, ftp, ...).
    return compat_urllib_parse_urlparse(url).scheme
def render_table(header_row, data):
    """ Render a list of rows, each as a list of values """
    rows = [header_row] + data
    # The widest cell in each column determines that column's width.
    widths = [max(len(compat_str(cell)) for cell in col) for col in zip(*rows)]
    # Left-align every column but let the last one run unpadded.
    fmt = ' '.join('%-' + compat_str(w + 1) + 's' for w in widths[:-1]) + '%s'
    return '\n'.join(fmt % tuple(row) for row in rows)
2025 def _match_one(filter_part, dct):
2026 COMPARISON_OPERATORS = {
2034 operator_rex = re.compile(r'''(?x)\s*
2036 \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
2038 (?P<intval>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)|
2039 (?P<strval>(?![0-9.])[a-z0-9A-Z]*)
2042 ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
2043 m = operator_rex.search(filter_part)
2045 op = COMPARISON_OPERATORS[m.group('op')]
2046 if m.group('strval') is not None:
2047 if m.group('op') not in ('=', '!='):
2049 'Operator %s does not support string values!' % m.group('op'))
2050 comparison_value = m.group('strval')
2053 comparison_value = int(m.group('intval'))
2055 comparison_value = parse_filesize(m.group('intval'))
2056 if comparison_value is None:
2057 comparison_value = parse_filesize(m.group('intval') + 'B')
2058 if comparison_value is None:
2060 'Invalid integer value %r in filter part %r' % (
2061 m.group('intval'), filter_part))
2062 actual_value = dct.get(m.group('key'))
2063 if actual_value is None:
2064 return m.group('none_inclusive')
2065 return op(actual_value, comparison_value)
2068 '': lambda v: v is not None,
2069 '!': lambda v: v is None,
2071 operator_rex = re.compile(r'''(?x)\s*
2072 (?P<op>%s)\s*(?P<key>[a-z_]+)
2074 ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
2075 m = operator_rex.search(filter_part)
2077 op = UNARY_OPERATORS[m.group('op')]
2078 actual_value = dct.get(m.group('key'))
2079 return op(actual_value)
2081 raise ValueError('Invalid filter part %r' % filter_part)
def match_str(filter_str, dct):
    """ Filter a dictionary with a simple string syntax. Returns True (=passes filter) or false """
    # '&'-separated parts must all match. The `return all(` opener was
    # missing in the visible fragment.
    return all(
        _match_one(filter_part, dct) for filter_part in filter_str.split('&'))
def match_filter_func(filter_str):
    """Build a match-filter callback: returns None when the info dict
    passes `filter_str`, else a human-readable skip message.

    The `return None` / `return _match_func` lines were missing in the
    visible fragment.
    """
    def _match_func(info_dict):
        if match_str(filter_str, info_dict):
            return None
        else:
            video_title = info_dict.get('title', info_dict.get('id', 'video'))
            return '%s does not pass filter %s, skipping ..' % (video_title, filter_str)
    return _match_func
def parse_dfxp_time_expr(time_expr):
    """Parse a DFXP/TTML time expression ('12.3s' or 'HH:MM:SS.mmm') into
    seconds; returns None for empty/unparsable input.

    The empty-input guard and the `if mobj:` branches were missing in the
    visible fragment.
    """
    if not time_expr:
        return

    mobj = re.match(r'^(?P<time_offset>\d+(?:\.\d+)?)s?$', time_expr)
    if mobj:
        return float(mobj.group('time_offset'))

    mobj = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
    if mobj:
        # A ':' before the fraction (frame-style) is treated as a decimal
        # point.
        return 3600 * int(mobj.group(1)) + 60 * int(mobj.group(2)) + float(mobj.group(3).replace(':', '.'))
def srt_subtitles_timecode(seconds):
    """Format a second count as an SRT timecode: HH:MM:SS,mmm."""
    hours = seconds / 3600
    mins = (seconds % 3600) / 60
    secs = seconds % 60
    msecs = (seconds % 1) * 1000
    # %d truncates each float component toward zero.
    return '%02d:%02d:%02d,%03d' % (hours, mins, secs, msecs)
2118 def dfxp2srt(dfxp_data):
2119 _x = functools.partial(xpath_with_ns, ns_map={
2120 'ttml': 'http://www.w3.org/ns/ttml',
2121 'ttaf1': 'http://www.w3.org/2006/10/ttaf1',
2124 class TTMLPElementParser(object):
2127 def start(self, tag, attrib):
2128 if tag in (_x('ttml:br'), _x('ttaf1:br'), 'br'):
2134 def data(self, data):
2138 return self.out.strip()
2140 def parse_node(node):
2141 target = TTMLPElementParser()
2142 parser = xml.etree.ElementTree.XMLParser(target=target)
2143 parser.feed(xml.etree.ElementTree.tostring(node))
2144 return parser.close()
2146 dfxp = compat_etree_fromstring(dfxp_data.encode('utf-8'))
2148 paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall(_x('.//ttaf1:p')) or dfxp.findall('.//p')
2151 raise ValueError('Invalid dfxp/TTML subtitle')
2153 for para, index in zip(paras, itertools.count(1)):
2154 begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
2155 end_time = parse_dfxp_time_expr(para.attrib.get('end'))
2156 dur = parse_dfxp_time_expr(para.attrib.get('dur'))
2157 if begin_time is None:
2162 end_time = begin_time + dur
2163 out.append('%d\n%s --> %s\n%s\n\n' % (
2165 srt_subtitles_timecode(begin_time),
2166 srt_subtitles_timecode(end_time),
def cli_option(params, command_option, param):
    """Build ['--opt', value] from params[param], or [] when unset."""
    value = params.get(param)
    if value is None:
        return []
    return [command_option, value]
def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
    """Build a boolean CLI option from params[param].

    With a separator, emits one '--opt<sep>value' token; otherwise two
    tokens ['--opt', value]. The `if separator:` guard was missing in the
    visible fragment, which left the two return statements conflicting.
    """
    param = params.get(param)
    assert isinstance(param, bool)
    if separator:
        return [command_option + separator + (true_value if param else false_value)]
    return [command_option, true_value if param else false_value]
def cli_valueless_option(params, command_option, param, expected_value=True):
    """Emit [command_option] when params[param] equals expected_value,
    else []."""
    if params.get(param) == expected_value:
        return [command_option]
    return []
def cli_configuration_args(params, param, default=[]):
    """Return the extra-args list stored under params[param], or `default`
    when unset.

    The None-guard return and the final return were missing in the visible
    fragment. NOTE: the mutable default is returned, never mutated here;
    kept for interface stability.
    """
    ex_args = params.get(param)
    if ex_args is None:
        return default
    assert isinstance(ex_args, list)
    return ex_args
2198 class ISO639Utils(object):
2199 # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
2388 def short2long(cls, code):
2389 """Convert language code from ISO 639-1 to ISO 639-2/T"""
2390 return cls._lang_map.get(code[:2])
2393 def long2short(cls, code):
2394 """Convert language code from ISO 639-2/T to ISO 639-1"""
2395 for short_name, long_name in cls._lang_map.items():
2396 if long_name == code:
2400 class ISO3166Utils(object):
2401 # From http://data.okfn.org/data/core/country-list
2403 'AF': 'Afghanistan',
2404 'AX': 'Ã…land Islands',
2407 'AS': 'American Samoa',
2412 'AG': 'Antigua and Barbuda',
2429 'BO': 'Bolivia, Plurinational State of',
2430 'BQ': 'Bonaire, Sint Eustatius and Saba',
2431 'BA': 'Bosnia and Herzegovina',
2433 'BV': 'Bouvet Island',
2435 'IO': 'British Indian Ocean Territory',
2436 'BN': 'Brunei Darussalam',
2438 'BF': 'Burkina Faso',
2444 'KY': 'Cayman Islands',
2445 'CF': 'Central African Republic',
2449 'CX': 'Christmas Island',
2450 'CC': 'Cocos (Keeling) Islands',
2454 'CD': 'Congo, the Democratic Republic of the',
2455 'CK': 'Cook Islands',
2457 'CI': 'Côte d\'Ivoire',
2462 'CZ': 'Czech Republic',
2466 'DO': 'Dominican Republic',
2469 'SV': 'El Salvador',
2470 'GQ': 'Equatorial Guinea',
2474 'FK': 'Falkland Islands (Malvinas)',
2475 'FO': 'Faroe Islands',
2479 'GF': 'French Guiana',
2480 'PF': 'French Polynesia',
2481 'TF': 'French Southern Territories',
2496 'GW': 'Guinea-Bissau',
2499 'HM': 'Heard Island and McDonald Islands',
2500 'VA': 'Holy See (Vatican City State)',
2507 'IR': 'Iran, Islamic Republic of',
2510 'IM': 'Isle of Man',
2520 'KP': 'Korea, Democratic People\'s Republic of',
2521 'KR': 'Korea, Republic of',
2524 'LA': 'Lao People\'s Democratic Republic',
2530 'LI': 'Liechtenstein',
2534 'MK': 'Macedonia, the Former Yugoslav Republic of',
2541 'MH': 'Marshall Islands',
2547 'FM': 'Micronesia, Federated States of',
2548 'MD': 'Moldova, Republic of',
2559 'NL': 'Netherlands',
2560 'NC': 'New Caledonia',
2561 'NZ': 'New Zealand',
2566 'NF': 'Norfolk Island',
2567 'MP': 'Northern Mariana Islands',
2572 'PS': 'Palestine, State of',
2574 'PG': 'Papua New Guinea',
2577 'PH': 'Philippines',
2581 'PR': 'Puerto Rico',
2585 'RU': 'Russian Federation',
2587 'BL': 'Saint Barthélemy',
2588 'SH': 'Saint Helena, Ascension and Tristan da Cunha',
2589 'KN': 'Saint Kitts and Nevis',
2590 'LC': 'Saint Lucia',
2591 'MF': 'Saint Martin (French part)',
2592 'PM': 'Saint Pierre and Miquelon',
2593 'VC': 'Saint Vincent and the Grenadines',
2596 'ST': 'Sao Tome and Principe',
2597 'SA': 'Saudi Arabia',
2601 'SL': 'Sierra Leone',
2603 'SX': 'Sint Maarten (Dutch part)',
2606 'SB': 'Solomon Islands',
2608 'ZA': 'South Africa',
2609 'GS': 'South Georgia and the South Sandwich Islands',
2610 'SS': 'South Sudan',
2615 'SJ': 'Svalbard and Jan Mayen',
2618 'CH': 'Switzerland',
2619 'SY': 'Syrian Arab Republic',
2620 'TW': 'Taiwan, Province of China',
2622 'TZ': 'Tanzania, United Republic of',
2624 'TL': 'Timor-Leste',
2628 'TT': 'Trinidad and Tobago',
2631 'TM': 'Turkmenistan',
2632 'TC': 'Turks and Caicos Islands',
2636 'AE': 'United Arab Emirates',
2637 'GB': 'United Kingdom',
2638 'US': 'United States',
2639 'UM': 'United States Minor Outlying Islands',
2643 'VE': 'Venezuela, Bolivarian Republic of',
2645 'VG': 'Virgin Islands, British',
2646 'VI': 'Virgin Islands, U.S.',
2647 'WF': 'Wallis and Futuna',
2648 'EH': 'Western Sahara',
2655 def short2full(cls, code):
2656 """Convert an ISO 3166-2 country code to the corresponding full name"""
2657 return cls._country_map.get(code.upper())
2660 class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
2661 def __init__(self, proxies=None):
2662 # Set default handlers
2663 for type in ('http', 'https'):
2664 setattr(self, '%s_open' % type,
2665 lambda r, proxy='__noproxy__', type=type, meth=self.proxy_open:
2666 meth(r, proxy, type))
2667 return compat_urllib_request.ProxyHandler.__init__(self, proxies)
2669 def proxy_open(self, req, proxy, type):
2670 req_proxy = req.headers.get('Ytdl-request-proxy')
2671 if req_proxy is not None:
2673 del req.headers['Ytdl-request-proxy']
2675 if proxy == '__noproxy__':
2676 return None # No Proxy
2677 return compat_urllib_request.ProxyHandler.proxy_open(
2678 self, req, proxy, type)
def ohdave_rsa_encrypt(data, exponent, modulus):
    """
    Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/

    Input:
        data: data to encrypt, bytes-like object
        exponent, modulus: parameter e and N of RSA algorithm, both integer
    Output: hex string of encrypted data

    Limitation: supports one block encryption only
    """
    # (The docstring's triple-quote delimiters were missing in the visible
    # fragment, leaving bare prose lines in the function body.)
    # Interpret the reversed bytes as a little-endian big integer.
    payload = int(binascii.hexlify(data[::-1]), 16)
    encrypted = pow(payload, exponent, modulus)
    return '%x' % encrypted
def encode_base_n(num, n, table=None):
    """Encode a non-negative integer in base `n` using `table` as the
    digit alphabet (default: 0-9a-zA-Z truncated to n symbols).

    The table default, zero case, conversion loop and return were missing
    in the visible fragment.
    """
    FULL_TABLE = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
    if not table:
        table = FULL_TABLE[:n]

    if n > len(table):
        raise ValueError('base %d exceeds table length %d' % (n, len(table)))

    if num == 0:
        return table[0]

    ret = ''
    while num:
        # Prepend digits least-significant first.
        ret = table[num % n] + ret
        num = num // n
    return ret
2716 def decode_packed_codes(code):
2718 r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)",
2720 obfucasted_code, base, count, symbols = mobj.groups()
2723 symbols = symbols.split('|')
2728 base_n_count = encode_base_n(count, base)
2729 symbol_table[base_n_count] = symbols[count] or base_n_count
2732 r'\b(\w+)\b', lambda mobj: symbol_table[mobj.group(0)],