2 # -*- coding: utf-8 -*-
4 from __future__ import unicode_literals
32 import xml.etree.ElementTree
41 compat_socket_create_connection,
45 compat_urllib_parse_urlparse,
46 compat_urllib_request,
# Type object of a compiled regular expression; the re module exposes no
# public name for it.
# This is not clearly defined otherwise
compiled_regex_type = type(re.compile(''))

# Default HTTP headers advertised on every request (entries of std_headers).
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20150101 Firefox/20.0 (Chrome)',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',

# Locale-independent English month names, used by the date helpers below.
ENGLISH_MONTH_NAMES = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December']
# NOTE(review): thin wrapper over locale.getpreferredencoding(); the
# error-handling tail of this function is not visible in this chunk.
def preferredencoding():
    """Get preferred encoding.
    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks.
        pref = locale.getpreferredencoding()
def write_json_file(obj, fn):
    """ Encode obj as JSON and write it to fn, atomically if possible """
    # Normalize the destination name first so temp-file naming matches it.
    fn = encodeFilename(fn)
    if sys.version_info < (3, 0) and sys.platform != 'win32':
        encoding = get_filesystem_encoding()
        # os.path.basename returns a bytes object, but NamedTemporaryFile
        # will fail if the filename contains non ascii characters unless we
        # use a unicode object
        # NOTE(review): both lambdas ignore their argument f and close over
        # fn; they are only ever called with fn below, so behavior matches.
        path_basename = lambda f: os.path.basename(fn).decode(encoding)
        # the same for os.path.dirname
        path_dirname = lambda f: os.path.dirname(fn).decode(encoding)
        path_basename = os.path.basename
        path_dirname = os.path.dirname
        # Temp file lives next to fn so the final os.rename cannot cross
        # filesystem boundaries.
        'prefix': path_basename(fn) + '.',
        'dir': path_dirname(fn),
    # In Python 2.x, json.dump expects a bytestream.
    # In Python 3.x, it writes to a character stream
    if sys.version_info < (3, 0):
    tf = tempfile.NamedTemporaryFile(**args)
        if sys.platform == 'win32':
            # Need to remove existing file on Windows, else os.rename raises
            # WindowsError or FileExistsError.
        # Atomic replace of the destination with the fully-written temp file.
        os.rename(tf.name, fn)
if sys.version_info >= (2, 7):
    def find_xpath_attr(node, xpath, key, val):
        """ Find the xpath xpath[@key=val] """
        # Only simple attribute names/values are accepted, so a literal
        # XPath predicate can be assembled without escaping concerns.
        assert re.match(r'^[a-zA-Z-]+$', key)
        assert re.match(r'^[a-zA-Z0-9@\s:._-]*$', val)
        return node.find("%s[@%s='%s']" % (xpath, key, val))
    # Python 2.6 fallback: ElementTree there lacks attribute predicates,
    # so matching is done by manual iteration.
    def find_xpath_attr(node, xpath, key, val):
        # Here comes the crazy part: In 2.6, if the xpath is a unicode,
        # .//node does not match if a node is a direct child of . !
        if isinstance(xpath, compat_str):
            xpath = xpath.encode('ascii')

        # Linear scan over all matches; first element with key == val wins.
        for f in node.findall(xpath):
            if f.attrib.get(key) == val:
# On python2.6 the xml.etree.ElementTree.Element methods don't support
# the namespace parameter
def xpath_with_ns(path, ns_map):
    """Expand 'ns:tag' steps of an XPath into '{uri}tag' using ns_map."""
    components = [c.split(':') for c in path.split('/')]
            # Step without a namespace prefix: keep as-is.
            replaced.append(c[0])
            # Prefixed step: substitute the Clark-notation namespace URI.
            replaced.append('{%s}%s' % (ns_map[ns], tag))
    return '/'.join(replaced)
def xpath_text(node, xpath, name=None, fatal=False):
    """Return the text of the node at xpath; raise if fatal and missing."""
    if sys.version_info < (2, 7):  # Crazy 2.6
        xpath = xpath.encode('ascii')

    if n is None or n.text is None:
            # Prefer the caller-supplied display name in the error message.
            name = xpath if name is None else name
            raise ExtractorError('Could not find XML element %s' % name)
def get_element_by_id(id, html):
    """Look up the tag carrying the given ID and return its content."""
    # Delegate to the generic attribute-based extractor.
    return get_element_by_attribute("id", id, html)
def get_element_by_attribute(attribute, value, html):
    """Return the content of the tag with the specified attribute in the passed HTML document"""

    # Verbose regex; the attribute/value are escaped and substituted below.
    m = re.search(r'''(?xs)
        (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
        (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
    ''' % (re.escape(attribute), re.escape(value)), html)
    res = m.group('content')

    # Strip one level of surrounding quotes, if present.
    if res.startswith('"') or res.startswith("'"):
    # Decode HTML entities before handing the text back.
    return unescapeHTML(res)
def clean_html(html):
    """Clean an HTML snippet into a readable string"""
    if html is None:  # Convenience for sanitizing descriptions etc.

    # Collapse literal newlines, then turn <br> and paragraph breaks into \n.
    html = html.replace('\n', ' ')
    html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
    html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
    # Strip any remaining tags.
    html = re.sub('<.*?>', '', html)
    # Replace html entities
    html = unescapeHTML(html)
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    It returns the tuple (stream, definitive_file_name).
            # '-' means stdout; switch it to binary mode on Windows.
            if sys.platform == 'win32':
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
            return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
        stream = open(encodeFilename(filename), open_mode)
        return (stream, filename)
    except (IOError, OSError) as err:
        # Permission errors cannot be fixed by renaming; re-raise.
        if err.errno in (errno.EACCES,):
        # In case of error, try to remove win32 forbidden chars
        alt_filename = sanitize_path(filename)
        if alt_filename == filename:
            # An exception here should be caught in the caller
            stream = open(encodeFilename(alt_filename), open_mode)
            return (stream, alt_filename)
def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp"""
    # email.utils handles the RFC 2822 parsing, including the TZ offset.
    timetuple = email.utils.parsedate_tz(timestr)
    if timetuple is not None:
        timestamp = email.utils.mktime_tz(timetuple)
def sanitize_filename(s, restricted=False, is_id=False):
    """Sanitizes a string so it could be used as part of a filename.
    If restricted is set, use a stricter subset of allowed characters.
    Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
    # Per-character replacement policy; applied over the whole string below.
    def replace_insane(char):
        if char == '?' or ord(char) < 32 or ord(char) == 127:
            return '' if restricted else '\''
            return '_-' if restricted else ' -'
        elif char in '\\/|*<>':
        if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
        # Restricted mode keeps filenames pure ASCII.
        if restricted and ord(char) > 127:

    # Keep timestamps like 12:34:56 readable by mapping ':' to '_' first.
    s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)
    result = ''.join(map(replace_insane, s))
        # Collapse runs of underscores introduced by the substitutions.
        while '__' in result:
            result = result.replace('__', '_')
        result = result.strip('_')
        # Common case of "Foreign band name - English song title"
        if restricted and result.startswith('-_'):
        if result.startswith('-'):
            result = '_' + result[len('-'):]
        result = result.lstrip('.')
def sanitize_path(s):
    """Sanitizes and normalizes path on Windows"""
    # Non-Windows platforms need no path character sanitizing.
    if sys.platform != 'win32':
    drive, _ = os.path.splitdrive(s)
    # NOTE(review): os.path.splitunc is Windows-only and deprecated in
    # later Python versions; fine here since this branch is win32-only.
    unc, _ = os.path.splitunc(s)
    unc_or_drive = unc or drive
    norm_path = os.path.normpath(remove_start(s, unc_or_drive)).split(os.path.sep)
        # Replace characters Windows forbids in path components with '#'.
        path_part if path_part in ['.', '..'] else re.sub('(?:[/<>:"\\|\\\\?\\*]|\.$)', '#', path_part)
        for path_part in norm_path]
        sanitized_path.insert(0, unc_or_drive + os.path.sep)
    return os.path.join(*sanitized_path)
def sanitize_url_path_consecutive_slashes(url):
    """Collapses consecutive slashes in URLs' path"""
    parts = list(compat_urlparse.urlparse(url))
    # Index 2 of the 6-tuple is the path component.
    parts[2] = re.sub(r'/{2,}', '/', parts[2])
    return compat_urlparse.urlunparse(parts)
# Deduplicates while preserving first-seen order (body elided in this view).
def orderedSet(iterable):
    """ Remove all duplicates from the input iterable """
def _htmlentity_transform(entity):
    """Transforms an HTML entity to a character."""
    # Known non-numeric HTML entity
    if entity in compat_html_entities.name2codepoint:
        return compat_chr(compat_html_entities.name2codepoint[entity])

    # Numeric character reference: decimal (#123) or hex (#x7B).
    mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
        numstr = mobj.group(1)
        if numstr.startswith('x'):
            # Prefix '0' so int() sees a valid '0x...' hex literal.
            numstr = '0%s' % numstr
        return compat_chr(int(numstr, base))

    # Unknown entity in name, return its literal representation
    return ('&%s;' % entity)
    # Input must already be a text (unicode) string.
    assert type(s) == compat_str
        r'&([^;]+);', lambda m: _htmlentity_transform(m.group(1)), s)
def encodeFilename(s, for_subprocess=False):
    @param s The name of the file
    assert type(s) == compat_str

    # Python 3 has a Unicode API
    if sys.version_info >= (3, 0):

    if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
        # Pass '' directly to use Unicode APIs on Windows 2000 and up
        # (Detecting Windows NT 4 is tricky because 'major >= 4' would
        # match Windows 9x series as well. Besides, NT 4 is obsolete.)
        if not for_subprocess:
            # For subprocess calls, encode with locale encoding
            # Refer to http://stackoverflow.com/a/9951851/35070
            encoding = preferredencoding()
        encoding = sys.getfilesystemencoding()
    # 'ignore' drops characters the target encoding cannot represent.
    return s.encode(encoding, 'ignore')
def encodeArgument(s):
    """Encode a subprocess argument the same way file names are encoded."""
    if isinstance(s, compat_str):
        arg = s
    else:
        # Legacy code that uses byte strings
        # Uncomment the following line after fixing all post processors
        # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
        arg = s.decode('ascii')
    return encodeFilename(arg, True)
def decodeOption(optval):
    """Decode a command-line option value to a text string."""
    if isinstance(optval, bytes):
        optval = optval.decode(preferredencoding())

    assert isinstance(optval, compat_str)
def formatSeconds(secs):
        # One hour or more: H:MM:SS
        return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
        # Under an hour: M:SS
        return '%d:%02d' % (secs // 60, secs % 60)
def make_HTTPS_handler(params, **kwargs):
    """Build a YoutubeDLHTTPSHandler honoring the nocheckcertificate option."""
    opts_no_check_certificate = params.get('nocheckcertificate', False)
    if hasattr(ssl, 'create_default_context'):  # Python >= 3.4 or 2.7.9
        context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH)
        if opts_no_check_certificate:
            # Disable both hostname and certificate verification.
            context.check_hostname = False
            context.verify_mode = ssl.CERT_NONE
            return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
            # (create_default_context present but HTTPSHandler has no context=)

    if sys.version_info < (3, 2):
        return YoutubeDLHTTPSHandler(params, **kwargs)
        # Older Python with SSLContext: configure verification manually.
        context = ssl.SSLContext(ssl.PROTOCOL_TLSv1)
        context.verify_mode = (ssl.CERT_NONE
                               if opts_no_check_certificate
                               else ssl.CERT_REQUIRED)
        context.set_default_verify_paths()
        return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
class ExtractorError(Exception):
    """Error during info extraction."""

    def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
        # Network-level failures are always "expected" (not a youtube-dl bug).
        if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
        if video_id is not None:
            msg = video_id + ': ' + msg
            msg += ' (caused by %r)' % cause
            # Unexpected errors get a bug-report footer appended.
            if ytdl_is_updateable():
                update_cmd = 'type youtube-dl -U to update'
                update_cmd = 'see https://yt-dl.org/update on how to update'
            msg += '; please report this issue on https://yt-dl.org/bug .'
            msg += ' Make sure you are using the latest version; %s.' % update_cmd
            msg += ' Be sure to call youtube-dl with the --verbose flag and include its complete output.'
        super(ExtractorError, self).__init__(msg)

        self.exc_info = sys.exc_info()  # preserve original exception
        self.video_id = video_id

    def format_traceback(self):
        # Render the stored traceback, if any, as a printable string.
        if self.traceback is None:
        return ''.join(traceback.format_tb(self.traceback))
class UnsupportedError(ExtractorError):
    """Signals that none of the extractors can handle the given URL."""

    def __init__(self, url):
        message = 'Unsupported URL: %s' % url
        # expected=True: an unsupported site is not a bug in youtube-dl.
        super(UnsupportedError, self).__init__(message, expected=True)
# Raised by the extraction helpers when a mandatory regex fails to match.
class RegexNotFoundError(ExtractorError):
    """Error when a regex didn't match"""
class DownloadError(Exception):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        super(DownloadError, self).__init__(msg)
        # Kept so callers can inspect/report the root cause.
        self.exc_info = exc_info
class SameFileError(Exception):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
class PostProcessingError(Exception):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    def __init__(self, msg):
# Control-flow exception: stops further downloads once the limit is hit.
class MaxDownloadsReached(Exception):
    """ --max-downloads limit has been reached. """
class UnavailableVideoError(Exception):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
class ContentTooShortError(Exception):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    def __init__(self, downloaded, expected):
        # Byte counts: what was actually received vs. what was announced.
        self.downloaded = downloaded
        self.expected = expected
def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
    """Instantiate http_class, honoring the 'source_address' option."""
    hc = http_class(*args, **kwargs)
    source_address = ydl_handler._params.get('source_address')
    if source_address is not None:
        # Port 0 lets the OS pick an ephemeral local port.
        sa = (source_address, 0)
        if hasattr(hc, 'source_address'):  # Python 2.7+
            hc.source_address = sa
            # Fallback: patch connect() to bind the source address manually.
            def _hc_connect(self, *args, **kwargs):
                sock = compat_socket_create_connection(
                    (self.host, self.port), self.timeout, sa)
                    self.sock = ssl.wrap_socket(
                        sock, self.key_file, self.cert_file,
                        ssl_version=ssl.PROTOCOL_TLSv1)
            hc.connect = functools.partial(_hc_connect, hc)
class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-No-Compression", which will be
    removed before making the real request.
    Part of this code was copied from:
    http://techknack.net/python-urllib2-handlers/
    Andrew Rowls, the author of that code, agreed to release it to the
    def __init__(self, params, *args, **kwargs):
        compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
        self._params = params

    def http_open(self, req):
        # Route through _create_http_connection so source_address applies.
        return self.do_open(functools.partial(
            _create_http_connection, self, compat_http_client.HTTPConnection, False),
            # Raw deflate stream (no zlib header); -MAX_WBITS disables it.
            return zlib.decompress(data, -zlib.MAX_WBITS)
            return zlib.decompress(data)

    def addinfourl_wrapper(stream, headers, url, code):
        # Newer urllib addinfourl takes the code directly.
        if hasattr(compat_urllib_request.addinfourl, 'getcode'):
            return compat_urllib_request.addinfourl(stream, headers, url, code)
        ret = compat_urllib_request.addinfourl(stream, headers, url)

    def http_request(self, req):
        for h, v in std_headers.items():
            # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
            # The dict keys are capitalized because of this bug by urllib
            if h.capitalize() not in req.headers:
        if 'Youtubedl-no-compression' in req.headers:
            # Marker header: strip Accept-encoding and the marker itself.
            if 'Accept-encoding' in req.headers:
                del req.headers['Accept-encoding']
            del req.headers['Youtubedl-no-compression']

        if sys.version_info < (2, 7) and '#' in req.get_full_url():
            # Python 2.6 is brain-dead when it comes to fragments
            req._Request__original = req._Request__original.partition('#')[0]
            req._Request__r_type = req._Request__r_type.partition('#')[0]

    def http_response(self, req, resp):
        # Transparently decompress gzip/deflate response bodies.
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
                uncompressed = io.BytesIO(gz.read())
            except IOError as original_ioerror:
                # There may be junk add the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                for i in range(1, 1024):
                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
                        uncompressed = io.BytesIO(gz.read())
                    raise original_ioerror
            resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = io.BytesIO(self.deflate(resp.read()))
            resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg

    # HTTPS requests/responses get the same treatment.
    https_request = http_request
    https_response = http_response
class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
    """HTTPS handler that supports a custom connection class and SSL context."""

    def __init__(self, params, https_conn_class=None, *args, **kwargs):
        compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
        self._https_conn_class = https_conn_class or compat_http_client.HTTPSConnection
        self._params = params

    def https_open(self, req):
        # Forward context/check_hostname only where the base class has them.
        if hasattr(self, '_context'):  # python > 2.6
            kwargs['context'] = self._context
        if hasattr(self, '_check_hostname'):  # python 3.x
            kwargs['check_hostname'] = self._check_hostname
        return self.do_open(functools.partial(
            _create_http_connection, self, self._https_conn_class, True),
def parse_iso8601(date_str, delimiter='T', timezone=None):
    """ Return a UNIX timestamp from the given date """
            r'(\.[0-9]+)?(?:Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
            # No timezone suffix found: treat as UTC.
            timezone = datetime.timedelta()
            # Strip the matched suffix before strptime parsing below.
            date_str = date_str[:-len(m.group(0))]
            if not m.group('sign'):
                timezone = datetime.timedelta()
                sign = 1 if m.group('sign') == '+' else -1
                timezone = datetime.timedelta(
                    hours=sign * int(m.group('hours')),
                    minutes=sign * int(m.group('minutes')))
    date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
    # Subtracting the offset converts the local time to UTC.
    dt = datetime.datetime.strptime(date_str, date_format) - timezone
    return calendar.timegm(dt.timetuple())
def unified_strdate(date_str, day_first=True):
    """Return a string with the date in the format YYYYMMDD"""
    # Commas are noise for all the formats tried below.
    date_str = date_str.replace(',', ' ')
    # %z (UTC offset) is only supported in python>=3.2
    date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str)
    # Remove AM/PM + timezone
    date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)

    # Candidate formats tried in order until one parses.
    format_expressions = [
        '%b %dst %Y %I:%M%p',
        '%b %dnd %Y %I:%M%p',
        '%b %dth %Y %I:%M%p',
        '%Y-%m-%d %H:%M:%S.%f',
        '%Y-%m-%dT%H:%M:%SZ',
        '%Y-%m-%dT%H:%M:%S.%fZ',
        '%Y-%m-%dT%H:%M:%S.%f0Z',
        '%Y-%m-%dT%H:%M:%S.%f',
        # day_first decides between DD.MM and MM/DD style formats.
        format_expressions.extend([
        format_expressions.extend([
    for expression in format_expressions:
            upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
    if upload_date is None:
        # Last resort: RFC 2822 parsing via the email utilities.
        timetuple = email.utils.parsedate_tz(date_str)
            upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
def determine_ext(url, default_ext='unknown_video'):
    """Guess a file extension from the URL path, or return default_ext."""
    # Drop the query string, then take whatever follows the last dot.
    guess = url.partition('?')[0].rpartition('.')[2]
    if re.match(r'^[A-Za-z0-9]+$', guess):
def subtitles_filename(filename, sub_lang, sub_format):
    """Build the subtitle file name for a media file: base.<lang>.<format>."""
    base = filename.rsplit('.', 1)[0]
    return '.'.join((base, sub_lang, sub_format))
def date_from_str(date_str):
    """
    Return a datetime object from a string in the format YYYYMMDD or
    (now|today)[+-][0-9](day|week|month|year)(s)?"""
    today = datetime.date.today()
    if date_str in ('now', 'today'):
    if date_str == 'yesterday':
        return today - datetime.timedelta(days=1)
    # Relative dates like 'now-2weeks'.
    match = re.match('(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
    if match is not None:
        sign = match.group('sign')
        time = int(match.group('time'))
        unit = match.group('unit')
        # A bad aproximation?
        delta = datetime.timedelta(**{unit: time})
    # Fall through: absolute YYYYMMDD date.
    return datetime.datetime.strptime(date_str, "%Y%m%d").date()
def hyphenate_date(date_str):
    Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
    match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
    if match is not None:
        return '-'.join(match.groups())
class DateRange(object):
    """Represents a time interval between two dates"""

    def __init__(self, start=None, end=None):
        """start and end must be strings in the format accepted by date"""
        if start is not None:
            self.start = date_from_str(start)
            # Open-ended start: earliest representable date.
            self.start = datetime.datetime.min.date()
            self.end = date_from_str(end)
            # Open-ended end: latest representable date.
            self.end = datetime.datetime.max.date()
        if self.start > self.end:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)
        """Returns a range that only contains the given day"""

    def __contains__(self, date):
        """Check if the date is in the range"""
        if not isinstance(date, datetime.date):
            date = date_from_str(date)
        # Inclusive on both ends.
        return self.start <= date <= self.end
        return '%s - %s' % (self.start.isoformat(), self.end.isoformat())
    """ Returns the platform name as a compat_str """
    res = platform.platform()
    # On some Python 2 setups platform.platform() returns bytes; decode.
    if isinstance(res, bytes):
        res = res.decode(preferredencoding())

    assert isinstance(res, compat_str)
def _windows_write_string(s, out):
    """ Returns True if the string was written using special methods,
    False if it has yet to be written out."""
    # Adapted from http://stackoverflow.com/a/3259271/35070

    import ctypes.wintypes

    try:
        fileno = out.fileno()
    except AttributeError:
        # If the output stream doesn't have a fileno, it's virtual
    except io.UnsupportedOperation:
        # Some strange Windows pseudo files?
    if fileno not in WIN_OUTPUT_IDS:

    # Resolve kernel32 entry points via WINFUNCTYPE prototypes.
    GetStdHandle = ctypes.WINFUNCTYPE(
        ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
        (b"GetStdHandle", ctypes.windll.kernel32))
    h = GetStdHandle(WIN_OUTPUT_IDS[fileno])

    WriteConsoleW = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
        ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
        ctypes.wintypes.LPVOID)((b"WriteConsoleW", ctypes.windll.kernel32))
    written = ctypes.wintypes.DWORD(0)

    GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)((b"GetFileType", ctypes.windll.kernel32))
    FILE_TYPE_CHAR = 0x0002
    FILE_TYPE_REMOTE = 0x8000
    GetConsoleMode = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
        ctypes.POINTER(ctypes.wintypes.DWORD))(
        (b"GetConsoleMode", ctypes.windll.kernel32))
    INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value

    def not_a_console(handle):
        # A real console is a local character device with a console mode.
        if handle == INVALID_HANDLE_VALUE or handle is None:
        return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR or
                GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)

    def next_nonbmp_pos(s):
        # WriteConsoleW counts UTF-16 units; non-BMP chars need 2 units.
            return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
        except StopIteration:

        count = min(next_nonbmp_pos(s), 1024)

            h, s, count if count else 2, ctypes.byref(written), None)
            raise OSError('Failed to write string')
        if not count:  # We just wrote a non-BMP character
            assert written.value == 2
            assert written.value > 0
            s = s[written.value:]
def write_string(s, out=None, encoding=None):
    assert type(s) == compat_str

    # On Windows consoles, prefer the Unicode console API when possible.
    if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
        if _windows_write_string(s, out):

    if ('b' in getattr(out, 'mode', '') or
            sys.version_info[0] < 3):  # Python 2 lies about mode of sys.stderr
        byt = s.encode(encoding or preferredencoding(), 'ignore')
    elif hasattr(out, 'buffer'):
        # Text stream with an underlying binary buffer (Python 3).
        enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
        byt = s.encode(enc, 'ignore')
        out.buffer.write(byt)
def bytes_to_intlist(bs):
    """Convert a bytes/str byte sequence to a list of ints (0-255)."""
    if isinstance(bs[0], int):  # Python 3
        # Python 2 str: each element is a 1-char string.
        return [ord(c) for c in bs]
def intlist_to_bytes(xs):
    """Pack a list of ints (0-255) back into a bytes object."""
    return struct_pack('%dB' % len(xs), *xs)
# Cross-platform file locking
if sys.platform == 'win32':
    import ctypes.wintypes

    # Win32 OVERLAPPED struct required by LockFileEx/UnlockFileEx.
    class OVERLAPPED(ctypes.Structure):
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),

    kernel32 = ctypes.windll.kernel32
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    # Lock the whole file: low/high dwords of the largest lockable range.
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive):
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        # Keep the OVERLAPPED pointer alive for the matching unlock call.
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)
        handle = msvcrt.get_osfhandle(f.fileno())
        # 0x2 = LOCKFILE_EXCLUSIVE_LOCK
        if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
                          whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Locking file failed: %r' % ctypes.FormatError())

    def _unlock_file(f):
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        if not UnlockFileEx(handle, 0,
                            whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())

    # POSIX: flock covers both the shared and exclusive cases directly.
    def _lock_file(f, exclusive):
        fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)

    def _unlock_file(f):
        fcntl.flock(f, fcntl.LOCK_UN)
class locked_file(object):
    """File wrapper that holds a cross-platform lock while open."""

    def __init__(self, filename, mode, encoding=None):
        assert mode in ['r', 'a', 'w']
        self.f = io.open(filename, mode, encoding=encoding)

    def __enter__(self):
        # Readers take a shared lock; writers/appenders take exclusive.
        exclusive = self.mode != 'r'
            _lock_file(self.f, exclusive)

    def __exit__(self, etype, value, traceback):
            _unlock_file(self.f)

    def write(self, *args):
        return self.f.write(*args)

    def read(self, *args):
        return self.f.read(*args)
def get_filesystem_encoding():
    """Return sys.getfilesystemencoding(), falling back to 'utf-8' if unset."""
    enc = sys.getfilesystemencoding()
    if enc is None:
        return 'utf-8'
    return enc
def shell_quote(args):
    """Quote a list of arguments for display as a shell command line."""
    encoding = get_filesystem_encoding()
        if isinstance(a, bytes):
            # We may get a filename encoded with 'encodeFilename'
            a = a.decode(encoding)
        quoted_args.append(pipes.quote(a))
    return ' '.join(quoted_args)
def takewhile_inclusive(pred, seq):
    """ Like itertools.takewhile, but include the latest evaluated element
    (the first element so that Not pred(e)) """
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use. """

    # Encode the payload as a URL fragment; unsmuggle_url reverses this.
    fragment = compat_urllib_parse.urlencode(
        {'__youtubedl_smuggle': json.dumps(data)})
    return url + '#' + fragment
def unsmuggle_url(smug_url, default=None):
    """Split a smuggled URL into (url, data); data defaults to default."""
    if '#__youtubedl_smuggle' not in smug_url:
        return smug_url, default
    url, _, sdata = smug_url.rpartition('#')
    jsond = compat_parse_qs(sdata)['__youtubedl_smuggle'][0]
    data = json.loads(jsond)
def format_bytes(bytes):
    """Render a byte count as a human-readable string like '1.23MiB'."""
    if type(bytes) is str:
        bytes = float(bytes)
        # Pick the largest power-of-1024 unit that fits.
        exponent = int(math.log(bytes, 1024.0))
    suffix = ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB'][exponent]
    converted = float(bytes) / float(1024 ** exponent)
    return '%.2f%s' % (converted, suffix)
def parse_filesize(s):
    """Parse a human file size ('5 MB', '1.2GiB', ...) into a byte count."""
    # The lower-case forms are of course incorrect and inofficial,
    # but we support those too
    units_re = '|'.join(re.escape(u) for u in _UNIT_TABLE)
        r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)' % units_re, s)
    # Accept ',' as a decimal separator as well.
    num_str = m.group('num').replace(',', '.')
    mult = _UNIT_TABLE[m.group('unit')]
    return int(float(num_str) * mult)
def month_by_name(name):
    """ Return the number of a month by (locale-independently) English name """
        # Index is zero-based; month numbers start at 1.
        return ENGLISH_MONTH_NAMES.index(name) + 1
def month_by_abbreviation(abbrev):
    """ Return the number of a month by (locale-independently) English
        # Compare against the first three letters of each month name.
        return [s[:3] for s in ENGLISH_MONTH_NAMES].index(abbrev) + 1
def fix_xml_ampersands(xml_str):
    """Replace all the '&' by '&amp;' in XML"""
        # Negative lookahead keeps already-escaped entities untouched.
        r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
def setproctitle(title):
    """Best-effort: set the process title via libc prctl (Linux only)."""
    assert isinstance(title, compat_str)
        libc = ctypes.cdll.LoadLibrary("libc.so.6")
    title_bytes = title.encode('utf-8')
    buf = ctypes.create_string_buffer(len(title_bytes))
    buf.value = title_bytes
        # 15 == PR_SET_NAME
        libc.prctl(15, buf, 0, 0, 0)
    except AttributeError:
        return  # Strange libc, just skip this
def remove_start(s, start):
    """Strip the prefix start from s, if present."""
    if s.startswith(start):
        return s[len(start):]
def remove_end(s, end):
    """Strip the suffix end from s, if present."""
        return s[:-len(end)]
def url_basename(url):
    """Return the final path segment of url (query/fragment excluded)."""
    parsed_path = compat_urlparse.urlparse(url).path
    segments = parsed_path.strip('/').split('/')
    return segments[-1]
# Request subclass forcing the HEAD method (body elided in this view).
class HEADRequest(compat_urllib_request.Request):
    def get_method(self):
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    """Convert v to int scaled by invscale/scale; default when v is None."""
            # Optionally dereference an attribute of v first.
            v = getattr(v, get_attr, None)
    return default if v is None else (int(v) * invscale // scale)
def str_or_none(v, default=None):
    """Convert v to compat_str, or return default when v is None."""
    if v is None:
        return default
    return compat_str(v)
def str_to_int(int_str):
    """ A more relaxed version of int_or_none """
    # Drop thousands separators and stray '+' signs before conversion.
    int_str = re.sub(r'[,\.\+]', '', int_str)
def float_or_none(v, scale=1, invscale=1, default=None):
    """Convert v to float scaled by invscale/scale; default when v is None."""
    if v is None:
        return default
    return float(v) * invscale / scale
def parse_duration(s):
    """Parse a duration string ('1:23', '2h30m', '90 min', ...) to seconds."""
    if not isinstance(s, compat_basestring):
        (?P<only_mins>[0-9.]+)\s*(?:mins?|minutes?)\s*|
        (?P<only_hours>[0-9.]+)\s*(?:hours?)|
        \s*(?P<hours_reversed>[0-9]+)\s*(?:[:h]|hours?)\s*(?P<mins_reversed>[0-9]+)\s*(?:[:m]|mins?|minutes?)\s*|
        (?:(?P<days>[0-9]+)\s*(?:[:d]|days?)\s*)?
        (?P<hours>[0-9]+)\s*(?:[:h]|hours?)\s*
        (?P<mins>[0-9]+)\s*(?:[:m]|mins?|minutes?)\s*
        (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*(?:s|secs?|seconds?)?
    # Single-unit forms return directly; composite forms accumulate below.
    if m.group('only_mins'):
        return float_or_none(m.group('only_mins'), invscale=60)
    if m.group('only_hours'):
        return float_or_none(m.group('only_hours'), invscale=60 * 60)
        res += int(m.group('secs'))
    if m.group('mins_reversed'):
        res += int(m.group('mins_reversed')) * 60
        res += int(m.group('mins')) * 60
    if m.group('hours'):
        res += int(m.group('hours')) * 60 * 60
    if m.group('hours_reversed'):
        res += int(m.group('hours_reversed')) * 60 * 60
        res += int(m.group('days')) * 24 * 60 * 60
        # Fractional seconds, if present.
        res += float(m.group('ms'))
def prepend_extension(filename, ext):
    """Insert ext before the final extension: 'a.mp4' -> 'a.<ext>.mp4'."""
    stem, last_ext = os.path.splitext(filename)
    return '%s.%s%s' % (stem, ext, last_ext)
def check_executable(exe, args=[]):
    """ Checks if the given binary is installed somewhere in PATH, and returns its name.
    args can be a list of arguments for a short output (like -version) """
        # Spawning the binary succeeds only if it exists on PATH.
        subprocess.Popen([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
def get_exe_version(exe, args=['--version'],
                    version_re=None, unrecognized='present'):
    """ Returns the version of the specified executable,
    or False if the executable is not present """
        out, _ = subprocess.Popen(
            stdout=subprocess.PIPE, stderr=subprocess.STDOUT).communicate()
    if isinstance(out, bytes):  # Python 2.x
        out = out.decode('ascii', 'ignore')
    # Delegate the actual pattern matching to detect_exe_version.
    return detect_exe_version(out, version_re, unrecognized)
def detect_exe_version(output, version_re=None, unrecognized='present'):
    """Pull a version string out of `--version`-style *output*.

    Falls back to *unrecognized* when no version can be matched.
    """
    assert isinstance(output, compat_str)
    pattern = version_re if version_re is not None else r'version\s+([-0-9._a-zA-Z]+)'
    match = re.search(pattern, output)
    return match.group(1) if match else unrecognized
class PagedList(object):
    """Abstract base for lazily paged lists; subclasses provide getslice()."""

    def __len__(self):
        # This is only useful for tests
        return len(self.getslice())
class OnDemandPagedList(PagedList):
    """Paged list that fetches pages on demand via pagefunc(pagenum)."""

    def __init__(self, pagefunc, pagesize):
        self._pagefunc = pagefunc
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        res = []
        for pagenum in itertools.count(start // self._pagesize):
            firstid = pagenum * self._pagesize
            nextfirstid = pagenum * self._pagesize + self._pagesize
            if start >= nextfirstid:
                continue

            page_results = list(self._pagefunc(pagenum))

            startv = (
                start % self._pagesize
                if firstid <= start < nextfirstid
                else 0)
            endv = (
                ((end - 1) % self._pagesize) + 1
                if (end is not None and firstid <= end <= nextfirstid)
                else None)

            if startv != 0 or endv is not None:
                page_results = page_results[startv:endv]
            res.extend(page_results)

            # A page shorter than pagesize means there is nothing beyond it,
            # so there is no need to query further pages.
            if len(page_results) + startv < self._pagesize:
                break
            # If we got the whole page but the next page starts at `end`,
            # it cannot contribute anything — stop early.
            if end == nextfirstid:
                break
        return res
class InAdvancePagedList(PagedList):
    """Paged list whose total page count is known in advance."""

    def __init__(self, pagefunc, pagecount, pagesize):
        self._pagefunc = pagefunc
        self._pagecount = pagecount
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        res = []
        start_page = start // self._pagesize
        end_page = (
            self._pagecount if end is None else (end // self._pagesize + 1))
        skip_elems = start - start_page * self._pagesize
        only_more = None if end is None else end - start
        for pagenum in range(start_page, end_page):
            page = list(self._pagefunc(pagenum))
            if skip_elems:
                # Only the first fetched page needs its leading items dropped.
                page = page[skip_elems:]
                skip_elems = None
            if only_more is not None:
                if len(page) < only_more:
                    only_more -= len(page)
                else:
                    page = page[:only_more]
                    res.extend(page)
                    break
            res.extend(page)
        return res
def uppercase_escape(s):
    """Decode uppercase \\UXXXXXXXX escapes in *s* into their characters."""
    decode_escape = codecs.getdecoder('unicode_escape')
    return re.sub(
        r'\\U[0-9a-fA-F]{8}',
        lambda m: decode_escape(m.group(0))[0],
        s)
def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986"""
    # Python 2's quote() mishandles unicode input, so pre-encode to UTF-8 there.
    if isinstance(s, compat_str) and sys.version_info < (3, 0):
        s = s.encode('utf-8')
    return compat_urllib_parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")
def escape_url(url):
    """Escape URL as suggested by RFC 3986"""
    parts = compat_urllib_parse_urlparse(url)
    escaped = parts._replace(
        path=escape_rfc3986(parts.path),
        params=escape_rfc3986(parts.params),
        query=escape_rfc3986(parts.query),
        fragment=escape_rfc3986(parts.fragment))
    return escaped.geturl()
try:
    struct.pack('!I', 0)
except TypeError:
    # In Python 2.6 (and some 2.7 versions), struct requires a bytes format
    # spec, so wrap pack/unpack to encode unicode specs to ASCII first.
    def struct_pack(spec, *args):
        if isinstance(spec, compat_str):
            spec = spec.encode('ascii')
        return struct.pack(spec, *args)

    def struct_unpack(spec, *args):
        if isinstance(spec, compat_str):
            spec = spec.encode('ascii')
        return struct.unpack(spec, *args)
else:
    # Modern interpreters accept str specs directly — use struct as-is.
    struct_pack = struct.pack
    struct_unpack = struct.unpack
def read_batch_urls(batch_fd):
    """Read a batch file object and return its list of cleaned-up URLs.

    Comment lines (starting with '#', ';' or ']') and blanks are dropped;
    a leading UTF-8 BOM is stripped. Closes *batch_fd* when done.
    """
    def fixup(url):
        if not isinstance(url, compat_str):
            url = url.decode('utf-8', 'replace')
        BOM_UTF8 = '\xef\xbb\xbf'
        if url.startswith(BOM_UTF8):
            url = url[len(BOM_UTF8):]
        url = url.strip()
        if url.startswith(('#', ';', ']')):
            return False
        return url

    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]
def urlencode_postdata(*args, **kargs):
    """urlencode the given arguments and return ASCII bytes for a POST body."""
    encoded = compat_urllib_parse.urlencode(*args, **kargs)
    return encoded.encode('ascii')
try:
    etree_iter = xml.etree.ElementTree.Element.iter
except AttributeError:  # Python <=2.6 has no Element.iter
    def etree_iter(n):
        return n.findall('.//*')
def parse_xml(s):
    """Parse the XML string *s*, ignoring any DOCTYPE declaration."""
    class TreeBuilder(xml.etree.ElementTree.TreeBuilder):
        def doctype(self, name, pubid, system):
            pass  # Ignore doctypes

    parser = xml.etree.ElementTree.XMLParser(target=TreeBuilder())
    # The `parser` kwarg only exists on 2.7+.
    kwargs = {'parser': parser} if sys.version_info >= (2, 7) else {}
    tree = xml.etree.ElementTree.XML(s.encode('utf-8'), **kwargs)
    # Fix up XML parser in Python 2.x: force text nodes to unicode.
    if sys.version_info < (3, 0):
        for n in etree_iter(tree):
            if n.text is not None:
                if not isinstance(n.text, compat_str):
                    n.text = n.text.decode('utf-8')
    return tree
def parse_age_limit(s):
    """Parse an age limit like '18' or '18+' into an int.

    Non-numeric strings fall back to the US_RATINGS lookup; None input
    (or an unknown rating) yields None.
    """
    if s is None:
        return None
    m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
    if m:
        return int(m.group('age'))
    return US_RATINGS.get(s, None)
def strip_jsonp(code):
    """Strip a JSONP wrapper ('callback({...});') down to the bare JSON payload."""
    return re.sub(
        r'(?s)^[a-zA-Z0-9_]+\s*\(\s*(.*)\);?\s*?(?://[^\n]*)*$', r'\1', code)
def js_to_json(code):
    """Convert a JavaScript-ish literal (single quotes, bare keys,
    trailing commas in arrays) into valid JSON text."""
    def fix_kv(m):
        v = m.group(0)
        if v in ('true', 'false', 'null'):
            return v
        if v.startswith('"'):
            return v
        if v.startswith("'"):
            # Re-quote single-quoted strings, fixing up escapes.
            v = v[1:-1]
            v = re.sub(r"\\\\|\\'|\"", lambda m: {
                '\\\\': '\\\\',
                "\\'": "'",
                '"': '\\"',
            }[m.group(0)], v)
        return '"%s"' % v

    res = re.sub(r'''(?x)
        "(?:[^"\\]*(?:\\\\|\\['"nu]))*[^"\\]*"|
        '(?:[^'\\]*(?:\\\\|\\['"nu]))*[^'\\]*'|
        [a-zA-Z_][.a-zA-Z_0-9]*
        ''', fix_kv, code)
    # Drop trailing commas before closing brackets.
    res = re.sub(r',(\s*\])', lambda m: m.group(1), res)
    return res
def qualities(quality_ids):
    """ Get a numeric quality value out of a list of possible values """
    def lookup(qid):
        # The position in the preference list is the score;
        # unknown ids rank below everything.
        try:
            return quality_ids.index(qid)
        except ValueError:
            return -1
    return lookup
# Default output filename template: "<title>-<id>.<ext>".
DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
def limit_length(s, length):
    """ Add ellipses to overly long strings """
    if s is None:
        return None
    ELLIPSES = '...'
    if len(s) <= length:
        return s
    return s[:length - len(ELLIPSES)] + ELLIPSES
def version_tuple(v):
    """Split a version string like '2015.01.23-1' on '.'/'-' into an int tuple."""
    return tuple(int(part) for part in re.split(r'[-.]', v))
def is_outdated_version(version, limit, assume_new=True):
    """True if *version* is strictly older than *limit*.

    When *version* is empty or unparsable, answer `not assume_new`.
    """
    unknown = not assume_new
    if not version:
        return unknown
    try:
        return version_tuple(version) < version_tuple(limit)
    except ValueError:
        return unknown
def ytdl_is_updateable():
    """ Returns if youtube-dl can be updated with -U """
    from zipimport import zipimporter

    running_from_zip = isinstance(globals().get('__loader__'), zipimporter)
    return running_from_zip or hasattr(sys, 'frozen')
def args_to_str(args):
    # Get a short string representation for a subprocess command
    return ' '.join(map(shlex_quote, args))
def mimetype2ext(mt):
    """Map a MIME type to a file extension (subtype is the default)."""
    _, _, subtype = mt.rpartition('/')

    # NOTE(review): the original mapping may have contained further entries
    # lost in this copy — verify against upstream before relying on it.
    return {
        'x-mp4-fragmented': 'mp4',
    }.get(subtype, subtype)
def urlhandle_detect_ext(url_handle):
    """Guess the file extension of a urllib response.

    Prefers the filename from Content-Disposition, falling back to the
    Content-Type mimetype.
    """
    try:
        url_handle.headers
        getheader = lambda h: url_handle.headers[h]
    except AttributeError:  # Python < 3
        getheader = url_handle.info().getheader

    cd = getheader('Content-Disposition')
    if cd:
        m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
        if m:
            e = determine_ext(m.group('filename'), default_ext=None)
            if e:
                return e

    return mimetype2ext(getheader('Content-Type'))
def age_restricted(content_limit, age_limit):
    """ Returns True iff the content should be blocked """
    if age_limit is None or content_limit is None:
        # No viewer limit set, or the content is available for everyone.
        return False
    return age_limit < content_limit
def is_html(first_bytes):
    """ Detect whether a file contains HTML by examining its first bytes. """
    bom_encodings = [
        (b'\xef\xbb\xbf', 'utf-8'),
        (b'\x00\x00\xfe\xff', 'utf-32-be'),
        (b'\xff\xfe\x00\x00', 'utf-32-le'),
        (b'\xff\xfe', 'utf-16-le'),
        (b'\xfe\xff', 'utf-16-be'),
    ]
    decoded = None
    # Longer BOMs are listed before their prefixes (utf-32 before utf-16).
    for bom, encoding in bom_encodings:
        if first_bytes.startswith(bom):
            decoded = first_bytes[len(bom):].decode(encoding, 'replace')
            break
    if decoded is None:
        decoded = first_bytes.decode('utf-8', 'replace')

    return re.match(r'^\s*<', decoded)
def determine_protocol(info_dict):
    """Derive the download protocol for *info_dict*.

    An explicit 'protocol' entry wins; otherwise the URL prefix,
    extension, and finally the URL scheme decide.
    """
    protocol = info_dict.get('protocol')
    if protocol is not None:
        return protocol

    url = info_dict['url']
    for prefix in ('rtmp', 'mms', 'rtsp'):
        if url.startswith(prefix):
            return prefix

    ext = determine_ext(url)
    if ext == 'm3u8':
        return 'm3u8'
    if ext == 'f4m':
        return 'f4m'

    return compat_urllib_parse_urlparse(url).scheme
def render_table(header_row, data):
    """ Render a list of rows, each as a list of values """
    rows = [header_row] + data
    widths = [max(len(compat_str(cell)) for cell in column) for column in zip(*rows)]
    # Left-pad every column but the last to its widest cell plus one space.
    fmt = ' '.join('%-' + compat_str(width + 1) + 's' for width in widths[:-1]) + '%s'
    return '\n'.join(fmt % tuple(row) for row in rows)
1719 def _match_one(filter_part, dct):
1720 COMPARISON_OPERATORS = {
1728 operator_rex = re.compile(r'''(?x)\s*
1730 \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
1732 (?P<intval>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)|
1733 (?P<strval>(?![0-9.])[a-z0-9A-Z]*)
1736 ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
1737 m = operator_rex.search(filter_part)
1739 op = COMPARISON_OPERATORS[m.group('op')]
1740 if m.group('strval') is not None:
1741 if m.group('op') not in ('=', '!='):
1743 'Operator %s does not support string values!' % m.group('op'))
1744 comparison_value = m.group('strval')
1747 comparison_value = int(m.group('intval'))
1749 comparison_value = parse_filesize(m.group('intval'))
1750 if comparison_value is None:
1751 comparison_value = parse_filesize(m.group('intval') + 'B')
1752 if comparison_value is None:
1754 'Invalid integer value %r in filter part %r' % (
1755 m.group('intval'), filter_part))
1756 actual_value = dct.get(m.group('key'))
1757 if actual_value is None:
1758 return m.group('none_inclusive')
1759 return op(actual_value, comparison_value)
1762 '': lambda v: v is not None,
1763 '!': lambda v: v is None,
1765 operator_rex = re.compile(r'''(?x)\s*
1766 (?P<op>%s)\s*(?P<key>[a-z_]+)
1768 ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
1769 m = operator_rex.search(filter_part)
1771 op = UNARY_OPERATORS[m.group('op')]
1772 actual_value = dct.get(m.group('key'))
1773 return op(actual_value)
1775 raise ValueError('Invalid filter part %r' % filter_part)
def match_str(filter_str, dct):
    """ Filter a dictionary with a simple string syntax. Returns True (=passes filter) or false """
    # All '&'-separated parts must match.
    return all(
        _match_one(filter_part, dct) for filter_part in filter_str.split('&'))
def match_filter_func(filter_str):
    """Build a match-filter callable: returns None on a match,
    or a human-readable skip message otherwise."""
    def _match_func(info_dict):
        if match_str(filter_str, info_dict):
            return None
        video_title = info_dict.get('title', info_dict.get('id', 'video'))
        return '%s does not pass filter %s, skipping ..' % (video_title, filter_str)
    return _match_func
class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
    """ProxyHandler that honours a per-request 'Ytdl-request-proxy' header."""

    def __init__(self, proxies=None):
        # Install default http/https handlers that all route through
        # proxy_open; the defaults bind the scheme at definition time.
        for scheme in ('http', 'https'):
            setattr(self, '%s_open' % scheme,
                    lambda r, proxy='__noproxy__', type=scheme, meth=self.proxy_open:
                        meth(r, proxy, type))
        return compat_urllib_request.ProxyHandler.__init__(self, proxies)

    def proxy_open(self, req, proxy, type):
        req_proxy = req.headers.get('Ytdl-request-proxy')
        if req_proxy is not None:
            proxy = req_proxy
            del req.headers['Ytdl-request-proxy']

        if proxy == '__noproxy__':
            return None  # No Proxy
        return compat_urllib_request.ProxyHandler.proxy_open(
            self, req, proxy, type)