2 # -*- coding: utf-8 -*-
4 from __future__ import unicode_literals
32 import xml.etree.ElementTree
41 compat_socket_create_connection,
45 compat_urllib_parse_urlparse,
46 compat_urllib_request,
52 # This is not clearly defined otherwise
53 compiled_regex_type = type(re.compile(''))
56 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20150101 Firefox/20.0 (Chrome)',
57 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
58 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
59 'Accept-Encoding': 'gzip, deflate',
60 'Accept-Language': 'en-us,en;q=0.5',
64 ENGLISH_MONTH_NAMES = [
65 'January', 'February', 'March', 'April', 'May', 'June',
66 'July', 'August', 'September', 'October', 'November', 'December']
69 def preferredencoding():
70 """Get preferred encoding.
72 Returns the best encoding scheme for the system, based on
73 locale.getpreferredencoding() and some further tweaks.
76 pref = locale.getpreferredencoding()
84 def write_json_file(obj, fn):
85 """ Encode obj as JSON and write it to fn, atomically if possible """
87 fn = encodeFilename(fn)
88 if sys.version_info < (3, 0) and sys.platform != 'win32':
89 encoding = get_filesystem_encoding()
90 # os.path.basename returns a bytes object, but NamedTemporaryFile
91 # will fail if the filename contains non ascii characters unless we
92 # use a unicode object
93 path_basename = lambda f: os.path.basename(fn).decode(encoding)
94 # the same for os.path.dirname
95 path_dirname = lambda f: os.path.dirname(fn).decode(encoding)
97 path_basename = os.path.basename
98 path_dirname = os.path.dirname
102 'prefix': path_basename(fn) + '.',
103 'dir': path_dirname(fn),
107 # In Python 2.x, json.dump expects a bytestream.
108 # In Python 3.x, it writes to a character stream
109 if sys.version_info < (3, 0):
117 tf = tempfile.NamedTemporaryFile(**args)
122 if sys.platform == 'win32':
123 # Need to remove existing file on Windows, else os.rename raises
124 # WindowsError or FileExistsError.
129 os.rename(tf.name, fn)
138 if sys.version_info >= (2, 7):
def find_xpath_attr(node, xpath, key, val=None):
    """Find the first element matching xpath that carries attribute *key*.

    When *val* is given, the attribute value must equal *val*
    (i.e. xpath[@key='val']); when *val* is None, any element that
    merely has the attribute matches (xpath[@key]).  The val=None
    default is backward-compatible with the previous two-value form.
    Returns the matching Element or None.
    """
    # Only allow characters that are safe inside an XPath predicate,
    # since key/val are interpolated into the expression below.
    assert re.match(r'^[a-zA-Z-]+$', key)
    if val is None:
        expr = xpath + '[@%s]' % key
    else:
        assert re.match(r'^[a-zA-Z0-9@\s:._-]*$', val)
        expr = xpath + "[@%s='%s']" % (key, val)
    return node.find(expr)
146 def find_xpath_attr(node, xpath, key, val):
147 # Here comes the crazy part: In 2.6, if the xpath is a unicode,
148 # .//node does not match if a node is a direct child of . !
149 if isinstance(xpath, compat_str):
150 xpath = xpath.encode('ascii')
152 for f in node.findall(xpath):
153 if f.attrib.get(key) == val:
157 # On python2.6 the xml.etree.ElementTree.Element methods don't support
158 # the namespace parameter
161 def xpath_with_ns(path, ns_map):
162 components = [c.split(':') for c in path.split('/')]
166 replaced.append(c[0])
169 replaced.append('{%s}%s' % (ns_map[ns], tag))
170 return '/'.join(replaced)
173 def xpath_text(node, xpath, name=None, fatal=False):
174 if sys.version_info < (2, 7): # Crazy 2.6
175 xpath = xpath.encode('ascii')
178 if n is None or n.text is None:
180 name = xpath if name is None else name
181 raise ExtractorError('Could not find XML element %s' % name)
def get_element_by_id(id, html):
    """Return the inner content of the HTML tag whose id attribute equals *id*."""
    # Thin wrapper: an id lookup is just an attribute lookup on "id".
    return get_element_by_attribute("id", id, html)
192 def get_element_by_attribute(attribute, value, html):
193 """Return the content of the tag with the specified attribute in the passed HTML document"""
195 m = re.search(r'''(?xs)
197 (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
199 (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
203 ''' % (re.escape(attribute), re.escape(value)), html)
207 res = m.group('content')
209 if res.startswith('"') or res.startswith("'"):
212 return unescapeHTML(res)
215 def clean_html(html):
216 """Clean an HTML snippet into a readable string"""
218 if html is None: # Convenience for sanitizing descriptions etc.
222 html = html.replace('\n', ' ')
223 html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
224 html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
226 html = re.sub('<.*?>', '', html)
227 # Replace html entities
228 html = unescapeHTML(html)
232 def sanitize_open(filename, open_mode):
233 """Try to open the given filename, and slightly tweak it if this fails.
235 Attempts to open the given filename. If this fails, it tries to change
236 the filename slightly, step by step, until it's either able to open it
237 or it fails and raises a final exception, like the standard open()
240 It returns the tuple (stream, definitive_file_name).
244 if sys.platform == 'win32':
246 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
247 return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
248 stream = open(encodeFilename(filename), open_mode)
249 return (stream, filename)
250 except (IOError, OSError) as err:
251 if err.errno in (errno.EACCES,):
254 # In case of error, try to remove win32 forbidden chars
255 alt_filename = sanitize_path(filename)
256 if alt_filename == filename:
259 # An exception here should be caught in the caller
260 stream = open(encodeFilename(alt_filename), open_mode)
261 return (stream, alt_filename)
264 def timeconvert(timestr):
265 """Convert RFC 2822 defined time string into system timestamp"""
267 timetuple = email.utils.parsedate_tz(timestr)
268 if timetuple is not None:
269 timestamp = email.utils.mktime_tz(timetuple)
273 def sanitize_filename(s, restricted=False, is_id=False):
274 """Sanitizes a string so it could be used as part of a filename.
275 If restricted is set, use a stricter subset of allowed characters.
276 Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
278 def replace_insane(char):
279 if char == '?' or ord(char) < 32 or ord(char) == 127:
282 return '' if restricted else '\''
284 return '_-' if restricted else ' -'
285 elif char in '\\/|*<>':
287 if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
289 if restricted and ord(char) > 127:
294 s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)
295 result = ''.join(map(replace_insane, s))
297 while '__' in result:
298 result = result.replace('__', '_')
299 result = result.strip('_')
300 # Common case of "Foreign band name - English song title"
301 if restricted and result.startswith('-_'):
303 if result.startswith('-'):
304 result = '_' + result[len('-'):]
305 result = result.lstrip('.')
311 def sanitize_path(s):
312 """Sanitizes and normalizes path on Windows"""
313 if sys.platform != 'win32':
315 drive, _ = os.path.splitdrive(s)
316 unc, _ = os.path.splitunc(s)
317 unc_or_drive = unc or drive
318 norm_path = os.path.normpath(remove_start(s, unc_or_drive)).split(os.path.sep)
322 path_part if path_part in ['.', '..'] else re.sub('(?:[/<>:"\\|\\\\?\\*]|\.$)', '#', path_part)
323 for path_part in norm_path]
325 sanitized_path.insert(0, unc_or_drive + os.path.sep)
326 return os.path.join(*sanitized_path)
def sanitize_url_path_consecutive_slashes(url):
    """Collapse any run of slashes in the path component of *url* to one slash."""
    parts = list(compat_urlparse.urlparse(url))
    # Index 2 of the 6-tuple produced by urlparse is the path component.
    parts[2] = re.sub(r'/{2,}', '/', parts[2])
    return compat_urlparse.urlunparse(parts)
336 def orderedSet(iterable):
337 """ Remove all duplicates from the input iterable """
345 def _htmlentity_transform(entity):
346 """Transforms an HTML entity to a character."""
347 # Known non-numeric HTML entity
348 if entity in compat_html_entities.name2codepoint:
349 return compat_chr(compat_html_entities.name2codepoint[entity])
351 mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
353 numstr = mobj.group(1)
354 if numstr.startswith('x'):
356 numstr = '0%s' % numstr
359 return compat_chr(int(numstr, base))
361 # Unknown entity in name, return its literal representation
362 return ('&%s;' % entity)
368 assert type(s) == compat_str
371 r'&([^;]+);', lambda m: _htmlentity_transform(m.group(1)), s)
374 def encodeFilename(s, for_subprocess=False):
376 @param s The name of the file
379 assert type(s) == compat_str
381 # Python 3 has a Unicode API
382 if sys.version_info >= (3, 0):
385 if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
386 # Pass '' directly to use Unicode APIs on Windows 2000 and up
387 # (Detecting Windows NT 4 is tricky because 'major >= 4' would
388 # match Windows 9x series as well. Besides, NT 4 is obsolete.)
389 if not for_subprocess:
392 # For subprocess calls, encode with locale encoding
393 # Refer to http://stackoverflow.com/a/9951851/35070
394 encoding = preferredencoding()
396 encoding = sys.getfilesystemencoding()
399 return s.encode(encoding, 'ignore')
402 def encodeArgument(s):
403 if not isinstance(s, compat_str):
404 # Legacy code that uses byte strings
405 # Uncomment the following line after fixing all post processors
406 # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
407 s = s.decode('ascii')
408 return encodeFilename(s, True)
411 def decodeOption(optval):
414 if isinstance(optval, bytes):
415 optval = optval.decode(preferredencoding())
417 assert isinstance(optval, compat_str)
421 def formatSeconds(secs):
423 return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
425 return '%d:%02d' % (secs // 60, secs % 60)
430 def make_HTTPS_handler(params, **kwargs):
431 opts_no_check_certificate = params.get('nocheckcertificate', False)
432 if hasattr(ssl, 'create_default_context'): # Python >= 3.4 or 2.7.9
433 context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH)
434 if opts_no_check_certificate:
435 context.check_hostname = False
436 context.verify_mode = ssl.CERT_NONE
438 return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
441 # (create_default_context present but HTTPSHandler has no context=)
444 if sys.version_info < (3, 2):
445 return YoutubeDLHTTPSHandler(params, **kwargs)
447 context = ssl.SSLContext(ssl.PROTOCOL_TLSv1)
448 context.verify_mode = (ssl.CERT_NONE
449 if opts_no_check_certificate
450 else ssl.CERT_REQUIRED)
451 context.set_default_verify_paths()
452 return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
455 class ExtractorError(Exception):
456 """Error during info extraction."""
458 def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
459 """ tb, if given, is the original traceback (so that it can be printed out).
460 If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
463 if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
465 if video_id is not None:
466 msg = video_id + ': ' + msg
468 msg += ' (caused by %r)' % cause
470 if ytdl_is_updateable():
471 update_cmd = 'type youtube-dl -U to update'
473 update_cmd = 'see https://yt-dl.org/update on how to update'
474 msg += '; please report this issue on https://yt-dl.org/bug .'
475 msg += ' Make sure you are using the latest version; %s.' % update_cmd
476 msg += ' Be sure to call youtube-dl with the --verbose flag and include its complete output.'
477 super(ExtractorError, self).__init__(msg)
480 self.exc_info = sys.exc_info() # preserve original exception
482 self.video_id = video_id
484 def format_traceback(self):
485 if self.traceback is None:
487 return ''.join(traceback.format_tb(self.traceback))
class UnsupportedError(ExtractorError):
    """Raised when no extractor can handle the given URL (always `expected`)."""

    def __init__(self, url):
        message = 'Unsupported URL: %s' % url
        super(UnsupportedError, self).__init__(message, expected=True)
497 class RegexNotFoundError(ExtractorError):
498 """Error when a regex didn't match"""
502 class DownloadError(Exception):
503 """Download Error exception.
505 This exception may be thrown by FileDownloader objects if they are not
506 configured to continue on errors. They will contain the appropriate
510 def __init__(self, msg, exc_info=None):
511 """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
512 super(DownloadError, self).__init__(msg)
513 self.exc_info = exc_info
516 class SameFileError(Exception):
517 """Same File exception.
519 This exception will be thrown by FileDownloader objects if they detect
520 multiple files would have to be downloaded to the same file on disk.
525 class PostProcessingError(Exception):
526 """Post Processing exception.
528 This exception may be raised by PostProcessor's .run() method to
529 indicate an error in the postprocessing task.
532 def __init__(self, msg):
536 class MaxDownloadsReached(Exception):
537 """ --max-downloads limit has been reached. """
541 class UnavailableVideoError(Exception):
542 """Unavailable Format exception.
544 This exception will be thrown when a video is requested
545 in a format that is not available for that video.
550 class ContentTooShortError(Exception):
551 """Content Too Short exception.
553 This exception may be raised by FileDownloader objects when a file they
554 download is too small for what the server announced first, indicating
555 the connection was probably interrupted.
561 def __init__(self, downloaded, expected):
562 self.downloaded = downloaded
563 self.expected = expected
566 def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
567 hc = http_class(*args, **kwargs)
568 source_address = ydl_handler._params.get('source_address')
569 if source_address is not None:
570 sa = (source_address, 0)
571 if hasattr(hc, 'source_address'): # Python 2.7+
572 hc.source_address = sa
574 def _hc_connect(self, *args, **kwargs):
575 sock = compat_socket_create_connection(
576 (self.host, self.port), self.timeout, sa)
578 self.sock = ssl.wrap_socket(
579 sock, self.key_file, self.cert_file,
580 ssl_version=ssl.PROTOCOL_TLSv1)
583 hc.connect = functools.partial(_hc_connect, hc)
588 class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
589 """Handler for HTTP requests and responses.
591 This class, when installed with an OpenerDirector, automatically adds
592 the standard headers to every HTTP request and handles gzipped and
593 deflated responses from web servers. If compression is to be avoided in
594 a particular request, the original request in the program code only has
595 to include the HTTP header "Youtubedl-No-Compression", which will be
596 removed before making the real request.
598 Part of this code was copied from:
600 http://techknack.net/python-urllib2-handlers/
602 Andrew Rowls, the author of that code, agreed to release it to the
606 def __init__(self, params, *args, **kwargs):
607 compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
608 self._params = params
610 def http_open(self, req):
611 return self.do_open(functools.partial(
612 _create_http_connection, self, compat_http_client.HTTPConnection, False),
618 return zlib.decompress(data, -zlib.MAX_WBITS)
620 return zlib.decompress(data)
623 def addinfourl_wrapper(stream, headers, url, code):
624 if hasattr(compat_urllib_request.addinfourl, 'getcode'):
625 return compat_urllib_request.addinfourl(stream, headers, url, code)
626 ret = compat_urllib_request.addinfourl(stream, headers, url)
630 def http_request(self, req):
631 for h, v in std_headers.items():
632 # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
633 # The dict keys are capitalized because of this bug by urllib
634 if h.capitalize() not in req.headers:
636 if 'Youtubedl-no-compression' in req.headers:
637 if 'Accept-encoding' in req.headers:
638 del req.headers['Accept-encoding']
639 del req.headers['Youtubedl-no-compression']
641 if sys.version_info < (2, 7) and '#' in req.get_full_url():
642 # Python 2.6 is brain-dead when it comes to fragments
643 req._Request__original = req._Request__original.partition('#')[0]
644 req._Request__r_type = req._Request__r_type.partition('#')[0]
648 def http_response(self, req, resp):
651 if resp.headers.get('Content-encoding', '') == 'gzip':
652 content = resp.read()
653 gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
655 uncompressed = io.BytesIO(gz.read())
656 except IOError as original_ioerror:
657 # There may be junk add the end of the file
658 # See http://stackoverflow.com/q/4928560/35070 for details
659 for i in range(1, 1024):
661 gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
662 uncompressed = io.BytesIO(gz.read())
667 raise original_ioerror
668 resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
669 resp.msg = old_resp.msg
671 if resp.headers.get('Content-encoding', '') == 'deflate':
672 gz = io.BytesIO(self.deflate(resp.read()))
673 resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
674 resp.msg = old_resp.msg
677 https_request = http_request
678 https_response = http_response
681 class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
682 def __init__(self, params, https_conn_class=None, *args, **kwargs):
683 compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
684 self._https_conn_class = https_conn_class or compat_http_client.HTTPSConnection
685 self._params = params
687 def https_open(self, req):
689 if hasattr(self, '_context'): # python > 2.6
690 kwargs['context'] = self._context
691 if hasattr(self, '_check_hostname'): # python 3.x
692 kwargs['check_hostname'] = self._check_hostname
693 return self.do_open(functools.partial(
694 _create_http_connection, self, self._https_conn_class, True),
698 def parse_iso8601(date_str, delimiter='T', timezone=None):
699 """ Return a UNIX timestamp from the given date """
706 r'(\.[0-9]+)?(?:Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
709 timezone = datetime.timedelta()
711 date_str = date_str[:-len(m.group(0))]
712 if not m.group('sign'):
713 timezone = datetime.timedelta()
715 sign = 1 if m.group('sign') == '+' else -1
716 timezone = datetime.timedelta(
717 hours=sign * int(m.group('hours')),
718 minutes=sign * int(m.group('minutes')))
719 date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
720 dt = datetime.datetime.strptime(date_str, date_format) - timezone
721 return calendar.timegm(dt.timetuple())
724 def unified_strdate(date_str, day_first=True):
725 """Return a string with the date in the format YYYYMMDD"""
731 date_str = date_str.replace(',', ' ')
732 # %z (UTC offset) is only supported in python>=3.2
733 date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str)
734 # Remove AM/PM + timezone
735 date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
737 format_expressions = [
742 '%b %dst %Y %I:%M%p',
743 '%b %dnd %Y %I:%M%p',
744 '%b %dth %Y %I:%M%p',
750 '%Y-%m-%d %H:%M:%S.%f',
753 '%Y-%m-%dT%H:%M:%SZ',
754 '%Y-%m-%dT%H:%M:%S.%fZ',
755 '%Y-%m-%dT%H:%M:%S.%f0Z',
757 '%Y-%m-%dT%H:%M:%S.%f',
761 format_expressions.extend([
769 format_expressions.extend([
776 for expression in format_expressions:
778 upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
781 if upload_date is None:
782 timetuple = email.utils.parsedate_tz(date_str)
784 upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
788 def determine_ext(url, default_ext='unknown_video'):
791 guess = url.partition('?')[0].rpartition('.')[2]
792 if re.match(r'^[A-Za-z0-9]+$', guess):
def subtitles_filename(filename, sub_lang, sub_format):
    """Derive the subtitle file name: <base>.<sub_lang>.<sub_format>.

    Only the last extension of *filename* is stripped; a name without
    a dot is used as-is for the base.
    """
    base = filename.rsplit('.', 1)[0]
    return '.'.join([base, sub_lang, sub_format])
802 def date_from_str(date_str):
804 Return a datetime object from a string in the format YYYYMMDD or
805 (now|today)[+-][0-9](day|week|month|year)(s)?"""
806 today = datetime.date.today()
807 if date_str in ('now', 'today'):
809 if date_str == 'yesterday':
810 return today - datetime.timedelta(days=1)
811 match = re.match('(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
812 if match is not None:
813 sign = match.group('sign')
814 time = int(match.group('time'))
817 unit = match.group('unit')
818 # A bad aproximation?
826 delta = datetime.timedelta(**{unit: time})
828 return datetime.datetime.strptime(date_str, "%Y%m%d").date()
831 def hyphenate_date(date_str):
833 Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
834 match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
835 if match is not None:
836 return '-'.join(match.groups())
841 class DateRange(object):
842 """Represents a time interval between two dates"""
844 def __init__(self, start=None, end=None):
845 """start and end must be strings in the format accepted by date"""
846 if start is not None:
847 self.start = date_from_str(start)
849 self.start = datetime.datetime.min.date()
851 self.end = date_from_str(end)
853 self.end = datetime.datetime.max.date()
854 if self.start > self.end:
855 raise ValueError('Date range: "%s" , the start date must be before the end date' % self)
859 """Returns a range that only contains the given day"""
862 def __contains__(self, date):
863 """Check if the date is in the range"""
864 if not isinstance(date, datetime.date):
865 date = date_from_str(date)
866 return self.start <= date <= self.end
869 return '%s - %s' % (self.start.isoformat(), self.end.isoformat())
873 """ Returns the platform name as a compat_str """
874 res = platform.platform()
875 if isinstance(res, bytes):
876 res = res.decode(preferredencoding())
878 assert isinstance(res, compat_str)
882 def _windows_write_string(s, out):
883 """ Returns True if the string was written using special methods,
884 False if it has yet to be written out."""
885 # Adapted from http://stackoverflow.com/a/3259271/35070
888 import ctypes.wintypes
896 fileno = out.fileno()
897 except AttributeError:
898 # If the output stream doesn't have a fileno, it's virtual
900 except io.UnsupportedOperation:
901 # Some strange Windows pseudo files?
903 if fileno not in WIN_OUTPUT_IDS:
906 GetStdHandle = ctypes.WINFUNCTYPE(
907 ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
908 (b"GetStdHandle", ctypes.windll.kernel32))
909 h = GetStdHandle(WIN_OUTPUT_IDS[fileno])
911 WriteConsoleW = ctypes.WINFUNCTYPE(
912 ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
913 ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
914 ctypes.wintypes.LPVOID)((b"WriteConsoleW", ctypes.windll.kernel32))
915 written = ctypes.wintypes.DWORD(0)
917 GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)((b"GetFileType", ctypes.windll.kernel32))
918 FILE_TYPE_CHAR = 0x0002
919 FILE_TYPE_REMOTE = 0x8000
920 GetConsoleMode = ctypes.WINFUNCTYPE(
921 ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
922 ctypes.POINTER(ctypes.wintypes.DWORD))(
923 (b"GetConsoleMode", ctypes.windll.kernel32))
924 INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value
926 def not_a_console(handle):
927 if handle == INVALID_HANDLE_VALUE or handle is None:
929 return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR or
930 GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)
935 def next_nonbmp_pos(s):
937 return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
938 except StopIteration:
942 count = min(next_nonbmp_pos(s), 1024)
945 h, s, count if count else 2, ctypes.byref(written), None)
947 raise OSError('Failed to write string')
948 if not count: # We just wrote a non-BMP character
949 assert written.value == 2
952 assert written.value > 0
953 s = s[written.value:]
957 def write_string(s, out=None, encoding=None):
960 assert type(s) == compat_str
962 if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
963 if _windows_write_string(s, out):
966 if ('b' in getattr(out, 'mode', '') or
967 sys.version_info[0] < 3): # Python 2 lies about mode of sys.stderr
968 byt = s.encode(encoding or preferredencoding(), 'ignore')
970 elif hasattr(out, 'buffer'):
971 enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
972 byt = s.encode(enc, 'ignore')
973 out.buffer.write(byt)
979 def bytes_to_intlist(bs):
982 if isinstance(bs[0], int): # Python 3
985 return [ord(c) for c in bs]
988 def intlist_to_bytes(xs):
991 return struct_pack('%dB' % len(xs), *xs)
994 # Cross-platform file locking
995 if sys.platform == 'win32':
996 import ctypes.wintypes
999 class OVERLAPPED(ctypes.Structure):
1001 ('Internal', ctypes.wintypes.LPVOID),
1002 ('InternalHigh', ctypes.wintypes.LPVOID),
1003 ('Offset', ctypes.wintypes.DWORD),
1004 ('OffsetHigh', ctypes.wintypes.DWORD),
1005 ('hEvent', ctypes.wintypes.HANDLE),
1008 kernel32 = ctypes.windll.kernel32
1009 LockFileEx = kernel32.LockFileEx
1010 LockFileEx.argtypes = [
1011 ctypes.wintypes.HANDLE, # hFile
1012 ctypes.wintypes.DWORD, # dwFlags
1013 ctypes.wintypes.DWORD, # dwReserved
1014 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
1015 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
1016 ctypes.POINTER(OVERLAPPED) # Overlapped
1018 LockFileEx.restype = ctypes.wintypes.BOOL
1019 UnlockFileEx = kernel32.UnlockFileEx
1020 UnlockFileEx.argtypes = [
1021 ctypes.wintypes.HANDLE, # hFile
1022 ctypes.wintypes.DWORD, # dwReserved
1023 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
1024 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
1025 ctypes.POINTER(OVERLAPPED) # Overlapped
1027 UnlockFileEx.restype = ctypes.wintypes.BOOL
1028 whole_low = 0xffffffff
1029 whole_high = 0x7fffffff
1031 def _lock_file(f, exclusive):
1032 overlapped = OVERLAPPED()
1033 overlapped.Offset = 0
1034 overlapped.OffsetHigh = 0
1035 overlapped.hEvent = 0
1036 f._lock_file_overlapped_p = ctypes.pointer(overlapped)
1037 handle = msvcrt.get_osfhandle(f.fileno())
1038 if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
1039 whole_low, whole_high, f._lock_file_overlapped_p):
1040 raise OSError('Locking file failed: %r' % ctypes.FormatError())
1042 def _unlock_file(f):
1043 assert f._lock_file_overlapped_p
1044 handle = msvcrt.get_osfhandle(f.fileno())
1045 if not UnlockFileEx(handle, 0,
1046 whole_low, whole_high, f._lock_file_overlapped_p):
1047 raise OSError('Unlocking file failed: %r' % ctypes.FormatError())
1052 def _lock_file(f, exclusive):
1053 fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)
1055 def _unlock_file(f):
1056 fcntl.flock(f, fcntl.LOCK_UN)
1059 class locked_file(object):
1060 def __init__(self, filename, mode, encoding=None):
1061 assert mode in ['r', 'a', 'w']
1062 self.f = io.open(filename, mode, encoding=encoding)
1065 def __enter__(self):
1066 exclusive = self.mode != 'r'
1068 _lock_file(self.f, exclusive)
1074 def __exit__(self, etype, value, traceback):
1076 _unlock_file(self.f)
1083 def write(self, *args):
1084 return self.f.write(*args)
1086 def read(self, *args):
1087 return self.f.read(*args)
def get_filesystem_encoding():
    """Return sys.getfilesystemencoding(), defaulting to 'utf-8' when unset."""
    fs_encoding = sys.getfilesystemencoding()
    if fs_encoding is None:
        return 'utf-8'
    return fs_encoding
1095 def shell_quote(args):
1097 encoding = get_filesystem_encoding()
1099 if isinstance(a, bytes):
1100 # We may get a filename encoded with 'encodeFilename'
1101 a = a.decode(encoding)
1102 quoted_args.append(pipes.quote(a))
1103 return ' '.join(quoted_args)
1106 def takewhile_inclusive(pred, seq):
1107 """ Like itertools.takewhile, but include the latest evaluated element
1108 (the first element so that Not pred(e)) """
1115 def smuggle_url(url, data):
1116 """ Pass additional data in a URL for internal use. """
1118 sdata = compat_urllib_parse.urlencode(
1119 {'__youtubedl_smuggle': json.dumps(data)})
1120 return url + '#' + sdata
1123 def unsmuggle_url(smug_url, default=None):
1124 if '#__youtubedl_smuggle' not in smug_url:
1125 return smug_url, default
1126 url, _, sdata = smug_url.rpartition('#')
1127 jsond = compat_parse_qs(sdata)['__youtubedl_smuggle'][0]
1128 data = json.loads(jsond)
1132 def format_bytes(bytes):
1135 if type(bytes) is str:
1136 bytes = float(bytes)
1140 exponent = int(math.log(bytes, 1024.0))
1141 suffix = ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB'][exponent]
1142 converted = float(bytes) / float(1024 ** exponent)
1143 return '%.2f%s' % (converted, suffix)
1146 def parse_filesize(s):
1150 # The lower-case forms are of course incorrect and inofficial,
1151 # but we support those too
1189 units_re = '|'.join(re.escape(u) for u in _UNIT_TABLE)
1191 r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)' % units_re, s)
1195 num_str = m.group('num').replace(',', '.')
1196 mult = _UNIT_TABLE[m.group('unit')]
1197 return int(float(num_str) * mult)
1200 def month_by_name(name):
1201 """ Return the number of a month by (locale-independently) English name """
1204 return ENGLISH_MONTH_NAMES.index(name) + 1
1209 def month_by_abbreviation(abbrev):
1210 """ Return the number of a month by (locale-independently) English
1214 return [s[:3] for s in ENGLISH_MONTH_NAMES].index(abbrev) + 1
1219 def fix_xml_ampersands(xml_str):
1220 """Replace all the '&' by '&' in XML"""
1222 r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
1227 def setproctitle(title):
1228 assert isinstance(title, compat_str)
1230 libc = ctypes.cdll.LoadLibrary("libc.so.6")
1233 title_bytes = title.encode('utf-8')
1234 buf = ctypes.create_string_buffer(len(title_bytes))
1235 buf.value = title_bytes
1237 libc.prctl(15, buf, 0, 0, 0)
1238 except AttributeError:
1239 return # Strange libc, just skip this
1242 def remove_start(s, start):
1243 if s.startswith(start):
1244 return s[len(start):]
1248 def remove_end(s, end):
1250 return s[:-len(end)]
def url_basename(url):
    """Return the last path segment of *url* (empty string for a bare host)."""
    path = compat_urlparse.urlparse(url).path
    segments = path.strip('/').split('/')
    return segments[-1]
1259 class HEADRequest(compat_urllib_request.Request):
1260 def get_method(self):
1264 def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
1267 v = getattr(v, get_attr, None)
1270 return default if v is None else (int(v) * invscale // scale)
def str_or_none(v, default=None):
    """Return *v* converted with compat_str, or *default* when *v* is None."""
    if v is None:
        return default
    return compat_str(v)
1277 def str_to_int(int_str):
1278 """ A more relaxed version of int_or_none """
1281 int_str = re.sub(r'[,\.\+]', '', int_str)
def float_or_none(v, scale=1, invscale=1, default=None):
    """Convert *v* to a float scaled by invscale/scale.

    Returns *default* when *v* is None or cannot be converted to a
    float, mirroring the relaxed contract of the other *_or_none
    helpers instead of raising on malformed input (e.g. 'N/A').
    """
    if v is None:
        return default
    try:
        return float(v) * invscale / scale
    except (ValueError, TypeError):
        # Non-numeric input -> treat like missing data.
        return default
1289 def parse_duration(s):
1290 if not isinstance(s, compat_basestring):
1298 (?P<only_mins>[0-9.]+)\s*(?:mins?|minutes?)\s*|
1299 (?P<only_hours>[0-9.]+)\s*(?:hours?)|
1301 \s*(?P<hours_reversed>[0-9]+)\s*(?:[:h]|hours?)\s*(?P<mins_reversed>[0-9]+)\s*(?:[:m]|mins?|minutes?)\s*|
1304 (?:(?P<days>[0-9]+)\s*(?:[:d]|days?)\s*)?
1305 (?P<hours>[0-9]+)\s*(?:[:h]|hours?)\s*
1307 (?P<mins>[0-9]+)\s*(?:[:m]|mins?|minutes?)\s*
1309 (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*(?:s|secs?|seconds?)?
1314 if m.group('only_mins'):
1315 return float_or_none(m.group('only_mins'), invscale=60)
1316 if m.group('only_hours'):
1317 return float_or_none(m.group('only_hours'), invscale=60 * 60)
1319 res += int(m.group('secs'))
1320 if m.group('mins_reversed'):
1321 res += int(m.group('mins_reversed')) * 60
1323 res += int(m.group('mins')) * 60
1324 if m.group('hours'):
1325 res += int(m.group('hours')) * 60 * 60
1326 if m.group('hours_reversed'):
1327 res += int(m.group('hours_reversed')) * 60 * 60
1329 res += int(m.group('days')) * 24 * 60 * 60
1331 res += float(m.group('ms'))
def prepend_extension(filename, ext):
    """Insert *ext* before the file's real extension: 'a.mp4' -> 'a.<ext>.mp4'."""
    base, real_ext = os.path.splitext(filename)
    # real_ext keeps its leading dot (or is '' when filename has no extension).
    return '{0}.{1}{2}'.format(base, ext, real_ext)
1340 def check_executable(exe, args=[]):
1341 """ Checks if the given binary is installed somewhere in PATH, and returns its name.
1342 args can be a list of arguments for a short output (like -version) """
1344 subprocess.Popen([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
1350 def get_exe_version(exe, args=['--version'],
1351 version_re=None, unrecognized='present'):
1352 """ Returns the version of the specified executable,
1353 or False if the executable is not present """
1355 out, _ = subprocess.Popen(
1357 stdout=subprocess.PIPE, stderr=subprocess.STDOUT).communicate()
1360 if isinstance(out, bytes): # Python 2.x
1361 out = out.decode('ascii', 'ignore')
1362 return detect_exe_version(out, version_re, unrecognized)
# Extract a version string from tool output using version_re (its first
# capture group).  NOTE(review): the lines returning the matched group or
# the `unrecognized` fallback lie outside this excerpt.
1365 def detect_exe_version(output, version_re=None, unrecognized='present'):
1366 assert isinstance(output, compat_str)
1367 if version_re is None:
# Default pattern matches e.g. "version 1.2.3" or "version n7.0-dev".
1368 version_re = r'version\s+([-0-9._a-zA-Z]+)'
1369 m = re.search(version_re, output)
# Abstract base for lazily-fetched, paginated result lists; subclasses
# implement getslice().  NOTE(review): the `def __len__` line itself falls
# in a numbering gap of this excerpt.
1376 class PagedList(object):
1378 # This is only useful for tests
# Materializes the entire list just to count it — O(total size).
1379 return len(self.getslice())
# PagedList backed by a pagefunc(pagenum) callback that is queried on
# demand, page by page.  NOTE(review): several original lines (`res = []`,
# continue/break statements, the final return) fall in numbering gaps of
# this excerpt.
1382 class OnDemandPagedList(PagedList):
1383 def __init__(self, pagefunc, pagesize):
1384 self._pagefunc = pagefunc
1385 self._pagesize = pagesize
# Collect the items with indices in [start, end) by walking whole pages.
1387 def getslice(self, start=0, end=None):
1389 for pagenum in itertools.count(start // self._pagesize):
# Index range covered by the current page: [firstid, nextfirstid).
1390 firstid = pagenum * self._pagesize
1391 nextfirstid = pagenum * self._pagesize + self._pagesize
1392 if start >= nextfirstid:
1395 page_results = list(self._pagefunc(pagenum))
# Trim the first and last pages down to the requested sub-range.
1398 start % self._pagesize
1399 if firstid <= start < nextfirstid
1403 ((end - 1) % self._pagesize) + 1
1404 if (end is not None and firstid <= end <= nextfirstid)
1407 if startv != 0 or endv is not None:
1408 page_results = page_results[startv:endv]
1409 res.extend(page_results)
1411 # A little optimization - if current page is not "full", ie. does
1412 # not contain page_size videos then we can assume that this page
1413 # is the last one - there are no more ids on further pages -
1414 # i.e. no need to query again.
1415 if len(page_results) + startv < self._pagesize:
1418 # If we got the whole page, but the next page is not interesting,
1419 # break out early as well
1420 if end == nextfirstid:
# PagedList variant where the total page count is known up front, so only
# the pages overlapping [start, end) are fetched.  NOTE(review): a few
# lines (e.g. the result accumulation and final return) lie outside this
# excerpt.
1425 class InAdvancePagedList(PagedList):
1426 def __init__(self, pagefunc, pagecount, pagesize):
1427 self._pagefunc = pagefunc
1428 self._pagecount = pagecount
1429 self._pagesize = pagesize
1431 def getslice(self, start=0, end=None):
1433 start_page = start // self._pagesize
# `end` is exclusive, hence the +1 when converting it to a page index.
1435 self._pagecount if end is None else (end // self._pagesize + 1))
# Number of leading items to drop from the first fetched page.
1436 skip_elems = start - start_page * self._pagesize
# Remaining item budget; None means "no upper bound".
1437 only_more = None if end is None else end - start
1438 for pagenum in range(start_page, end_page):
1439 page = list(self._pagefunc(pagenum))
1441 page = page[skip_elems:]
1443 if only_more is not None:
1444 if len(page) < only_more:
1445 only_more -= len(page)
1447 page = page[:only_more]
# Decode uppercase \UXXXXXXXX escape sequences in `s` into the characters
# they denote.  NOTE(review): the re.sub(...) call wrapping these lines is
# partly outside this excerpt (numbering gaps at 1456 and 1459).
1454 def uppercase_escape(s):
1455 unicode_escape = codecs.getdecoder('unicode_escape')
1457 r'\\U[0-9a-fA-F]{8}',
# getdecoder returns (decoded_string, length_consumed); [0] keeps the text.
1458 lambda m: unicode_escape(m.group(0))[0],
def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986"""
    # Python 2's quote() chokes on unicode input, so hand it UTF-8 bytes.
    if sys.version_info < (3, 0) and isinstance(s, compat_str):
        s = s.encode('utf-8')
    # Characters in the safe-list (RFC 3986 reserved set plus '%') are
    # passed through verbatim.
    safe_chars = b"%/;:@&=+$,!~*'()?#[]"
    return compat_urllib_parse.quote(s, safe_chars)
# Percent-escape each component of a URL separately so the URL structure
# (scheme, netloc, delimiters) is preserved.  NOTE(review): the closing
# `).geturl()` of this return falls outside the excerpt (gap after 1476).
1469 def escape_url(url):
1470 """Escape URL as suggested by RFC 3986"""
1471 url_parsed = compat_urllib_parse_urlparse(url)
# Only path/params/query/fragment are escaped; scheme and netloc pass
# through unchanged.
1472 return url_parsed._replace(
1473 path=escape_rfc3986(url_parsed.path),
1474 params=escape_rfc3986(url_parsed.params),
1475 query=escape_rfc3986(url_parsed.query),
1476 fragment=escape_rfc3986(url_parsed.fragment)
# Compatibility shim: on Python 2.6 (and some 2.7 builds) struct.pack
# rejects unicode format strings, so wrap pack/unpack to encode the spec
# first.  NOTE(review): the try/except TypeError/else scaffolding around
# these lines lies outside this excerpt.
# Probe call — raises TypeError on affected interpreters.
1480 struct.pack('!I', 0)
1482 # In Python 2.6 (and some 2.7 versions), struct requires a bytes argument
1483 def struct_pack(spec, *args):
1484 if isinstance(spec, compat_str):
1485 spec = spec.encode('ascii')
1486 return struct.pack(spec, *args)
1488 def struct_unpack(spec, *args):
1489 if isinstance(spec, compat_str):
1490 spec = spec.encode('ascii')
1491 return struct.unpack(spec, *args)
# Unaffected interpreters: alias struct's functions directly.
1493 struct_pack = struct.pack
1494 struct_unpack = struct.unpack
# Read a batch file of URLs: decode bytes to text, strip a leading BOM,
# skip comment lines, and return the cleaned list.  NOTE(review): the
# `def fixup(url):` line and parts of its body fall in numbering gaps of
# this excerpt.
1497 def read_batch_urls(batch_fd):
1499 if not isinstance(url, compat_str):
1500 url = url.decode('utf-8', 'replace')
# NOTE(review): with unicode_literals this is U+00EF U+00BB U+00BF, i.e. a
# UTF-8 BOM as seen through a Latin-1-style decode — presumably deliberate
# for the decode path above; confirm ('\ufeff' would catch a true BOM).
1501 BOM_UTF8 = '\xef\xbb\xbf'
1502 if url.startswith(BOM_UTF8):
1503 url = url[len(BOM_UTF8):]
# Lines starting with '#', ';' or ']' are treated as comments.
1505 if url.startswith(('#', ';', ']')):
# closing() guarantees the descriptor is closed even if iteration fails.
1509 with contextlib.closing(batch_fd) as fd:
1510 return [url for url in map(fixup, fd) if url]
def urlencode_postdata(*args, **kargs):
    """URL-encode POST data and return it as ASCII bytes.

    Thin wrapper around compat_urllib_parse.urlencode; urllib requires
    POST bodies to be bytes, not text.
    """
    encoded = compat_urllib_parse.urlencode(*args, **kargs)
    return encoded.encode('ascii')
# Element.iter exists from Python 2.7 on; fall back to findall on <=2.6.
# NOTE(review): the opening `try:` line lies outside this excerpt.
1518 etree_iter = xml.etree.ElementTree.Element.iter
1519 except AttributeError: # Python <=2.6
# findall('.//*') walks all descendants, approximating Element.iter.
1520 etree_iter = lambda n: n.findall('.//*')
# Body of an XML-parsing helper (its `def` line falls in a numbering gap
# just before this excerpt).  Parses a unicode string while ignoring
# doctypes and repairing Python 2's byte-string text nodes.
1524 class TreeBuilder(xml.etree.ElementTree.TreeBuilder):
1525 def doctype(self, name, pubid, system):
1526 pass # Ignore doctypes
1528 parser = xml.etree.ElementTree.XMLParser(target=TreeBuilder())
# Custom parser targets are only supported by ET.XML from Python 2.7 on.
1529 kwargs = {'parser': parser} if sys.version_info >= (2, 7) else {}
1530 tree = xml.etree.ElementTree.XML(s.encode('utf-8'), **kwargs)
1531 # Fix up XML parser in Python 2.x
1532 if sys.version_info < (3, 0):
1533 for n in etree_iter(tree):
1534 if n.text is not None:
1535 if not isinstance(n.text, compat_str):
1536 n.text = n.text.decode('utf-8')
# Convert an age-limit string like "18" or "18+" to an int, falling back
# to the US_RATINGS table for labels such as "R".  NOTE(review): the
# None-input guard before the regex lies outside this excerpt.
1549 def parse_age_limit(s):
# One or two digits, optional trailing '+' ("18+" -> 18).
1552 m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
1553 return int(m.group('age')) if m else US_RATINGS.get(s, None)
# Strip a JSONP wrapper `callback(...)` and keep only the payload.
# NOTE(review): the `return re.sub(` line lies outside this excerpt.
1556 def strip_jsonp(code):
# Group 1 is the argument list; a trailing ';' and // comments are dropped.
1558 r'(?s)^[a-zA-Z0-9_]+\s*\(\s*(.*)\);?\s*?(?://[^\n]*)*$', r'\1', code)
# Best-effort conversion of a JavaScript object literal to valid JSON
# (single quotes, unquoted keys, trailing commas).  NOTE(review): the
# inner `fix_kv` def line and several of its branches fall in numbering
# gaps of this excerpt.
1561 def js_to_json(code):
# Keywords pass through; double-quoted strings are already JSON.
1564 if v in ('true', 'false', 'null'):
1566 if v.startswith('"'):
1568 if v.startswith("'"):
# Re-escape a single-quoted body so it is valid inside double quotes.
1570 v = re.sub(r"\\\\|\\'|\"", lambda m: {
# Match strings (with escapes) or bare identifiers, rewriting each token.
1577 res = re.sub(r'''(?x)
1578 "(?:[^"\\]*(?:\\\\|\\['"nu]))*[^"\\]*"|
1579 '(?:[^'\\]*(?:\\\\|\\['"nu]))*[^'\\]*'|
1580 [a-zA-Z_][.a-zA-Z_0-9]*
# Drop trailing commas before ']' or '}'.
1582 res = re.sub(r',(\s*[\]}])', lambda m: m.group(1), res)
# Build a quality-to-number mapping: a later index in quality_ids means a
# better quality.  NOTE(review): the inner `def q(qid):` line and its
# unknown-id fallback lie outside this excerpt.
1586 def qualities(quality_ids):
1587 """ Get a numeric quality value out of a list of possible values """
1590 return quality_ids.index(qid)
# Default output-filename template: "<title>-<id>.<ext>".
1596 DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
# Truncate s to at most `length` characters, appending an ellipsis marker.
# NOTE(review): the None guard, the ELLIPSES constant and the
# short-string fast path lie outside this excerpt (gap 1601-1604).
1599 def limit_length(s, length):
1600 """ Add ellipses to overly long strings """
# Reserve room for the marker so the result never exceeds `length`.
1605 return s[:length - len(ELLIPSES)] + ELLIPSES
def version_tuple(v):
    """Split a version string like '2015.01.23-1' into a tuple of ints.

    Both '.' and '-' act as separators, so release suffixes compare
    numerically as an extra tuple element.
    """
    parts = re.split(r'[-.]', v)
    return tuple(int(part) for part in parts)
# Compare dotted version strings; `assume_new` decides the answer when
# `version` is empty or unparseable.  NOTE(review): the empty-version
# guard and the try/except lines fall in numbering gaps of this excerpt.
1613 def is_outdated_version(version, limit, assume_new=True):
1615 return not assume_new
# Tuple comparison gives proper numeric ordering (2.10 > 2.9).
1617 return version_tuple(version) < version_tuple(limit)
1619 return not assume_new
# NOTE(review): original line 1625 falls in a numbering gap here —
# presumably blank; confirm against the full source.
1622 def ytdl_is_updateable():
1623 """ Returns if youtube-dl can be updated with -U """
# Local import: zipimport is only needed by this check.
1624 from zipimport import zipimporter
# Updatable when running from the zip bundle or a frozen build (sys.frozen
# is set by freezers such as py2exe).
1626 return isinstance(globals().get('__loader__'), zipimporter) or hasattr(sys, 'frozen')
def args_to_str(args):
    """Return a shell-quoted, space-joined rendering of a subprocess
    command, suitable for short log/debug output."""
    quoted = [shlex_quote(arg) for arg in args]
    return ' '.join(quoted)
# Map a MIME type to a file extension via its subtype.  NOTE(review):
# most of the mapping dict and its `.get(res, res)` fallback fall in a
# numbering gap of this excerpt.
1634 def mimetype2ext(mt):
# Keep only the subtype: 'video/mp4' -> 'mp4'.
1635 _, _, res = mt.rpartition('/')
1639 'x-mp4-fragmented': 'mp4',
# Guess a file extension for a urllib response: prefer the filename from
# Content-Disposition, then fall back to the Content-Type header.
# NOTE(review): the `try:`, the None checks and an early return fall in
# numbering gaps of this excerpt.
1643 def urlhandle_detect_ext(url_handle):
# Python 3 exposes headers as a mapping; Python 2 via .info().getheader.
1646 getheader = lambda h: url_handle.headers[h]
1647 except AttributeError: # Python < 3
1648 getheader = url_handle.info().getheader
1650 cd = getheader('Content-Disposition')
# Only the quoted-filename form of Content-Disposition is recognized.
1652 m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
1654 e = determine_ext(m.group('filename'), default_ext=None)
1658 return mimetype2ext(getheader('Content-Type'))
# Decide whether content rated `content_limit` must be blocked for a
# viewer restricted to `age_limit`.  NOTE(review): the return for the
# no-viewer-limit case (line 1665) falls outside this excerpt.
1661 def age_restricted(content_limit, age_limit):
1662 """ Returns True iff the content should be blocked """
1664 if age_limit is None: # No limit set
1666 if content_limit is None:
1667 return False # Content available for everyone
# Blocked only when the content demands a higher age than allowed.
1668 return age_limit < content_limit
# Sniff whether a byte string is HTML: strip a known BOM, decode, then
# look for a leading '<'.  NOTE(review): the BOMS list delimiters and the
# for/else plumbing fall in numbering gaps of this excerpt.
1671 def is_html(first_bytes):
1672 """ Detect whether a file contains HTML by examining its first bytes. """
# Longer (UTF-32) BOMs are listed before their UTF-16 prefixes so they
# match first.
1675 (b'\xef\xbb\xbf', 'utf-8'),
1676 (b'\x00\x00\xfe\xff', 'utf-32-be'),
1677 (b'\xff\xfe\x00\x00', 'utf-32-le'),
1678 (b'\xff\xfe', 'utf-16-le'),
1679 (b'\xfe\xff', 'utf-16-be'),
1681 for bom, enc in BOMS:
1682 if first_bytes.startswith(bom):
1683 s = first_bytes[len(bom):].decode(enc, 'replace')
# No BOM found: assume UTF-8, replacing undecodable bytes.
1686 s = first_bytes.decode('utf-8', 'replace')
1688 return re.match(r'^\s*<', s)
# Work out the download protocol for an info dict: an explicit 'protocol'
# field wins, then URL-prefix heuristics, then the parsed scheme.
# NOTE(review): the return statements of several branches and the
# extension-based (m3u8/f4m) checks fall in numbering gaps here.
1691 def determine_protocol(info_dict):
1692 protocol = info_dict.get('protocol')
1693 if protocol is not None:
1696 url = info_dict['url']
# Prefix checks cover rtmp/rtmpe/rtmpt..., mms(h), rtsp(u).
1697 if url.startswith('rtmp'):
1699 elif url.startswith('mms'):
1701 elif url.startswith('rtsp'):
1704 ext = determine_ext(url)
# Fallback: trust whatever scheme the URL parser reports.
1710 return compat_urllib_parse_urlparse(url).scheme
def render_table(header_row, data):
    """ Render a list of rows, each as a list of values """
    rows = [header_row] + data
    # Widest cell per column decides that column's width.
    col_widths = [max(len(compat_str(cell)) for cell in column)
                  for column in zip(*rows)]
    # Left-align every column but the last, with one extra space of
    # padding between columns; the last column is left ragged.
    fmt = ' '.join(
        '%-' + compat_str(width + 1) + 's' for width in col_widths[:-1]) + '%s'
    return '\n'.join(fmt % tuple(row) for row in rows)
# Evaluate one filter clause like "duration > 600" or "!is_live" against
# the dict `dct`.  NOTE(review): the operator tables' entries and several
# raise/`if m:` lines fall in numbering gaps of this excerpt.
1721 def _match_one(filter_part, dct):
1722 COMPARISON_OPERATORS = {
1730 operator_rex = re.compile(r'''(?x)\s*
1732 \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
# Numeric values may carry SI/binary suffixes (k, Ki, M, MiB, ...).
1734 (?P<intval>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)|
1735 (?P<strval>(?![0-9.])[a-z0-9A-Z]*)
1738 ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
1739 m = operator_rex.search(filter_part)
1741 op = COMPARISON_OPERATORS[m.group('op')]
1742 if m.group('strval') is not None:
# String operands only support equality/inequality.
1743 if m.group('op') not in ('=', '!='):
1745 'Operator %s does not support string values!' % m.group('op'))
1746 comparison_value = m.group('strval')
1749 comparison_value = int(m.group('intval'))
# Non-plain integers are parsed as file sizes ("500K", "1.2MiB").
1751 comparison_value = parse_filesize(m.group('intval'))
1752 if comparison_value is None:
# Retry with an assumed 'B' unit ("500K" -> "500KB").
1753 comparison_value = parse_filesize(m.group('intval') + 'B')
1754 if comparison_value is None:
1756 'Invalid integer value %r in filter part %r' % (
1757 m.group('intval'), filter_part))
1758 actual_value = dct.get(m.group('key'))
# A '?' after the operator makes a missing key count as a match.
1759 if actual_value is None:
1760 return m.group('none_inclusive')
1761 return op(actual_value, comparison_value)
# Unary present/absent tests: "key" / "!key".
1764 '': lambda v: v is not None,
1765 '!': lambda v: v is None,
1767 operator_rex = re.compile(r'''(?x)\s*
1768 (?P<op>%s)\s*(?P<key>[a-z_]+)
1770 ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
1771 m = operator_rex.search(filter_part)
1773 op = UNARY_OPERATORS[m.group('op')]
1774 actual_value = dct.get(m.group('key'))
1775 return op(actual_value)
1777 raise ValueError('Invalid filter part %r' % filter_part)
# True when every '&'-separated clause of filter_str matches dct.
# NOTE(review): the `return all(` line lies outside this excerpt.
1780 def match_str(filter_str, dct):
1781 """ Filter a dictionary with a simple string syntax. Returns True (=passes filter) or false """
1784 _match_one(filter_part, dct) for filter_part in filter_str.split('&'))
# Build a match-filter callback: the callback yields None to accept an
# info dict, or a human-readable skip message otherwise.  NOTE(review):
# the accepting `return None` and the trailing `return _match_func` fall
# in numbering gaps of this excerpt.
1787 def match_filter_func(filter_str):
1788 def _match_func(info_dict):
1789 if match_str(filter_str, info_dict):
# Prefer the title, then the id, for the skip message.
1792 video_title = info_dict.get('title', info_dict.get('id', 'video'))
1793 return '%s does not pass filter %s, skipping ..' % (video_title, filter_str)
# ProxyHandler variant that honors a per-request 'Ytdl-request-proxy'
# header before falling back to the configured proxies.  NOTE(review):
# the `proxy = req_proxy` assignment (line 1809) falls in a numbering gap
# of this excerpt.
1797 class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
1798 def __init__(self, proxies=None):
1799 # Set default handlers
1800 for type in ('http', 'https'):
1801 setattr(self, '%s_open' % type,
# Loop variables are bound as lambda defaults to avoid late binding.
1802 lambda r, proxy='__noproxy__', type=type, meth=self.proxy_open:
1803 meth(r, proxy, type))
1804 return compat_urllib_request.ProxyHandler.__init__(self, proxies)
1806 def proxy_open(self, req, proxy, type):
# Per-request override; the header is removed so it is not sent upstream.
1807 req_proxy = req.headers.get('Ytdl-request-proxy')
1808 if req_proxy is not None:
1810 del req.headers['Ytdl-request-proxy']
1812 if proxy == '__noproxy__':
1813 return None # No Proxy
1814 return compat_urllib_request.ProxyHandler.proxy_open(
1815 self, req, proxy, type)