2 # -*- coding: utf-8 -*-
4 from __future__ import unicode_literals
32 import xml.etree.ElementTree
41 compat_socket_create_connection,
45 compat_urllib_parse_urlparse,
46 compat_urllib_request,
# This is not clearly defined otherwise
compiled_regex_type = type(re.compile(''))

# Default HTTP headers sent with every request (the surrounding dict literal
# is partially elided in this excerpt; these are entries of std_headers).
'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20150101 Firefox/20.0 (Chrome)',
'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
'Accept-Encoding': 'gzip, deflate',
'Accept-Language': 'en-us,en;q=0.5',

# Locale-independent English month names, used by the date helpers below.
ENGLISH_MONTH_NAMES = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December']
def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks.
    """
    try:
        pref = locale.getpreferredencoding()
        # Verify the reported encoding is actually usable before returning it.
        'TEST'.encode(pref)
    except Exception:
        # Unset or bogus locale: fall back to UTF-8.
        pref = 'UTF-8'

    return pref
def write_json_file(obj, fn):
    """ Encode obj as JSON and write it to fn, atomically if possible """
    # NOTE(review): several lines of this function are elided in this excerpt;
    # the comments below annotate only the visible code.
    fn = encodeFilename(fn)
    if sys.version_info < (3, 0) and sys.platform != 'win32':
        encoding = get_filesystem_encoding()
        # os.path.basename returns a bytes object, but NamedTemporaryFile
        # will fail if the filename contains non ascii characters unless we
        # use a unicode object
        path_basename = lambda f: os.path.basename(fn).decode(encoding)
        # the same for os.path.dirname
        path_dirname = lambda f: os.path.dirname(fn).decode(encoding)
    path_basename = os.path.basename
    path_dirname = os.path.dirname
    # Temp file is created next to the target so os.rename stays on one
    # filesystem (these are entries of the NamedTemporaryFile kwargs).
    'prefix': path_basename(fn) + '.',
    'dir': path_dirname(fn),
    # In Python 2.x, json.dump expects a bytestream.
    # In Python 3.x, it writes to a character stream
    if sys.version_info < (3, 0):
    tf = tempfile.NamedTemporaryFile(**args)
    if sys.platform == 'win32':
        # Need to remove existing file on Windows, else os.rename raises
        # WindowsError or FileExistsError.
    # Atomic replace of the destination with the fully-written temp file.
    os.rename(tf.name, fn)
if sys.version_info >= (2, 7):
    def find_xpath_attr(node, xpath, key, val):
        """ Find the xpath xpath[@key=val] """
        # Restrict key/val to simple character sets so they can be embedded
        # verbatim into the XPath predicate below.
        assert re.match(r'^[a-zA-Z-]+$', key)
        assert re.match(r'^[a-zA-Z0-9@\s:._-]*$', val)
        return node.find("%s[@%s='%s']" % (xpath, key, val))
def find_xpath_attr(node, xpath, key, val):
    # Python 2.6 fallback (this is the `else:` arm of the version check above).
    # Here comes the crazy part: In 2.6, if the xpath is a unicode,
    # .//node does not match if a node is a direct child of . !
    if isinstance(xpath, compat_str):
        xpath = xpath.encode('ascii')

    # 2.6 ElementTree lacks [@attr='val'] predicates, so scan candidates.
    for f in node.findall(xpath):
        if f.attrib.get(key) == val:
# On python2.6 the xml.etree.ElementTree.Element methods don't support
# the namespace parameter


def xpath_with_ns(path, ns_map):
    """Expand 'ns:tag' steps of an XPath into '{uri}tag' using ns_map."""
    components = [c.split(':') for c in path.split('/')]
    # Component without a namespace prefix: keep it unchanged.
    replaced.append(c[0])
    # Prefixed component: substitute the mapped namespace URI.
    replaced.append('{%s}%s' % (ns_map[ns], tag))
    return '/'.join(replaced)
def xpath_text(node, xpath, name=None, fatal=False):
    """Return the text of the first element matching xpath, or None.

    When fatal is true, a missing element (or one without text) raises
    ExtractorError, using `name` (or the xpath itself) in the message.
    """
    if sys.version_info < (2, 7):  # Crazy 2.6
        xpath = xpath.encode('ascii')

    n = node.find(xpath)
    if n is None or n.text is None:
        if fatal:
            name = xpath if name is None else name
            raise ExtractorError('Could not find XML element %s' % name)
        else:
            return None
    return n.text
def get_element_by_id(id, html):
    """Return the content of the tag with the specified ID in the passed HTML document"""
    # Thin convenience wrapper around the generic attribute search.
    attribute = "id"
    return get_element_by_attribute(attribute, id, html)
def get_element_by_attribute(attribute, value, html):
    """Return the content of the tag with the specified attribute in the passed HTML document"""
    # Regex-based scrape; parts of the pattern are elided in this excerpt.
    # The two attribute runs allow other attributes before/after the target.
    m = re.search(r'''(?xs)
        (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
        (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
        ''' % (re.escape(attribute), re.escape(value)), html)

    res = m.group('content')

    # Strip the quotes captured around a quoted attribute value.
    if res.startswith('"') or res.startswith("'"):

    return unescapeHTML(res)
def clean_html(html):
    """Clean an HTML snippet into a readable string"""

    if html is None:  # Convenience for sanitizing descriptions etc.
        return html

    # Newline vs <br />: real newlines are layout noise, <br> is meaningful.
    html = html.replace('\n', ' ')
    html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
    html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
    # Strip remaining html tags
    html = re.sub('<.*?>', '', html)
    # Replace html entities
    html = unescapeHTML(html)
    return html.strip()
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()

    It returns the tuple (stream, definitive_file_name).
    """
    # NOTE(review): several lines of this function are elided in this excerpt.
    if sys.platform == 'win32':
        # stdout must be switched to binary mode on Windows.
        msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
        return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
    stream = open(encodeFilename(filename), open_mode)
    return (stream, filename)
    except (IOError, OSError) as err:
        # Permission errors cannot be fixed by renaming; re-raise those.
        if err.errno in (errno.EACCES,):

        # In case of error, try to remove win32 forbidden chars
        alt_filename = os.path.join(
            re.sub('[/<>:"\\|\\\\?\\*]', '#', path_part)
            for path_part in os.path.split(filename)

        if alt_filename == filename:

        # An exception here should be caught in the caller
        # NOTE(review): this retry opens `filename` again rather than
        # `alt_filename`, yet returns alt_filename — looks like a bug; the
        # retry should open the sanitized name. Confirm against callers.
        stream = open(encodeFilename(filename), open_mode)
        return (stream, alt_filename)
def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp.

    Returns None when the string cannot be parsed.
    """
    timestamp = None
    timetuple = email.utils.parsedate_tz(timestr)
    if timetuple is not None:
        timestamp = email.utils.mktime_tz(timetuple)
    return timestamp
def sanitize_filename(s, restricted=False, is_id=False):
    """Sanitizes a string so it could be used as part of a filename.
    If restricted is set, use a stricter subset of allowed characters.
    Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
    """
    # NOTE(review): several branches of replace_insane are elided in this excerpt.
    def replace_insane(char):
        # '?' and control characters are never allowed in a filename.
        if char == '?' or ord(char) < 32 or ord(char) == 127:
        return '' if restricted else '\''
        return '_-' if restricted else ' -'
        elif char in '\\/|*<>':
        if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
        if restricted and ord(char) > 127:

    # Keep timestamps like 12:34:56 readable by turning ':' into '_'.
    s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)
    result = ''.join(map(replace_insane, s))
    # Collapse runs of underscores created by the substitutions above.
    while '__' in result:
        result = result.replace('__', '_')
    result = result.strip('_')
    # Common case of "Foreign band name - English song title"
    if restricted and result.startswith('-_'):
    if result.startswith('-'):
        result = '_' + result[len('-'):]
    result = result.lstrip('.')
def sanitize_path(s):
    """Sanitizes and normalizes path on Windows"""
    # No-op on non-Windows platforms (the early return line is elided here).
    if sys.platform != 'win32':
    drive, _ = os.path.splitdrive(s)
    # NOTE(review): os.path.splitunc was removed in Python 3.7 — confirm this
    # code path only runs on Pythons that still provide it.
    unc, _ = os.path.splitunc(s)
    unc_or_drive = unc or drive
    norm_path = os.path.normpath(remove_start(s, unc_or_drive)).split(os.path.sep)
    # Replace characters Windows forbids in each path component.
    re.sub('[/<>:"\\|\\\\?\\*]', '#', path_part)
    for path_part in norm_path]
    # Re-attach the drive/UNC prefix that was split off above.
    sanitized_path.insert(0, unc_or_drive + os.path.sep)
    return os.path.join(*sanitized_path)
def orderedSet(iterable):
    """ Remove all duplicates from the input iterable, keeping first-seen order """
    res = []
    for el in iterable:
        # O(n^2) overall, but inputs here are small (format/URL lists).
        if el not in res:
            res.append(el)
    return res
def _htmlentity_transform(entity):
    """Transforms an HTML entity to a character."""
    # Known non-numeric HTML entity
    if entity in compat_html_entities.name2codepoint:
        return compat_chr(compat_html_entities.name2codepoint[entity])

    # Numeric character reference: decimal (&#160;) or hex (&#xa1;).
    # Note: the hex alternative must allow a-f digits; the earlier pattern
    # r'#(x?[0-9]+)' wrongly rejected entities such as &#xe1;.
    mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
    if mobj is not None:
        numstr = mobj.group(1)
        if numstr.startswith('x'):
            base = 16
            # int() understands the '0x...' spelling.
            numstr = '0%s' % numstr
        else:
            base = 10
        return compat_chr(int(numstr, base))

    # Unknown entity in name, return its literal representation
    return ('&%s;' % entity)
# Body of unescapeHTML (its `def` line is elided in this excerpt).
assert type(s) == compat_str
# Replace every &entity; occurrence using _htmlentity_transform above.
r'&([^;]+);', lambda m: _htmlentity_transform(m.group(1)), s)
def encodeFilename(s, for_subprocess=False):
    """Encode a unicode filename for filesystem / subprocess use.

    @param s The name of the file
    """
    assert type(s) == compat_str

    # Python 3 has a Unicode API
    if sys.version_info >= (3, 0):

    if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
        # Pass '' directly to use Unicode APIs on Windows 2000 and up
        # (Detecting Windows NT 4 is tricky because 'major >= 4' would
        # match Windows 9x series as well. Besides, NT 4 is obsolete.)
        if not for_subprocess:

    # For subprocess calls, encode with locale encoding
    # Refer to http://stackoverflow.com/a/9951851/35070
    encoding = preferredencoding()
    encoding = sys.getfilesystemencoding()
    # Drop characters the target encoding cannot represent.
    return s.encode(encoding, 'ignore')
def encodeArgument(s):
    """Encode a command-line argument like a filename destined for a subprocess."""
    if isinstance(s, compat_str):
        return encodeFilename(s, True)
    # Legacy code that uses byte strings
    # Uncomment the following line after fixing all post processors
    # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
    return encodeFilename(s.decode('ascii'), True)
def decodeOption(optval):
    """Decode a command-line option value to unicode; None passes through."""
    if optval is None:
        return optval
    if isinstance(optval, bytes):
        optval = optval.decode(preferredencoding())

    assert isinstance(optval, compat_str)
    return optval
def formatSeconds(secs):
    """Format a number of seconds as 'H:MM:SS', 'M:SS' or 'S'."""
    if secs > 3600:
        return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
    elif secs > 60:
        return '%d:%02d' % (secs // 60, secs % 60)
    else:
        return '%d' % secs
def make_HTTPS_handler(params, **kwargs):
    """Build a YoutubeDLHTTPSHandler honoring the 'nocheckcertificate' option."""
    opts_no_check_certificate = params.get('nocheckcertificate', False)
    if hasattr(ssl, 'create_default_context'):  # Python >= 3.4 or 2.7.9
        context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH)
        if opts_no_check_certificate:
            # Disable both hostname matching and certificate verification.
            context.check_hostname = False
            context.verify_mode = ssl.CERT_NONE
        return YoutubeDLHTTPSHandler(params, context=context, **kwargs)

    # (create_default_context present but HTTPSHandler has no context=)

    # Older Pythons: no SSLContext support in HTTPSHandler at all.
    if sys.version_info < (3, 2):
        return YoutubeDLHTTPSHandler(params, **kwargs)
    # Python 3.2/3.3: assemble an SSLContext by hand.
    context = ssl.SSLContext(ssl.PROTOCOL_TLSv1)
    context.verify_mode = (ssl.CERT_NONE
                           if opts_no_check_certificate
                           else ssl.CERT_REQUIRED)
    context.set_default_verify_paths()
    return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
class ExtractorError(Exception):
    """Error during info extraction."""

    def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
        """
        # Network-level failures are treated as "expected" (not our bugs).
        if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
        if video_id is not None:
            msg = video_id + ': ' + msg
        msg += ' (caused by %r)' % cause
        # For unexpected errors, append bug-report boilerplate.
        if ytdl_is_updateable():
            update_cmd = 'type youtube-dl -U to update'
        update_cmd = 'see https://yt-dl.org/update on how to update'
        msg += '; please report this issue on https://yt-dl.org/bug .'
        msg += ' Make sure you are using the latest version; %s.' % update_cmd
        msg += ' Be sure to call youtube-dl with the --verbose flag and include its complete output.'
        super(ExtractorError, self).__init__(msg)

        self.exc_info = sys.exc_info()  # preserve original exception
        self.video_id = video_id

    def format_traceback(self):
        # Render the stored traceback; '' when none was supplied.
        if self.traceback is None:
        return ''.join(traceback.format_tb(self.traceback))
class UnsupportedError(ExtractorError):
    """Raised when no extractor can handle the given URL."""

    def __init__(self, url):
        message = 'Unsupported URL: %s' % url
        super(UnsupportedError, self).__init__(message, expected=True)
class RegexNotFoundError(ExtractorError):
    """Error when a regex didn't match"""
    # Carries no extra state beyond the ExtractorError message.
class DownloadError(Exception):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """

    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        super(DownloadError, self).__init__(msg)
        self.exc_info = exc_info
class SameFileError(Exception):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
class PostProcessingError(Exception):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """

    def __init__(self, msg):
        # Stores the message as .msg (the assignment is elided in this excerpt).
class MaxDownloadsReached(Exception):
    """ --max-downloads limit has been reached. """
class UnavailableVideoError(Exception):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """
class ContentTooShortError(Exception):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """

    def __init__(self, downloaded, expected):
        # Both values are byte counts.
        self.downloaded = downloaded
        self.expected = expected
def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
    """Create an HTTP(S) connection, honoring the 'source_address' option."""
    hc = http_class(*args, **kwargs)
    source_address = ydl_handler._params.get('source_address')
    if source_address is not None:
        # Port 0 lets the OS pick an ephemeral local port.
        sa = (source_address, 0)
        if hasattr(hc, 'source_address'):  # Python 2.7+
            hc.source_address = sa
        # Pre-2.7 fallback: replace connect() so the socket binds to sa.
        def _hc_connect(self, *args, **kwargs):
            sock = compat_socket_create_connection(
                (self.host, self.port), self.timeout, sa)
            # HTTPS: wrap the bound socket in TLS ourselves.
            self.sock = ssl.wrap_socket(
                sock, self.key_file, self.cert_file,
                ssl_version=ssl.PROTOCOL_TLSv1)
        hc.connect = functools.partial(_hc_connect, hc)
class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-No-Compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    """

    def __init__(self, params, *args, **kwargs):
        compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
        self._params = params

    def http_open(self, req):
        return self.do_open(functools.partial(
            _create_http_connection, self, compat_http_client.HTTPConnection, False),

    # Deflate helper: try raw deflate first, then zlib-wrapped data
    # (some servers send one, some the other).
        return zlib.decompress(data, -zlib.MAX_WBITS)
        return zlib.decompress(data)

    def addinfourl_wrapper(stream, headers, url, code):
        # Compatibility shim: older addinfourl has no `code` parameter.
        if hasattr(compat_urllib_request.addinfourl, 'getcode'):
            return compat_urllib_request.addinfourl(stream, headers, url, code)
        ret = compat_urllib_request.addinfourl(stream, headers, url)

    def http_request(self, req):
        # Add default headers without clobbering caller-supplied ones.
        for h, v in std_headers.items():
            # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
            # The dict keys are capitalized because of this bug by urllib
            if h.capitalize() not in req.headers:
        if 'Youtubedl-no-compression' in req.headers:
            if 'Accept-encoding' in req.headers:
                del req.headers['Accept-encoding']
            del req.headers['Youtubedl-no-compression']

        if sys.version_info < (2, 7) and '#' in req.get_full_url():
            # Python 2.6 is brain-dead when it comes to fragments
            req._Request__original = req._Request__original.partition('#')[0]
            req._Request__r_type = req._Request__r_type.partition('#')[0]

    def http_response(self, req, resp):
        # Transparently decompress gzip-encoded responses.
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
            uncompressed = io.BytesIO(gz.read())
            except IOError as original_ioerror:
                # There may be junk at the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                for i in range(1, 1024):
                    gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
                    uncompressed = io.BytesIO(gz.read())
                    raise original_ioerror
            resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        # Transparently decompress deflate-encoded responses.
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = io.BytesIO(self.deflate(resp.read()))
            resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg

    https_request = http_request
    https_response = http_response
class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
    """HTTPS handler routing connections through _create_http_connection."""

    def __init__(self, params, https_conn_class=None, *args, **kwargs):
        compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
        self._https_conn_class = https_conn_class or compat_http_client.HTTPSConnection
        self._params = params

    def https_open(self, req):
        # Forward context/check_hostname only on Pythons that support them.
        if hasattr(self, '_context'):  # python > 2.6
            kwargs['context'] = self._context
        if hasattr(self, '_check_hostname'):  # python 3.x
            kwargs['check_hostname'] = self._check_hostname
        return self.do_open(functools.partial(
            _create_http_connection, self, self._https_conn_class, True),
def parse_iso8601(date_str, delimiter='T', timezone=None):
    """ Return a UNIX timestamp from the given date """
    # Detect a trailing 'Z' or '+HH:MM'/'-HH:MM' UTC-offset suffix.
        r'(\.[0-9]+)?(?:Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
    timezone = datetime.timedelta()
    # Strip the matched suffix before strptime parsing.
    date_str = date_str[:-len(m.group(0))]
    if not m.group('sign'):
        # Bare 'Z' suffix means UTC.
        timezone = datetime.timedelta()
    sign = 1 if m.group('sign') == '+' else -1
    timezone = datetime.timedelta(
        hours=sign * int(m.group('hours')),
        minutes=sign * int(m.group('minutes')))
    date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
    dt = datetime.datetime.strptime(date_str, date_format) - timezone
    return calendar.timegm(dt.timetuple())
def unified_strdate(date_str, day_first=True):
    """Return a string with the date in the format YYYYMMDD"""
    # Normalize separators before trying the strptime format table below.
    date_str = date_str.replace(',', ' ')
    # %z (UTC offset) is only supported in python>=3.2
    date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str)
    # Remove AM/PM + timezone
    date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)

    # Candidate patterns (list partially elided in this excerpt).
    format_expressions = [
        '%b %dst %Y %I:%M%p',
        '%b %dnd %Y %I:%M%p',
        '%b %dth %Y %I:%M%p',
        '%Y-%m-%d %H:%M:%S.%f',
        '%Y-%m-%dT%H:%M:%SZ',
        '%Y-%m-%dT%H:%M:%S.%fZ',
        '%Y-%m-%dT%H:%M:%S.%f0Z',
        '%Y-%m-%dT%H:%M:%S.%f',
    # day_first toggles day-first vs month-first numeric patterns.
    format_expressions.extend([
    format_expressions.extend([
    # First format that parses wins.
    for expression in format_expressions:
        upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
    if upload_date is None:
        # Last resort: RFC 2822 parsing.
        timetuple = email.utils.parsedate_tz(date_str)
        upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
def determine_ext(url, default_ext='unknown_video'):
    """Guess a file extension from a URL; default_ext when none can be found."""
    if url is None:
        return default_ext
    # Drop the query string, then take whatever follows the last dot.
    guess = url.partition('?')[0].rpartition('.')[2]
    if re.match(r'^[A-Za-z0-9]+$', guess):
        return guess
    else:
        return default_ext
def subtitles_filename(filename, sub_lang, sub_format):
    """Derive the subtitle filename: base name + language code + format."""
    base = filename.rsplit('.', 1)[0]
    return '.'.join((base, sub_lang, sub_format))
def date_from_str(date_str):
    """
    Return a datetime object from a string in the format YYYYMMDD or
    (now|today)[+-][0-9](day|week|month|year)(s)?"""
    today = datetime.date.today()
    if date_str in ('now', 'today'):
    if date_str == 'yesterday':
        return today - datetime.timedelta(days=1)
    # Relative form: now/today followed by +-N units.
    match = re.match('(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
    if match is not None:
        sign = match.group('sign')
        time = int(match.group('time'))
        unit = match.group('unit')
        # A bad approximation? (month/year are converted to fixed day counts)
        delta = datetime.timedelta(**{unit: time})
    # Absolute form: YYYYMMDD.
    return datetime.datetime.strptime(date_str, "%Y%m%d").date()
def hyphenate_date(date_str):
    """
    Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format.

    Strings that do not look like YYYYMMDD are returned unchanged.
    """
    match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
    if match is not None:
        return '-'.join(match.groups())
    else:
        return date_str
class DateRange(object):
    """Represents a time interval between two dates"""

    def __init__(self, start=None, end=None):
        """start and end must be strings in the format accepted by date"""
        if start is not None:
            self.start = date_from_str(start)
        # Missing bounds default to the widest representable range.
        self.start = datetime.datetime.min.date()
        self.end = date_from_str(end)
        self.end = datetime.datetime.max.date()
        if self.start > self.end:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)

    # Classmethod constructor (its def line is elided in this excerpt):
    """Returns a range that only contains the given day"""

    def __contains__(self, date):
        """Check if the date is in the range"""
        if not isinstance(date, datetime.date):
            date = date_from_str(date)
        return self.start <= date <= self.end

    # __str__ body (its def line is elided in this excerpt):
    return '%s - %s' % (self.start.isoformat(), self.end.isoformat())
867 """ Returns the platform name as a compat_str """
868 res = platform.platform()
869 if isinstance(res, bytes):
870 res = res.decode(preferredencoding())
872 assert isinstance(res, compat_str)
def _windows_write_string(s, out):
    """ Returns True if the string was written using special methods,
    False if it has yet to be written out."""
    # Adapted from http://stackoverflow.com/a/3259271/35070
    import ctypes.wintypes

    fileno = out.fileno()
    except AttributeError:
        # If the output stream doesn't have a fileno, it's virtual
    except io.UnsupportedOperation:
        # Some strange Windows pseudo files?
    # Only stdout/stderr get the console fast path.
    if fileno not in WIN_OUTPUT_IDS:

    GetStdHandle = ctypes.WINFUNCTYPE(
        ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
        (b"GetStdHandle", ctypes.windll.kernel32))
    h = GetStdHandle(WIN_OUTPUT_IDS[fileno])

    WriteConsoleW = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
        ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
        ctypes.wintypes.LPVOID)((b"WriteConsoleW", ctypes.windll.kernel32))
    written = ctypes.wintypes.DWORD(0)

    GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)((b"GetFileType", ctypes.windll.kernel32))
    FILE_TYPE_CHAR = 0x0002
    FILE_TYPE_REMOTE = 0x8000
    GetConsoleMode = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
        ctypes.POINTER(ctypes.wintypes.DWORD))(
        (b"GetConsoleMode", ctypes.windll.kernel32))
    INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value

    def not_a_console(handle):
        # A handle is a real console only when it is a character device
        # for which GetConsoleMode succeeds.
        if handle == INVALID_HANDLE_VALUE or handle is None:
        return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR or
                GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)

    def next_nonbmp_pos(s):
        # Index of the first character outside the Basic Multilingual Plane.
        return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
        except StopIteration:

    # Write at most 1024 BMP characters per WriteConsoleW call.
    count = min(next_nonbmp_pos(s), 1024)

        h, s, count if count else 2, ctypes.byref(written), None)
    raise OSError('Failed to write string')
    if not count:  # We just wrote a non-BMP character
        assert written.value == 2
    assert written.value > 0
    # Drop what was written and loop on the remainder.
    s = s[written.value:]
def write_string(s, out=None, encoding=None):
    """Write the unicode string s to out, special-casing Windows consoles."""
    assert type(s) == compat_str

    if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
        # Console writes on Windows go through WriteConsoleW.
        if _windows_write_string(s, out):

    if ('b' in getattr(out, 'mode', '') or
            sys.version_info[0] < 3):  # Python 2 lies about mode of sys.stderr
        byt = s.encode(encoding or preferredencoding(), 'ignore')
    elif hasattr(out, 'buffer'):
        # Text stream with an underlying binary buffer: encode explicitly.
        enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
        byt = s.encode(enc, 'ignore')
        out.buffer.write(byt)
def bytes_to_intlist(bs):
    """Convert a byte string to a list of integer byte values."""
    if not bs:
        return []
    if isinstance(bs[0], int):  # Python 3
        return list(bs)
    else:
        # Python 2: indexing a str yields 1-char strings.
        return [ord(c) for c in bs]
def intlist_to_bytes(xs):
    """Convert a list of integer byte values back into a byte string."""
    if not xs:
        return b''
    # struct_pack is the compat shim for struct.pack (Python 2.6 quirk).
    return struct_pack('%dB' % len(xs), *xs)
# Cross-platform file locking
if sys.platform == 'win32':
    import ctypes.wintypes

    class OVERLAPPED(ctypes.Structure):
        # Field layout of the Win32 OVERLAPPED structure.
        ('Internal', ctypes.wintypes.LPVOID),
        ('InternalHigh', ctypes.wintypes.LPVOID),
        ('Offset', ctypes.wintypes.DWORD),
        ('OffsetHigh', ctypes.wintypes.DWORD),
        ('hEvent', ctypes.wintypes.HANDLE),

    kernel32 = ctypes.windll.kernel32
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    # Low/high halves of the byte range: lock the whole file.
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive):
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        # Keep the OVERLAPPED struct alive for the lifetime of the lock.
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)
        handle = msvcrt.get_osfhandle(f.fileno())
        # 0x2 requests an exclusive lock; 0x0 a shared one.
        if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
                          whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Locking file failed: %r' % ctypes.FormatError())

    def _unlock_file(f):
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        if not UnlockFileEx(handle, 0,
                            whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())

    # POSIX implementation (the `else:`/`import fcntl` lines are elided here).
    def _lock_file(f, exclusive):
        fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)

    def _unlock_file(f):
        fcntl.flock(f, fcntl.LOCK_UN)
class locked_file(object):
    """File wrapper holding an advisory lock for the duration of a `with` block."""

    def __init__(self, filename, mode, encoding=None):
        assert mode in ['r', 'a', 'w']
        self.f = io.open(filename, mode, encoding=encoding)
        # NOTE(review): __enter__ reads self.mode — its assignment is elided
        # in this excerpt; confirm it is set here in the full source.

    def __enter__(self):
        # Writers take an exclusive lock; readers a shared one.
        exclusive = self.mode != 'r'
        _lock_file(self.f, exclusive)

    def __exit__(self, etype, value, traceback):
        _unlock_file(self.f)

    def write(self, *args):
        return self.f.write(*args)

    def read(self, *args):
        return self.f.read(*args)
def get_filesystem_encoding():
    """Return the filesystem encoding, defaulting to 'utf-8' when unknown."""
    enc = sys.getfilesystemencoding()
    if enc is None:
        return 'utf-8'
    return enc
def shell_quote(args):
    """Quote a list of (possibly byte-string) arguments for shell display."""
    quoted_args = []
    encoding = get_filesystem_encoding()
    for a in args:
        if isinstance(a, bytes):
            # We may get a filename encoded with 'encodeFilename'
            a = a.decode(encoding)
        quoted_args.append(pipes.quote(a))
    return ' '.join(quoted_args)
def takewhile_inclusive(pred, seq):
    """ Like itertools.takewhile, but include the latest evaluated element
    (the first element so that Not pred(e)) """
    for e in seq:
        yield e
        # Stop AFTER yielding the first failing element.
        if not pred(e):
            return
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use. """
    payload = {'__youtubedl_smuggle': json.dumps(data)}
    sdata = compat_urllib_parse.urlencode(payload)
    return '%s#%s' % (url, sdata)
def unsmuggle_url(smug_url, default=None):
    """Reverse smuggle_url: return (url, data) or (url, default) if nothing smuggled."""
    if '#__youtubedl_smuggle' not in smug_url:
        return smug_url, default
    url, _, sdata = smug_url.rpartition('#')
    jsond = compat_parse_qs(sdata)['__youtubedl_smuggle'][0]
    data = json.loads(jsond)
    return url, data
def format_bytes(bytes):
    """Format a byte count as a human-readable string ('1.00KiB'); 'N/A' for None."""
    if bytes is None:
        return 'N/A'
    if type(bytes) is str:
        bytes = float(bytes)
    if bytes == 0.0:
        # log(0) is undefined; zero bytes is plain 'B'.
        exponent = 0
    else:
        exponent = int(math.log(bytes, 1024.0))
    suffix = ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB'][exponent]
    converted = float(bytes) / float(1024 ** exponent)
    return '%.2f%s' % (converted, suffix)
def parse_filesize(s):
    """Parse a human-readable size ('10.5MiB') into a byte count, or None."""
    # The lower-case forms are of course incorrect and inofficial,
    # but we support those too
    # (_UNIT_TABLE, mapping unit suffixes to multipliers, is elided here.)

    units_re = '|'.join(re.escape(u) for u in _UNIT_TABLE)
        r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)' % units_re, s)

    # Accept ',' as a decimal separator as well as '.'.
    num_str = m.group('num').replace(',', '.')
    mult = _UNIT_TABLE[m.group('unit')]
    return int(float(num_str) * mult)
def month_by_name(name):
    """ Return the number of a month by (locale-independently) English name """
    try:
        return ENGLISH_MONTH_NAMES.index(name) + 1
    except ValueError:
        # Unknown month name.
        return None
def month_by_abbreviation(abbrev):
    """ Return the number of a month by (locale-independently) English
    abbreviation (e.g. 'Jan')."""
    try:
        return [s[:3] for s in ENGLISH_MONTH_NAMES].index(abbrev) + 1
    except ValueError:
        # Unknown abbreviation.
        return None
def fix_xml_ampersands(xml_str):
    """Replace all the '&' by '&amp;' in XML, leaving existing entities alone."""
    # Negative lookahead skips '&' that already starts a known or numeric entity.
    return re.sub(
        r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
        '&amp;',
        xml_str)
def setproctitle(title):
    """Set the process title via prctl(PR_SET_NAME) on glibc systems; no-op elsewhere."""
    assert isinstance(title, compat_str)
    try:
        libc = ctypes.cdll.LoadLibrary("libc.so.6")
    except OSError:
        # Not a glibc system; silently skip.
        return
    title_bytes = title.encode('utf-8')
    # + 1 for the trailing NUL: create_string_buffer(int) allocates exactly
    # that many bytes, and assigning .value needs room for the terminator.
    buf = ctypes.create_string_buffer(len(title_bytes) + 1)
    buf.value = title_bytes
    try:
        # 15 is PR_SET_NAME.
        libc.prctl(15, buf, 0, 0, 0)
    except AttributeError:
        return  # Strange libc, just skip this
def remove_start(s, start):
    """Return s with the prefix `start` removed; s unchanged if absent."""
    if s.startswith(start):
        return s[len(start):]
    return s
def remove_end(s, end):
    """Return s with the suffix `end` removed; s unchanged if absent."""
    if s.endswith(end):
        return s[:-len(end)]
    return s
def url_basename(url):
    """Return the last path component of a URL (query/fragment excluded)."""
    parsed_path = compat_urlparse.urlparse(url).path
    return parsed_path.strip('/').rpartition('/')[2]
class HEADRequest(compat_urllib_request.Request):
    """A urllib Request that issues HEAD instead of GET."""

    def get_method(self):
        return "HEAD"
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    """Coerce v to int (scaled by invscale/scale); default for None/'' input.

    When get_attr is given, the value is first read from that attribute of v.
    """
    if get_attr:
        if v is not None:
            v = getattr(v, get_attr, None)
    # Empty strings count as missing values.
    if v == '':
        v = None
    return default if v is None else (int(v) * invscale // scale)
def str_or_none(v, default=None):
    """Return compat_str(v), or default when v is None."""
    if v is None:
        return default
    return compat_str(v)
def str_to_int(int_str):
    """ A more relaxed version of int_or_none """
    if int_str is None:
        return None
    # Drop thousands separators ('.' or ',') and stray '+' signs.
    int_str = re.sub(r'[,\.\+]', '', int_str)
    return int(int_str)
def float_or_none(v, scale=1, invscale=1, default=None):
    """Coerce v to float, scaled by invscale/scale; default when v is None."""
    if v is None:
        return default
    return float(v) * invscale / scale
def parse_duration(s):
    """Parse a duration string ('1:23', '2h 3m', '90s', ...) into seconds, or None."""
    if not isinstance(s, compat_basestring):

    # Regex alternatives (pattern preamble elided in this excerpt):
    # bare minutes / bare hours / reversed H:M / D:H:M:S-style components.
        (?P<only_mins>[0-9.]+)\s*(?:mins?|minutes?)\s*|
        (?P<only_hours>[0-9.]+)\s*(?:hours?)|
        \s*(?P<hours_reversed>[0-9]+)\s*(?:[:h]|hours?)\s*(?P<mins_reversed>[0-9]+)\s*(?:[:m]|mins?|minutes?)\s*|
        (?:(?P<days>[0-9]+)\s*(?:[:d]|days?)\s*)?
        (?P<hours>[0-9]+)\s*(?:[:h]|hours?)\s*
        (?P<mins>[0-9]+)\s*(?:[:m]|mins?|minutes?)\s*
        (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*(?:s|secs?|seconds?)?
    if m.group('only_mins'):
        return float_or_none(m.group('only_mins'), invscale=60)
    if m.group('only_hours'):
        return float_or_none(m.group('only_hours'), invscale=60 * 60)
    # Otherwise sum the captured components into seconds.
    res += int(m.group('secs'))
    if m.group('mins_reversed'):
        res += int(m.group('mins_reversed')) * 60
    res += int(m.group('mins')) * 60
    if m.group('hours'):
        res += int(m.group('hours')) * 60 * 60
    if m.group('hours_reversed'):
        res += int(m.group('hours_reversed')) * 60 * 60
    res += int(m.group('days')) * 24 * 60 * 60
    # Fractional seconds, when present.
    res += float(m.group('ms'))
def prepend_extension(filename, ext):
    """Insert ext before the real extension: ('a.mp4', 'temp') -> 'a.temp.mp4'."""
    name, real_ext = os.path.splitext(filename)
    return name + '.' + ext + real_ext
def check_executable(exe, args=[]):
    """ Checks if the given binary is installed somewhere in PATH, and returns its name.
    args can be a list of arguments for a short output (like -version) """
    # Note: `args` is never mutated, so the mutable default is harmless here.
    try:
        subprocess.Popen([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
    except OSError:
        return False
    return exe
def get_exe_version(exe, args=['--version'],
                    version_re=None, unrecognized='present'):
    """ Returns the version of the specified executable,
    or False if the executable is not present """
    try:
        out, _ = subprocess.Popen(
            [encodeArgument(exe)] + args,
            stdout=subprocess.PIPE, stderr=subprocess.STDOUT).communicate()
    except OSError:
        return False
    if isinstance(out, bytes):  # Python 2.x
        out = out.decode('ascii', 'ignore')
    return detect_exe_version(out, version_re, unrecognized)
def detect_exe_version(output, version_re=None, unrecognized='present'):
    """Extract a version string from --version output; `unrecognized` if absent."""
    assert isinstance(output, compat_str)
    if version_re is None:
        version_re = r'version\s+([-0-9._a-zA-Z]+)'
    m = re.search(version_re, output)
    if m:
        return m.group(1)
    else:
        return unrecognized
class PagedList(object):
    # Abstract base for lazily paginated result lists; subclasses provide
    # getslice(). Length is computed by materializing the full slice.
    # This is only useful for tests
    return len(self.getslice())
class OnDemandPagedList(PagedList):
    # Fetches pages on demand: pagefunc(pagenum) yields the items of one page.
    def __init__(self, pagefunc, pagesize):
        self._pagefunc = pagefunc
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        # Walk pages beginning with the page that contains `start`.
        for pagenum in itertools.count(start // self._pagesize):
            firstid = pagenum * self._pagesize
            nextfirstid = pagenum * self._pagesize + self._pagesize
            # Skip whole pages that lie entirely before `start`.
            if start >= nextfirstid:
            page_results = list(self._pagefunc(pagenum))
            # Offset inside the first interesting page (assignment opener elided).
                start % self._pagesize
                if firstid <= start < nextfirstid
            # End offset inside the last interesting page (opener elided).
                ((end - 1) % self._pagesize) + 1
                if (end is not None and firstid <= end <= nextfirstid)
            if startv != 0 or endv is not None:
                page_results = page_results[startv:endv]
            res.extend(page_results)

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            if len(page_results) + startv < self._pagesize:

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == nextfirstid:
class InAdvancePagedList(PagedList):
    # Like OnDemandPagedList, but the total number of pages is known upfront.
    def __init__(self, pagefunc, pagecount, pagesize):
        self._pagefunc = pagefunc
        self._pagecount = pagecount
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        start_page = start // self._pagesize
        # Last page to visit: all pages, or the page containing `end`.
            self._pagecount if end is None else (end // self._pagesize + 1))
        # Items to drop from the first page so the slice begins at `start`.
        skip_elems = start - start_page * self._pagesize
        # Remaining item budget; None means "no upper bound".
        only_more = None if end is None else end - start
        for pagenum in range(start_page, end_page):
            page = list(self._pagefunc(pagenum))
            page = page[skip_elems:]
            if only_more is not None:
                if len(page) < only_more:
                    only_more -= len(page)
                    page = page[:only_more]
def uppercase_escape(s):
    # Replace literal \UXXXXXXXX escape sequences in *s* with the characters
    # they denote (the re.sub call opener is elided in this view).
    unicode_escape = codecs.getdecoder('unicode_escape')
        r'\\U[0-9a-fA-F]{8}',
        lambda m: unicode_escape(m.group(0))[0],
def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986"""
    # On Python 2, quote() needs a byte string, so encode text input first.
    if isinstance(s, compat_str) and sys.version_info < (3, 0):
        s = s.encode('utf-8')
    # Keep every character RFC 3986 allows to appear unescaped in a URL.
    return compat_urllib_parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")
def escape_url(url):
    """Escape URL as suggested by RFC 3986"""
    # Escape each URL component separately so that structural separators
    # (?, #, /) are preserved; the trailing .geturl() is elided in this view.
    url_parsed = compat_urllib_parse_urlparse(url)
    return url_parsed._replace(
        path=escape_rfc3986(url_parsed.path),
        params=escape_rfc3986(url_parsed.params),
        query=escape_rfc3986(url_parsed.query),
        fragment=escape_rfc3986(url_parsed.fragment)
    # Probe whether struct accepts a text format string (the surrounding
    # try/except/else lines are elided in this view).
    struct.pack('!I', 0)
    # In Python 2.x, json.dump expects a bytestream.
    # In Python 2.6 (and some 2.7 versions), struct requires a bytes argument
    def struct_pack(spec, *args):
        # Coerce a text format spec to bytes before delegating.
        if isinstance(spec, compat_str):
            spec = spec.encode('ascii')
        return struct.pack(spec, *args)

    def struct_unpack(spec, *args):
        # Same coercion for the unpack direction.
        if isinstance(spec, compat_str):
            spec = spec.encode('ascii')
        return struct.unpack(spec, *args)

    # Modern interpreters: use the stdlib functions directly.
    struct_pack = struct.pack
    struct_unpack = struct.unpack
def read_batch_urls(batch_fd):
    # Read URLs from a batch file object, one per line (the nested fixup()
    # def line is elided in this view).
        if not isinstance(url, compat_str):
            # Bytes from a binary-mode file: decode leniently as UTF-8.
            url = url.decode('utf-8', 'replace')
        # This is the UTF-8 BOM as it appears after decoding via latin-ish paths.
        BOM_UTF8 = '\xef\xbb\xbf'
        if url.startswith(BOM_UTF8):
            url = url[len(BOM_UTF8):]
        # Skip comment lines: '#', ';' and ']' (INI-style section markers).
        if url.startswith(('#', ';', ']')):

    # Close the file object even on error; keep only non-empty results.
    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]
def urlencode_postdata(*args, **kargs):
    """URL-encode the given arguments and return ASCII bytes for a POST body."""
    encoded = compat_urllib_parse.urlencode(*args, **kargs)
    return encoded.encode('ascii')
    # Element.iter() exists on Python >= 2.7; emulate it with findall otherwise
    # (the surrounding try line and the parse_xml def line are elided here).
    etree_iter = xml.etree.ElementTree.Element.iter
except AttributeError:  # Python <=2.6
    etree_iter = lambda n: n.findall('.//*')

    class TreeBuilder(xml.etree.ElementTree.TreeBuilder):
        def doctype(self, name, pubid, system):
            pass  # Ignore doctypes

    parser = xml.etree.ElementTree.XMLParser(target=TreeBuilder())
    # The `parser` keyword of ElementTree.XML is only usable on Python >= 2.7.
    kwargs = {'parser': parser} if sys.version_info >= (2, 7) else {}
    tree = xml.etree.ElementTree.XML(s.encode('utf-8'), **kwargs)
    # Fix up XML parser in Python 2.x
    if sys.version_info < (3, 0):
        # Python 2 may hand back byte strings for pure-ASCII text nodes;
        # normalize every text payload to unicode.
        for n in etree_iter(tree):
            if n.text is not None:
                if not isinstance(n.text, compat_str):
                    n.text = n.text.decode('utf-8')
def parse_age_limit(s):
    # Parse "NN" or "NN+" into an integer age; otherwise fall back to the
    # US_RATINGS lookup table (None-input guard is elided in this view).
    m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
    return int(m.group('age')) if m else US_RATINGS.get(s, None)
def strip_jsonp(code):
    # Unwrap JSONP padding "callback( ... );" keeping only the payload;
    # trailing //-comments after the call are discarded too
    # (the re.sub call opener is elided in this view).
        r'(?s)^[a-zA-Z0-9_]+\s*\(\s*(.*)\);?\s*?(?://[^\n]*)*$', r'\1', code)
def js_to_json(code):
    # Convert a JavaScript object literal into valid JSON (the nested fix_kv
    # def and several branches are elided in this view).
        if v in ('true', 'false', 'null'):
        if v.startswith('"'):
        if v.startswith("'"):
            # Re-quote single-quoted strings, translating escapes as needed.
            v = re.sub(r"\\\\|\\'|\"", lambda m: {

    # Tokenize strings and bare identifiers; identifiers become quoted keys.
    res = re.sub(r'''(?x)
        "(?:[^"\\]*(?:\\\\|\\['"nu]))*[^"\\]*"|
        '(?:[^'\\]*(?:\\\\|\\['"nu]))*[^'\\]*'|
        [a-zA-Z_][.a-zA-Z_0-9]*
    # Drop trailing commas before closing brackets, which JSON forbids.
    res = re.sub(r',(\s*\])', lambda m: m.group(1), res)
def qualities(quality_ids):
    """ Get a numeric quality value out of a list of possible values """
    # Returns a closure mapping a quality id to its index in quality_ids
    # (the nested function's def/try lines are elided in this view).
            return quality_ids.index(qid)
# Default output filename template: "<title>-<id>.<ext>".
DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
def limit_length(s, length):
    """ Add ellipses to overly long strings """
    # Truncate so the result, ellipsis included, fits in `length` characters
    # (the length check and ELLIPSES definition are elided in this view).
        return s[:length - len(ELLIPSES)] + ELLIPSES
def version_tuple(v):
    """Turn a version string such as '2015.02.10-1' into a tuple of ints."""
    components = re.split(r'[-.]', v)
    return tuple(int(component) for component in components)
def is_outdated_version(version, limit, assume_new=True):
    # Compare two dotted version strings; on missing/unparsable input the
    # answer defaults to `not assume_new` (guard/try lines elided here).
        return not assume_new
        return version_tuple(version) < version_tuple(limit)
        return not assume_new
def ytdl_is_updateable():
    """ Returns if youtube-dl can be updated with -U """
    # Self-update works when running from a zipimport'ed bundle or a
    # frozen (e.g. py2exe) build.
    from zipimport import zipimporter
    loader = globals().get('__loader__')
    return isinstance(loader, zipimporter) or hasattr(sys, 'frozen')
def args_to_str(args):
    """Build a short, shell-quoted string representation of a command line."""
    quoted = (shlex_quote(arg) for arg in args)
    return ' '.join(quoted)
def mimetype2ext(mt):
    # Map a MIME type to a file extension using its subtype; special cases
    # live in a lookup dict whose surrounding lines are elided in this view.
    _, _, res = mt.rpartition('/')
        'x-mp4-fragmented': 'mp4',
def urlhandle_detect_ext(url_handle):
    # Guess the file extension for a response handle: prefer the filename in
    # Content-Disposition, otherwise derive it from Content-Type
    # (try/if lines are elided in this view).
        getheader = lambda h: url_handle.headers[h]
    except AttributeError:  # Python < 3
        getheader = url_handle.info().getheader

    cd = getheader('Content-Disposition')
        m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
            e = determine_ext(m.group('filename'), default_ext=None)

    return mimetype2ext(getheader('Content-Type'))
def age_restricted(content_limit, age_limit):
    """ Returns True iff the content should be blocked """
    # Block only when both limits are known and the viewer's limit is lower
    # than the content's (the first branch's return is elided in this view).
    if age_limit is None:  # No limit set
    if content_limit is None:
        return False  # Content available for everyone
    return age_limit < content_limit
def is_html(first_bytes):
    """ Detect whether a file contains HTML by examining its first bytes. """
    # (BOM, encoding) pairs; order matters — longer BOMs must be tested
    # before their shorter prefixes (the list opener is elided in this view).
        (b'\xef\xbb\xbf', 'utf-8'),
        (b'\x00\x00\xfe\xff', 'utf-32-be'),
        (b'\xff\xfe\x00\x00', 'utf-32-le'),
        (b'\xff\xfe', 'utf-16-le'),
        (b'\xfe\xff', 'utf-16-be'),
    for bom, enc in BOMS:
        if first_bytes.startswith(bom):
            # Decode the payload after the BOM with its matching encoding.
            s = first_bytes[len(bom):].decode(enc, 'replace')
        # No BOM: fall back to lenient UTF-8.
        s = first_bytes.decode('utf-8', 'replace')

    # HTML if the decoded text starts with optional whitespace then '<'.
    return re.match(r'^\s*<', s)
def determine_protocol(info_dict):
    # Work out the download protocol: explicit field first, then URL prefix,
    # then extension, finally the URL scheme (several return lines are
    # elided in this view).
    protocol = info_dict.get('protocol')
    if protocol is not None:

    url = info_dict['url']
    if url.startswith('rtmp'):
    elif url.startswith('mms'):
    elif url.startswith('rtsp'):

    ext = determine_ext(url)

    return compat_urllib_parse_urlparse(url).scheme
def render_table(header_row, data):
    """ Render a list of rows, each as a list of values """
    rows = [header_row] + data
    # Widest cell per column determines that column's width.
    widths = [max(len(compat_str(cell)) for cell in column)
              for column in zip(*rows)]
    # Left-align every column but the last (which needs no padding).
    fmt = ' '.join('%-' + compat_str(w + 1) + 's' for w in widths[:-1]) + '%s'
    return '\n'.join(fmt % tuple(row) for row in rows)
def _match_one(filter_part, dct):
    # Evaluate a single "key OP value" or unary "key"/"!key" filter clause
    # against dct (operator table entries and some regex lines are elided
    # in this view).
    COMPARISON_OPERATORS = {
    operator_rex = re.compile(r'''(?x)\s*
        \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
            (?P<intval>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)|
            (?P<strval>(?![0-9.])[a-z0-9A-Z]*)
        ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
    m = operator_rex.search(filter_part)
        op = COMPARISON_OPERATORS[m.group('op')]
        if m.group('strval') is not None:
            # Only (in)equality makes sense for string comparisons.
            if m.group('op') not in ('=', '!='):
                'Operator %s does not support string values!' % m.group('op'))
            comparison_value = m.group('strval')
                comparison_value = int(m.group('intval'))
                # Not a plain integer: try parsing as a filesize ("500k"),
                # then with an explicit trailing "B" ("500kB").
                comparison_value = parse_filesize(m.group('intval'))
                if comparison_value is None:
                    comparison_value = parse_filesize(m.group('intval') + 'B')
                if comparison_value is None:
                    'Invalid integer value %r in filter part %r' % (
                        m.group('intval'), filter_part))
        actual_value = dct.get(m.group('key'))
        if actual_value is None:
            # Missing key: the trailing '?' makes the clause match anyway.
            return m.group('none_inclusive')
        return op(actual_value, comparison_value)

    # Unary operators: "key" tests presence, "!key" tests absence.
        '': lambda v: v is not None,
        '!': lambda v: v is None,
    operator_rex = re.compile(r'''(?x)\s*
        (?P<op>%s)\s*(?P<key>[a-z_]+)
        ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
    m = operator_rex.search(filter_part)
        op = UNARY_OPERATORS[m.group('op')]
        actual_value = dct.get(m.group('key'))
        return op(actual_value)

    raise ValueError('Invalid filter part %r' % filter_part)
def match_str(filter_str, dct):
    """ Filter a dictionary with a simple string syntax. Returns True (=passes filter) or false """
    # Clauses joined by '&' must all hold (the all( opener is elided here).
        _match_one(filter_part, dct) for filter_part in filter_str.split('&'))
def match_filter_func(filter_str):
    # Build a --match-filter callback: it returns None when the video passes
    # and a human-readable skip message otherwise (return lines elided here).
    def _match_func(info_dict):
        if match_str(filter_str, info_dict):
        video_title = info_dict.get('title', info_dict.get('id', 'video'))
        return '%s does not pass filter %s, skipping ..' % (video_title, filter_str)
class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
    # ProxyHandler variant that lets an individual request override the proxy
    # via a synthetic "Ytdl-request-proxy" header.
    def __init__(self, proxies=None):
        # Set default handlers
        for type in ('http', 'https'):
            # Default both schemes to the no-proxy sentinel; the lambda's
            # default arguments bind the loop variables at definition time.
            setattr(self, '%s_open' % type,
                    lambda r, proxy='__noproxy__', type=type, meth=self.proxy_open:
                        meth(r, proxy, type))
        return compat_urllib_request.ProxyHandler.__init__(self, proxies)

    def proxy_open(self, req, proxy, type):
        # Honor a per-request proxy override, then strip the private header
        # so it never goes on the wire (the override assignment is elided
        # in this view).
        req_proxy = req.headers.get('Ytdl-request-proxy')
        if req_proxy is not None:
            del req.headers['Ytdl-request-proxy']

        if proxy == '__noproxy__':
            return None  # No Proxy
        return compat_urllib_request.ProxyHandler.proxy_open(
            self, req, proxy, type)