2 # -*- coding: utf-8 -*-
4 from __future__ import unicode_literals
32 import xml.etree.ElementTree
41 compat_socket_create_connection,
45 compat_urllib_parse_urlparse,
46 compat_urllib_request,
52 # This is not clearly defined otherwise
53 compiled_regex_type = type(re.compile(''))
56 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20150101 Firefox/20.0 (Chrome)',
57 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
58 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
59 'Accept-Encoding': 'gzip, deflate',
60 'Accept-Language': 'en-us,en;q=0.5',
# Locale-independent English month names; list index + 1 is the month number
# (consumed by month_by_name / month_by_abbreviation below).
ENGLISH_MONTH_NAMES = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December']
69 def preferredencoding():
70 """Get preferred encoding.
72 Returns the best encoding scheme for the system, based on
73 locale.getpreferredencoding() and some further tweaks.
76 pref = locale.getpreferredencoding()
84 def write_json_file(obj, fn):
85 """ Encode obj as JSON and write it to fn, atomically if possible """
87 fn = encodeFilename(fn)
88 if sys.version_info < (3, 0) and sys.platform != 'win32':
89 encoding = get_filesystem_encoding()
90 # os.path.basename returns a bytes object, but NamedTemporaryFile
91 # will fail if the filename contains non ascii characters unless we
92 # use a unicode object
93 path_basename = lambda f: os.path.basename(fn).decode(encoding)
94 # the same for os.path.dirname
95 path_dirname = lambda f: os.path.dirname(fn).decode(encoding)
97 path_basename = os.path.basename
98 path_dirname = os.path.dirname
102 'prefix': path_basename(fn) + '.',
103 'dir': path_dirname(fn),
107 # In Python 2.x, json.dump expects a bytestream.
108 # In Python 3.x, it writes to a character stream
109 if sys.version_info < (3, 0):
117 tf = tempfile.NamedTemporaryFile(**args)
122 if sys.platform == 'win32':
123 # Need to remove existing file on Windows, else os.rename raises
124 # WindowsError or FileExistsError.
129 os.rename(tf.name, fn)
138 if sys.version_info >= (2, 7):
def find_xpath_attr(node, xpath, key, val):
    """Return the first element matching xpath[@key=val], or None."""
    # The predicate is built by string interpolation, so restrict key/val
    # to characters that cannot break out of the expression.
    assert re.match(r'^[a-zA-Z-]+$', key)
    assert re.match(r'^[a-zA-Z0-9@\s:._-]*$', val)
    predicate = "[@%s='%s']" % (key, val)
    return node.find(xpath + predicate)
146 def find_xpath_attr(node, xpath, key, val):
147 # Here comes the crazy part: In 2.6, if the xpath is a unicode,
148 # .//node does not match if a node is a direct child of . !
149 if isinstance(xpath, compat_str):
150 xpath = xpath.encode('ascii')
152 for f in node.findall(xpath):
153 if f.attrib.get(key) == val:
157 # On python2.6 the xml.etree.ElementTree.Element methods don't support
158 # the namespace parameter
161 def xpath_with_ns(path, ns_map):
162 components = [c.split(':') for c in path.split('/')]
166 replaced.append(c[0])
169 replaced.append('{%s}%s' % (ns_map[ns], tag))
170 return '/'.join(replaced)
173 def xpath_text(node, xpath, name=None, fatal=False):
174 if sys.version_info < (2, 7): # Crazy 2.6
175 xpath = xpath.encode('ascii')
178 if n is None or n.text is None:
180 name = xpath if name is None else name
181 raise ExtractorError('Could not find XML element %s' % name)
def get_element_by_id(id, html):
    """Return the content of the tag carrying the given id attribute in html."""
    # Thin wrapper over the generic attribute lookup.
    return get_element_by_attribute("id", id, html)
192 def get_element_by_attribute(attribute, value, html):
193 """Return the content of the tag with the specified attribute in the passed HTML document"""
195 m = re.search(r'''(?xs)
197 (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
199 (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
203 ''' % (re.escape(attribute), re.escape(value)), html)
207 res = m.group('content')
209 if res.startswith('"') or res.startswith("'"):
212 return unescapeHTML(res)
215 def clean_html(html):
216 """Clean an HTML snippet into a readable string"""
218 if html is None: # Convenience for sanitizing descriptions etc.
222 html = html.replace('\n', ' ')
223 html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
224 html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
226 html = re.sub('<.*?>', '', html)
227 # Replace html entities
228 html = unescapeHTML(html)
232 def sanitize_open(filename, open_mode):
233 """Try to open the given filename, and slightly tweak it if this fails.
235 Attempts to open the given filename. If this fails, it tries to change
236 the filename slightly, step by step, until it's either able to open it
237 or it fails and raises a final exception, like the standard open()
240 It returns the tuple (stream, definitive_file_name).
244 if sys.platform == 'win32':
246 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
247 return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
248 stream = open(encodeFilename(filename), open_mode)
249 return (stream, filename)
250 except (IOError, OSError) as err:
251 if err.errno in (errno.EACCES,):
254 # In case of error, try to remove win32 forbidden chars
255 alt_filename = os.path.join(
256 re.sub('[/<>:"\\|\\\\?\\*]', '#', path_part)
257 for path_part in os.path.split(filename)
259 if alt_filename == filename:
262 # An exception here should be caught in the caller
263 stream = open(encodeFilename(filename), open_mode)
264 return (stream, alt_filename)
267 def timeconvert(timestr):
268 """Convert RFC 2822 defined time string into system timestamp"""
270 timetuple = email.utils.parsedate_tz(timestr)
271 if timetuple is not None:
272 timestamp = email.utils.mktime_tz(timetuple)
276 def sanitize_filename(s, restricted=False, is_id=False):
277 """Sanitizes a string so it could be used as part of a filename.
278 If restricted is set, use a stricter subset of allowed characters.
279 Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
281 def replace_insane(char):
282 if char == '?' or ord(char) < 32 or ord(char) == 127:
285 return '' if restricted else '\''
287 return '_-' if restricted else ' -'
288 elif char in '\\/|*<>':
290 if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
292 if restricted and ord(char) > 127:
297 s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)
298 result = ''.join(map(replace_insane, s))
300 while '__' in result:
301 result = result.replace('__', '_')
302 result = result.strip('_')
303 # Common case of "Foreign band name - English song title"
304 if restricted and result.startswith('-_'):
306 if result.startswith('-'):
307 result = '_' + result[len('-'):]
308 result = result.lstrip('.')
314 def orderedSet(iterable):
315 """ Remove all duplicates from the input iterable """
323 def _htmlentity_transform(entity):
324 """Transforms an HTML entity to a character."""
325 # Known non-numeric HTML entity
326 if entity in compat_html_entities.name2codepoint:
327 return compat_chr(compat_html_entities.name2codepoint[entity])
329 mobj = re.match(r'#(x?[0-9]+)', entity)
331 numstr = mobj.group(1)
332 if numstr.startswith('x'):
334 numstr = '0%s' % numstr
337 return compat_chr(int(numstr, base))
339 # Unknown entity in name, return its literal representation
340 return ('&%s;' % entity)
346 assert type(s) == compat_str
349 r'&([^;]+);', lambda m: _htmlentity_transform(m.group(1)), s)
352 def encodeFilename(s, for_subprocess=False):
354 @param s The name of the file
357 assert type(s) == compat_str
359 # Python 3 has a Unicode API
360 if sys.version_info >= (3, 0):
363 if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
364 # Pass '' directly to use Unicode APIs on Windows 2000 and up
365 # (Detecting Windows NT 4 is tricky because 'major >= 4' would
366 # match Windows 9x series as well. Besides, NT 4 is obsolete.)
367 if not for_subprocess:
370 # For subprocess calls, encode with locale encoding
371 # Refer to http://stackoverflow.com/a/9951851/35070
372 encoding = preferredencoding()
374 encoding = sys.getfilesystemencoding()
377 return s.encode(encoding, 'ignore')
def encodeArgument(s):
    """Encode a subprocess argument using the same rules as filenames."""
    if isinstance(s, compat_str):
        arg = s
    else:
        # Legacy callers still hand us byte strings; decode them first.
        # TODO: turn this into a hard error once all post processors are fixed:
        # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
        arg = s.decode('ascii')
    return encodeFilename(arg, True)
389 def decodeOption(optval):
392 if isinstance(optval, bytes):
393 optval = optval.decode(preferredencoding())
395 assert isinstance(optval, compat_str)
def formatSeconds(secs):
    """Format a duration in seconds as H:MM:SS, M:SS or plain seconds.

    Note the boundaries are exclusive: exactly 3600 seconds renders as
    minutes ('60:00') and exactly 60 renders as '60'.
    """
    if secs > 3600:
        return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
    elif secs > 60:
        return '%d:%02d' % (secs // 60, secs % 60)
    else:
        return '%d' % secs
408 def make_HTTPS_handler(params, **kwargs):
409 opts_no_check_certificate = params.get('nocheckcertificate', False)
410 if hasattr(ssl, 'create_default_context'): # Python >= 3.4 or 2.7.9
411 context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH)
412 if opts_no_check_certificate:
413 context.check_hostname = False
414 context.verify_mode = ssl.CERT_NONE
416 return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
419 # (create_default_context present but HTTPSHandler has no context=)
422 if sys.version_info < (3, 2):
423 return YoutubeDLHTTPSHandler(params, **kwargs)
425 context = ssl.SSLContext(ssl.PROTOCOL_TLSv1)
426 context.verify_mode = (ssl.CERT_NONE
427 if opts_no_check_certificate
428 else ssl.CERT_REQUIRED)
429 context.set_default_verify_paths()
430 return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
433 class ExtractorError(Exception):
434 """Error during info extraction."""
436 def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
437 """ tb, if given, is the original traceback (so that it can be printed out).
438 If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
441 if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
443 if video_id is not None:
444 msg = video_id + ': ' + msg
446 msg += ' (caused by %r)' % cause
448 if ytdl_is_updateable():
449 update_cmd = 'type youtube-dl -U to update'
451 update_cmd = 'see https://yt-dl.org/update on how to update'
452 msg += '; please report this issue on https://yt-dl.org/bug .'
453 msg += ' Make sure you are using the latest version; %s.' % update_cmd
454 msg += ' Be sure to call youtube-dl with the --verbose flag and include its complete output.'
455 super(ExtractorError, self).__init__(msg)
458 self.exc_info = sys.exc_info() # preserve original exception
460 self.video_id = video_id
462 def format_traceback(self):
463 if self.traceback is None:
465 return ''.join(traceback.format_tb(self.traceback))
class UnsupportedError(ExtractorError):
    """Raised when no extractor can handle a URL; always an 'expected' error."""

    def __init__(self, url):
        message = 'Unsupported URL: %s' % url
        super(UnsupportedError, self).__init__(message, expected=True)
475 class RegexNotFoundError(ExtractorError):
476 """Error when a regex didn't match"""
480 class DownloadError(Exception):
481 """Download Error exception.
483 This exception may be thrown by FileDownloader objects if they are not
484 configured to continue on errors. They will contain the appropriate
488 def __init__(self, msg, exc_info=None):
489 """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
490 super(DownloadError, self).__init__(msg)
491 self.exc_info = exc_info
494 class SameFileError(Exception):
495 """Same File exception.
497 This exception will be thrown by FileDownloader objects if they detect
498 multiple files would have to be downloaded to the same file on disk.
503 class PostProcessingError(Exception):
504 """Post Processing exception.
506 This exception may be raised by PostProcessor's .run() method to
507 indicate an error in the postprocessing task.
510 def __init__(self, msg):
514 class MaxDownloadsReached(Exception):
515 """ --max-downloads limit has been reached. """
519 class UnavailableVideoError(Exception):
520 """Unavailable Format exception.
522 This exception will be thrown when a video is requested
523 in a format that is not available for that video.
528 class ContentTooShortError(Exception):
529 """Content Too Short exception.
531 This exception may be raised by FileDownloader objects when a file they
532 download is too small for what the server announced first, indicating
533 the connection was probably interrupted.
539 def __init__(self, downloaded, expected):
540 self.downloaded = downloaded
541 self.expected = expected
544 def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
545 hc = http_class(*args, **kwargs)
546 source_address = ydl_handler._params.get('source_address')
547 if source_address is not None:
548 sa = (source_address, 0)
549 if hasattr(hc, 'source_address'): # Python 2.7+
550 hc.source_address = sa
552 def _hc_connect(self, *args, **kwargs):
553 sock = compat_socket_create_connection(
554 (self.host, self.port), self.timeout, sa)
556 self.sock = ssl.wrap_socket(
557 sock, self.key_file, self.cert_file,
558 ssl_version=ssl.PROTOCOL_TLSv1)
561 hc.connect = functools.partial(_hc_connect, hc)
566 class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
567 """Handler for HTTP requests and responses.
569 This class, when installed with an OpenerDirector, automatically adds
570 the standard headers to every HTTP request and handles gzipped and
571 deflated responses from web servers. If compression is to be avoided in
572 a particular request, the original request in the program code only has
573 to include the HTTP header "Youtubedl-No-Compression", which will be
574 removed before making the real request.
576 Part of this code was copied from:
578 http://techknack.net/python-urllib2-handlers/
580 Andrew Rowls, the author of that code, agreed to release it to the
584 def __init__(self, params, *args, **kwargs):
585 compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
586 self._params = params
588 def http_open(self, req):
589 return self.do_open(functools.partial(
590 _create_http_connection, self, compat_http_client.HTTPConnection, False),
596 return zlib.decompress(data, -zlib.MAX_WBITS)
598 return zlib.decompress(data)
601 def addinfourl_wrapper(stream, headers, url, code):
602 if hasattr(compat_urllib_request.addinfourl, 'getcode'):
603 return compat_urllib_request.addinfourl(stream, headers, url, code)
604 ret = compat_urllib_request.addinfourl(stream, headers, url)
608 def http_request(self, req):
609 for h, v in std_headers.items():
610 # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
611 # The dict keys are capitalized because of this bug by urllib
612 if h.capitalize() not in req.headers:
614 if 'Youtubedl-no-compression' in req.headers:
615 if 'Accept-encoding' in req.headers:
616 del req.headers['Accept-encoding']
617 del req.headers['Youtubedl-no-compression']
619 if sys.version_info < (2, 7) and '#' in req.get_full_url():
620 # Python 2.6 is brain-dead when it comes to fragments
621 req._Request__original = req._Request__original.partition('#')[0]
622 req._Request__r_type = req._Request__r_type.partition('#')[0]
626 def http_response(self, req, resp):
629 if resp.headers.get('Content-encoding', '') == 'gzip':
630 content = resp.read()
631 gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
633 uncompressed = io.BytesIO(gz.read())
634 except IOError as original_ioerror:
635 # There may be junk add the end of the file
636 # See http://stackoverflow.com/q/4928560/35070 for details
637 for i in range(1, 1024):
639 gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
640 uncompressed = io.BytesIO(gz.read())
645 raise original_ioerror
646 resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
647 resp.msg = old_resp.msg
649 if resp.headers.get('Content-encoding', '') == 'deflate':
650 gz = io.BytesIO(self.deflate(resp.read()))
651 resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
652 resp.msg = old_resp.msg
655 https_request = http_request
656 https_response = http_response
659 class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
660 def __init__(self, params, https_conn_class=None, *args, **kwargs):
661 compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
662 self._https_conn_class = https_conn_class or compat_http_client.HTTPSConnection
663 self._params = params
665 def https_open(self, req):
667 if hasattr(self, '_context'): # python > 2.6
668 kwargs['context'] = self._context
669 if hasattr(self, '_check_hostname'): # python 3.x
670 kwargs['check_hostname'] = self._check_hostname
671 return self.do_open(functools.partial(
672 _create_http_connection, self, self._https_conn_class, True),
676 def parse_iso8601(date_str, delimiter='T', timezone=None):
677 """ Return a UNIX timestamp from the given date """
684 r'(\.[0-9]+)?(?:Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
687 timezone = datetime.timedelta()
689 date_str = date_str[:-len(m.group(0))]
690 if not m.group('sign'):
691 timezone = datetime.timedelta()
693 sign = 1 if m.group('sign') == '+' else -1
694 timezone = datetime.timedelta(
695 hours=sign * int(m.group('hours')),
696 minutes=sign * int(m.group('minutes')))
697 date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
698 dt = datetime.datetime.strptime(date_str, date_format) - timezone
699 return calendar.timegm(dt.timetuple())
702 def unified_strdate(date_str, day_first=True):
703 """Return a string with the date in the format YYYYMMDD"""
709 date_str = date_str.replace(',', ' ')
710 # %z (UTC offset) is only supported in python>=3.2
711 date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str)
712 # Remove AM/PM + timezone
713 date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
715 format_expressions = [
720 '%b %dst %Y %I:%M%p',
721 '%b %dnd %Y %I:%M%p',
722 '%b %dth %Y %I:%M%p',
728 '%Y-%m-%d %H:%M:%S.%f',
731 '%Y-%m-%dT%H:%M:%SZ',
732 '%Y-%m-%dT%H:%M:%S.%fZ',
733 '%Y-%m-%dT%H:%M:%S.%f0Z',
735 '%Y-%m-%dT%H:%M:%S.%f',
739 format_expressions.extend([
746 format_expressions.extend([
752 for expression in format_expressions:
754 upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
757 if upload_date is None:
758 timetuple = email.utils.parsedate_tz(date_str)
760 upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
def determine_ext(url, default_ext='unknown_video'):
    """Guess a file extension from a URL.

    Strips any query string, takes the text after the last dot, and
    accepts it only if it is purely alphanumeric; otherwise (or when
    url is None) returns default_ext.
    """
    if url is None:
        return default_ext
    guess = url.partition('?')[0].rpartition('.')[2]
    if re.match(r'^[A-Za-z0-9]+$', guess):
        return guess
    else:
        return default_ext
def subtitles_filename(filename, sub_lang, sub_format):
    """Build a subtitle file name: <base>.<language>.<format>."""
    base = filename.rsplit('.', 1)[0]
    return '.'.join((base, sub_lang, sub_format))
def date_from_str(date_str):
    """
    Return a datetime object from a string in the format YYYYMMDD or
    (now|today)[+-][0-9](day|week|month|year)(s)?"""
    today = datetime.date.today()
    if date_str in ('now', 'today'):
        return today
    if date_str == 'yesterday':
        return today - datetime.timedelta(days=1)
    match = re.match('(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
    if match is not None:
        sign = match.group('sign')
        time = int(match.group('time'))
        if sign == '-':
            time = -time
        unit = match.group('unit')
        # A bad aproximation? timedelta has no month/year units, so
        # approximate them as 30/365 days.
        if unit == 'month':
            unit = 'day'
            time *= 30
        elif unit == 'year':
            unit = 'day'
            time *= 365
        unit += 's'
        delta = datetime.timedelta(**{unit: time})
        return today + delta
    # Fall back to a literal YYYYMMDD date (raises ValueError otherwise).
    return datetime.datetime.strptime(date_str, "%Y%m%d").date()
def hyphenate_date(date_str):
    """Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD'.

    Strings not matching that format are returned unchanged.
    """
    match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
    if match is not None:
        return '-'.join(match.groups())
    else:
        return date_str
817 class DateRange(object):
818 """Represents a time interval between two dates"""
820 def __init__(self, start=None, end=None):
821 """start and end must be strings in the format accepted by date"""
822 if start is not None:
823 self.start = date_from_str(start)
825 self.start = datetime.datetime.min.date()
827 self.end = date_from_str(end)
829 self.end = datetime.datetime.max.date()
830 if self.start > self.end:
831 raise ValueError('Date range: "%s" , the start date must be before the end date' % self)
835 """Returns a range that only contains the given day"""
838 def __contains__(self, date):
839 """Check if the date is in the range"""
840 if not isinstance(date, datetime.date):
841 date = date_from_str(date)
842 return self.start <= date <= self.end
845 return '%s - %s' % (self.start.isoformat(), self.end.isoformat())
849 """ Returns the platform name as a compat_str """
850 res = platform.platform()
851 if isinstance(res, bytes):
852 res = res.decode(preferredencoding())
854 assert isinstance(res, compat_str)
858 def _windows_write_string(s, out):
859 """ Returns True if the string was written using special methods,
860 False if it has yet to be written out."""
861 # Adapted from http://stackoverflow.com/a/3259271/35070
864 import ctypes.wintypes
872 fileno = out.fileno()
873 except AttributeError:
874 # If the output stream doesn't have a fileno, it's virtual
876 except io.UnsupportedOperation:
877 # Some strange Windows pseudo files?
879 if fileno not in WIN_OUTPUT_IDS:
882 GetStdHandle = ctypes.WINFUNCTYPE(
883 ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
884 (b"GetStdHandle", ctypes.windll.kernel32))
885 h = GetStdHandle(WIN_OUTPUT_IDS[fileno])
887 WriteConsoleW = ctypes.WINFUNCTYPE(
888 ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
889 ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
890 ctypes.wintypes.LPVOID)((b"WriteConsoleW", ctypes.windll.kernel32))
891 written = ctypes.wintypes.DWORD(0)
893 GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)((b"GetFileType", ctypes.windll.kernel32))
894 FILE_TYPE_CHAR = 0x0002
895 FILE_TYPE_REMOTE = 0x8000
896 GetConsoleMode = ctypes.WINFUNCTYPE(
897 ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
898 ctypes.POINTER(ctypes.wintypes.DWORD))(
899 (b"GetConsoleMode", ctypes.windll.kernel32))
900 INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value
902 def not_a_console(handle):
903 if handle == INVALID_HANDLE_VALUE or handle is None:
905 return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR or
906 GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)
911 def next_nonbmp_pos(s):
913 return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
914 except StopIteration:
918 count = min(next_nonbmp_pos(s), 1024)
921 h, s, count if count else 2, ctypes.byref(written), None)
923 raise OSError('Failed to write string')
924 if not count: # We just wrote a non-BMP character
925 assert written.value == 2
928 assert written.value > 0
929 s = s[written.value:]
933 def write_string(s, out=None, encoding=None):
936 assert type(s) == compat_str
938 if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
939 if _windows_write_string(s, out):
942 if ('b' in getattr(out, 'mode', '') or
943 sys.version_info[0] < 3): # Python 2 lies about mode of sys.stderr
944 byt = s.encode(encoding or preferredencoding(), 'ignore')
946 elif hasattr(out, 'buffer'):
947 enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
948 byt = s.encode(enc, 'ignore')
949 out.buffer.write(byt)
def bytes_to_intlist(bs):
    """Convert a bytes (or Python 2 str) buffer to a list of byte values."""
    if not bs:
        return []
    if isinstance(bs[0], int):  # Python 3: indexing bytes yields ints
        return list(bs)
    else:
        # Python 2 str (or py3 text): indexing yields 1-char strings
        return [ord(c) for c in bs]
def intlist_to_bytes(xs):
    """Inverse of bytes_to_intlist: pack a list of byte values into bytes."""
    if not xs:
        return b''
    # struct_pack comes from the compat layer (struct.pack-alike).
    return struct_pack('%dB' % len(xs), *xs)
970 # Cross-platform file locking
971 if sys.platform == 'win32':
972 import ctypes.wintypes
975 class OVERLAPPED(ctypes.Structure):
977 ('Internal', ctypes.wintypes.LPVOID),
978 ('InternalHigh', ctypes.wintypes.LPVOID),
979 ('Offset', ctypes.wintypes.DWORD),
980 ('OffsetHigh', ctypes.wintypes.DWORD),
981 ('hEvent', ctypes.wintypes.HANDLE),
984 kernel32 = ctypes.windll.kernel32
985 LockFileEx = kernel32.LockFileEx
986 LockFileEx.argtypes = [
987 ctypes.wintypes.HANDLE, # hFile
988 ctypes.wintypes.DWORD, # dwFlags
989 ctypes.wintypes.DWORD, # dwReserved
990 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
991 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
992 ctypes.POINTER(OVERLAPPED) # Overlapped
994 LockFileEx.restype = ctypes.wintypes.BOOL
995 UnlockFileEx = kernel32.UnlockFileEx
996 UnlockFileEx.argtypes = [
997 ctypes.wintypes.HANDLE, # hFile
998 ctypes.wintypes.DWORD, # dwReserved
999 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
1000 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
1001 ctypes.POINTER(OVERLAPPED) # Overlapped
1003 UnlockFileEx.restype = ctypes.wintypes.BOOL
1004 whole_low = 0xffffffff
1005 whole_high = 0x7fffffff
1007 def _lock_file(f, exclusive):
1008 overlapped = OVERLAPPED()
1009 overlapped.Offset = 0
1010 overlapped.OffsetHigh = 0
1011 overlapped.hEvent = 0
1012 f._lock_file_overlapped_p = ctypes.pointer(overlapped)
1013 handle = msvcrt.get_osfhandle(f.fileno())
1014 if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
1015 whole_low, whole_high, f._lock_file_overlapped_p):
1016 raise OSError('Locking file failed: %r' % ctypes.FormatError())
1018 def _unlock_file(f):
1019 assert f._lock_file_overlapped_p
1020 handle = msvcrt.get_osfhandle(f.fileno())
1021 if not UnlockFileEx(handle, 0,
1022 whole_low, whole_high, f._lock_file_overlapped_p):
1023 raise OSError('Unlocking file failed: %r' % ctypes.FormatError())
1028 def _lock_file(f, exclusive):
1029 fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)
1031 def _unlock_file(f):
1032 fcntl.flock(f, fcntl.LOCK_UN)
1035 class locked_file(object):
1036 def __init__(self, filename, mode, encoding=None):
1037 assert mode in ['r', 'a', 'w']
1038 self.f = io.open(filename, mode, encoding=encoding)
1041 def __enter__(self):
1042 exclusive = self.mode != 'r'
1044 _lock_file(self.f, exclusive)
1050 def __exit__(self, etype, value, traceback):
1052 _unlock_file(self.f)
1059 def write(self, *args):
1060 return self.f.write(*args)
1062 def read(self, *args):
1063 return self.f.read(*args)
def get_filesystem_encoding():
    """Return sys.getfilesystemencoding(), defaulting to 'utf-8' when unset."""
    enc = sys.getfilesystemencoding()
    return 'utf-8' if enc is None else enc
def shell_quote(args):
    """Return a shell-escaped command line built from the argument list.

    Byte-string arguments (e.g. produced by encodeFilename) are decoded
    with the filesystem encoding before quoting.
    """
    encoding = get_filesystem_encoding()
    quoted_args = []
    for a in args:
        if isinstance(a, bytes):
            # We may get a filename encoded with 'encodeFilename'
            a = a.decode(encoding)
        quoted_args.append(pipes.quote(a))
    return ' '.join(quoted_args)
def takewhile_inclusive(pred, seq):
    """ Like itertools.takewhile, but include the latest evaluated element
    (the first element so that Not pred(e)) """
    for e in seq:
        yield e
        if not pred(e):
            return
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use. """
    # Encode the payload as JSON inside a query string and append it as
    # the URL fragment; unsmuggle_url() reverses this.
    payload = json.dumps(data)
    sdata = compat_urllib_parse.urlencode({'__youtubedl_smuggle': payload})
    return '%s#%s' % (url, sdata)
def unsmuggle_url(smug_url, default=None):
    """Extract data hidden in a URL by smuggle_url().

    Returns (url, data); when nothing was smuggled, returns
    (smug_url, default) unchanged.
    """
    if '#__youtubedl_smuggle' not in smug_url:
        return smug_url, default
    url, _, sdata = smug_url.rpartition('#')
    jsond = compat_parse_qs(sdata)['__youtubedl_smuggle'][0]
    data = json.loads(jsond)
    return url, data
def format_bytes(bytes):
    """Human-readable byte count, e.g. 2048 -> '2.00KiB'; None -> 'N/A'."""
    if bytes is None:
        return 'N/A'
    if type(bytes) is str:
        bytes = float(bytes)
    if bytes == 0.0:
        # log(0) is undefined; zero bytes is plain 'B'
        exponent = 0
    else:
        exponent = int(math.log(bytes, 1024.0))
    suffix = ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB'][exponent]
    converted = float(bytes) / float(1024 ** exponent)
    return '%.2f%s' % (converted, suffix)
1122 def parse_filesize(s):
1126 # The lower-case forms are of course incorrect and inofficial,
1127 # but we support those too
1165 units_re = '|'.join(re.escape(u) for u in _UNIT_TABLE)
1167 r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)' % units_re, s)
1171 num_str = m.group('num').replace(',', '.')
1172 mult = _UNIT_TABLE[m.group('unit')]
1173 return int(float(num_str) * mult)
def month_by_name(name):
    """ Return the number of a month by (locale-independently) English name,
    or None when the name is not a full English month name. """
    try:
        return ENGLISH_MONTH_NAMES.index(name) + 1
    except ValueError:
        return None
def month_by_abbreviation(abbrev):
    """ Return the number of a month by (locale-independently) English
    three-letter abbreviation (e.g. 'Feb' -> 2), or None if unknown. """
    try:
        return [s[:3] for s in ENGLISH_MONTH_NAMES].index(abbrev) + 1
    except ValueError:
        return None
def fix_xml_ampersands(xml_str):
    """Replace every bare '&' by '&amp;' in XML, leaving existing
    entities (&amp; &lt; &gt; &apos; &quot; and numeric refs) untouched."""
    return re.sub(
        r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
        '&amp;',
        xml_str)
def setproctitle(title):
    """Best-effort: set the process name via prctl(PR_SET_NAME) on Linux.

    Silently does nothing when libc cannot be loaded (non-glibc systems)
    or when it has no prctl symbol.
    """
    try:
        libc = ctypes.cdll.LoadLibrary("libc.so.6")
    except OSError:
        return
    title_bytes = title.encode('utf-8')
    # +1 for the trailing NUL: ctypes raises ValueError when assigning a
    # .value that does not fit including the terminator, so sizing the
    # buffer at exactly len(title_bytes) always fails.
    buf = ctypes.create_string_buffer(len(title_bytes) + 1)
    buf.value = title_bytes
    try:
        libc.prctl(15, buf, 0, 0, 0)  # 15 == PR_SET_NAME
    except AttributeError:
        return  # Strange libc, just skip this
def remove_start(s, start):
    """Return s without the prefix start; s unchanged if it lacks the prefix."""
    if s.startswith(start):
        return s[len(start):]
    return s
def remove_end(s, end):
    """Return s without the suffix end; s unchanged if it lacks the suffix.

    The explicit truthiness check guards the empty-suffix case: without it,
    s[:-len('')] == s[:0] would wrongly return ''.
    """
    if end and s.endswith(end):
        return s[:-len(end)]
    return s
def url_basename(url):
    """Return the last path segment of url (query and fragment excluded)."""
    parsed_path = compat_urlparse.urlparse(url).path
    segments = parsed_path.strip('/').split('/')
    return segments[-1]
class HEADRequest(compat_urllib_request.Request):
    """A urllib Request that issues HEAD instead of GET."""

    def get_method(self):
        # urllib picks the HTTP verb via get_method()
        return "HEAD"
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    """Coerce v to int, returning default when v is None or empty.

    get_attr: read that attribute off v first (missing attr -> default).
    The result is int(v) * invscale // scale.
    """
    if get_attr:
        if v is not None:
            v = getattr(v, get_attr, None)
    if v == '':
        # int('') raises ValueError; treat an empty string as missing input
        v = None
    return default if v is None else (int(v) * invscale // scale)
def str_or_none(v, default=None):
    """Coerce v to compat_str, passing through default when v is None."""
    if v is None:
        return default
    return compat_str(v)
def str_to_int(int_str):
    """ A more relaxed version of int_or_none: strips thousands separators
    (commas, dots) and plus signs before converting; None passes through. """
    if int_str is None:
        return None
    int_str = re.sub(r'[,\.\+]', '', int_str)
    return int(int_str)
def float_or_none(v, scale=1, invscale=1, default=None):
    """Coerce v to float(v) * invscale / scale, or default when v is None."""
    if v is None:
        return default
    return float(v) * invscale / scale
1265 def parse_duration(s):
1266 if not isinstance(s, compat_basestring):
1274 (?P<only_mins>[0-9.]+)\s*(?:mins?|minutes?)\s*|
1275 (?P<only_hours>[0-9.]+)\s*(?:hours?)|
1277 \s*(?P<hours_reversed>[0-9]+)\s*(?:[:h]|hours?)\s*(?P<mins_reversed>[0-9]+)\s*(?:[:m]|mins?|minutes?)\s*|
1280 (?:(?P<days>[0-9]+)\s*(?:[:d]|days?)\s*)?
1281 (?P<hours>[0-9]+)\s*(?:[:h]|hours?)\s*
1283 (?P<mins>[0-9]+)\s*(?:[:m]|mins?|minutes?)\s*
1285 (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*(?:s|secs?|seconds?)?
1290 if m.group('only_mins'):
1291 return float_or_none(m.group('only_mins'), invscale=60)
1292 if m.group('only_hours'):
1293 return float_or_none(m.group('only_hours'), invscale=60 * 60)
1295 res += int(m.group('secs'))
1296 if m.group('mins_reversed'):
1297 res += int(m.group('mins_reversed')) * 60
1299 res += int(m.group('mins')) * 60
1300 if m.group('hours'):
1301 res += int(m.group('hours')) * 60 * 60
1302 if m.group('hours_reversed'):
1303 res += int(m.group('hours_reversed')) * 60 * 60
1305 res += int(m.group('days')) * 24 * 60 * 60
1307 res += float(m.group('ms'))
def prepend_extension(filename, ext):
    """Insert ext before the real extension: ('a.mp4', 'temp') -> 'a.temp.mp4'."""
    base, real_ext = os.path.splitext(filename)
    return '%s.%s%s' % (base, ext, real_ext)
def check_executable(exe, args=None):
    """ Checks if the given binary is installed somewhere in PATH, and returns its name.
    args can be a list of arguments for a short output (like -version).

    The default is None (normalized to []) rather than a mutable [] default.
    """
    try:
        subprocess.Popen(
            [exe] + (args or []),
            stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
    except OSError:
        # Binary not found / not executable
        return False
    return exe
1326 def get_exe_version(exe, args=['--version'],
1327 version_re=None, unrecognized='present'):
1328 """ Returns the version of the specified executable,
1329 or False if the executable is not present """
1331 out, _ = subprocess.Popen(
1333 stdout=subprocess.PIPE, stderr=subprocess.STDOUT).communicate()
1336 if isinstance(out, bytes): # Python 2.x
1337 out = out.decode('ascii', 'ignore')
1338 return detect_exe_version(out, version_re, unrecognized)
def detect_exe_version(output, version_re=None, unrecognized='present'):
    """Extract a version string from an executable's --version output.

    version_re: custom pattern whose group 1 is the version; the default
    matches 'version <token>'. Returns `unrecognized` when nothing matches.
    (The old `assert isinstance(output, compat_str)` was dropped: assert is
    no input validation — it disappears under -O.)
    """
    if version_re is None:
        version_re = r'version\s+([-0-9._a-zA-Z]+)'
    m = re.search(version_re, output)
    if m:
        return m.group(1)
    else:
        return unrecognized
1352 class PagedList(object):
1354 # This is only useful for tests
1355 return len(self.getslice())
class OnDemandPagedList(PagedList):
    """Paged list that fetches pages lazily via `pagefunc(pagenum)`."""

    def __init__(self, pagefunc, pagesize):
        self._pagefunc = pagefunc   # callable: page number -> iterable of items
        self._pagesize = pagesize   # nominal number of items per page

    def getslice(self, start=0, end=None):
        res = []
        for pagenum in itertools.count(start // self._pagesize):
            firstid = pagenum * self._pagesize
            nextfirstid = pagenum * self._pagesize + self._pagesize
            if start >= nextfirstid:
                continue

            page_results = list(self._pagefunc(pagenum))

            startv = (
                start % self._pagesize
                if firstid <= start < nextfirstid
                else 0)

            endv = (
                ((end - 1) % self._pagesize) + 1
                if (end is not None and firstid <= end <= nextfirstid)
                else None)

            if startv != 0 or endv is not None:
                page_results = page_results[startv:endv]
            res.extend(page_results)

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            if len(page_results) + startv < self._pagesize:
                break

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == nextfirstid:
                break
        return res
class InAdvancePagedList(PagedList):
    """Paged list whose total page count is known up front."""

    def __init__(self, pagefunc, pagecount, pagesize):
        self._pagefunc = pagefunc     # callable: page number -> iterable of items
        self._pagecount = pagecount   # total number of pages available
        self._pagesize = pagesize     # nominal number of items per page

    def getslice(self, start=0, end=None):
        res = []
        start_page = start // self._pagesize
        end_page = (
            self._pagecount if end is None else (end // self._pagesize + 1))
        skip_elems = start - start_page * self._pagesize
        only_more = None if end is None else end - start
        for pagenum in range(start_page, end_page):
            page = list(self._pagefunc(pagenum))
            if skip_elems:
                # Only the first fetched page may need its head trimmed.
                page = page[skip_elems:]
                skip_elems = None
            if only_more is not None:
                if len(page) < only_more:
                    only_more -= len(page)
                else:
                    # This page contains the last requested element.
                    page = page[:only_more]
                    res.extend(page)
                    break
            res.extend(page)
        return res
def uppercase_escape(s):
    """Decode literal '\\UXXXXXXXX' escapes (uppercase U) found in `s`.

    Only the 8-hex-digit uppercase form is handled; everything else is left
    untouched, so already-decoded text passes through unchanged.
    """
    unicode_escape = codecs.getdecoder('unicode_escape')
    return re.sub(
        r'\\U[0-9a-fA-F]{8}',
        lambda m: unicode_escape(m.group(0))[0],
        s)
def escape_rfc3986(s):
    """Percent-escape non-ASCII characters as suggested by RFC 3986."""
    # Python 2's quote() cannot handle unicode input, so pre-encode there.
    if sys.version_info < (3, 0) and isinstance(s, compat_str):
        s = s.encode('utf-8')
    safe_chars = b"%/;:@&=+$,!~*'()?#[]"
    return compat_urllib_parse.quote(s, safe_chars)
def escape_url(url):
    """Escape URL as suggested by RFC 3986"""
    url_parsed = compat_urllib_parse_urlparse(url)
    # Escape each component separately so reserved delimiters (/, ?, #, &)
    # that structure the URL are preserved.
    return url_parsed._replace(
        path=escape_rfc3986(url_parsed.path),
        params=escape_rfc3986(url_parsed.params),
        query=escape_rfc3986(url_parsed.query),
        fragment=escape_rfc3986(url_parsed.fragment)
    ).geturl()
try:
    # Probe whether struct accepts a text (str) format specification.
    struct.pack('!I', 0)
except TypeError:
    # In Python 2.6 (and some 2.7 versions), struct requires a bytes argument
    def struct_pack(spec, *args):
        if isinstance(spec, compat_str):
            spec = spec.encode('ascii')
        return struct.pack(spec, *args)

    def struct_unpack(spec, *args):
        if isinstance(spec, compat_str):
            spec = spec.encode('ascii')
        return struct.unpack(spec, *args)
else:
    # Modern interpreters: use struct directly.
    struct_pack = struct.pack
    struct_unpack = struct.unpack
def read_batch_urls(batch_fd):
    """Read a batch file object and return its list of URLs.

    Lines are decoded as UTF-8 (with replacement), a leading UTF-8 BOM is
    stripped, and blank lines or comment lines starting with '#', ';' or ']'
    are dropped. The file object is closed afterwards.
    """
    def fixup(url):
        if not isinstance(url, compat_str):
            url = url.decode('utf-8', 'replace')
        BOM_UTF8 = '\xef\xbb\xbf'
        if url.startswith(BOM_UTF8):
            url = url[len(BOM_UTF8):]
        url = url.strip()
        if url.startswith(('#', ';', ']')):
            return False
        return url

    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]
def urlencode_postdata(*args, **kargs):
    """URL-encode POST data and return it as ASCII bytes."""
    encoded = compat_urllib_parse.urlencode(*args, **kargs)
    return encoded.encode('ascii')
try:
    # ElementTree gained Element.iter in Python 2.7 / 3.2.
    etree_iter = xml.etree.ElementTree.Element.iter
except AttributeError:  # Python <=2.6
    etree_iter = lambda n: n.findall('.//*')


def parse_xml(s):
    """Parse the XML document in string `s`, ignoring any DOCTYPE.

    On Python 2 the element text is normalized to unicode.
    """
    class TreeBuilder(xml.etree.ElementTree.TreeBuilder):
        def doctype(self, name, pubid, system):
            pass  # Ignore doctypes

    parser = xml.etree.ElementTree.XMLParser(target=TreeBuilder())
    kwargs = {'parser': parser} if sys.version_info >= (2, 7) else {}
    tree = xml.etree.ElementTree.XML(s.encode('utf-8'), **kwargs)
    # Fix up XML parser in Python 2.x
    if sys.version_info < (3, 0):
        for n in etree_iter(tree):
            if n.text is not None:
                if not isinstance(n.text, compat_str):
                    n.text = n.text.decode('utf-8')
    return tree
def parse_age_limit(s):
    """Parse an age limit like '18' or '18+' into an int.

    Falls back to the US content-rating table for strings such as 'R';
    returns None for None input or unrecognized values.
    """
    if s is None:
        return None
    m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
    return int(m.group('age')) if m else US_RATINGS.get(s, None)
def strip_jsonp(code):
    """Strip a JSONP wrapper ('callback({...});') down to the bare JSON payload."""
    return re.sub(
        r'(?s)^[a-zA-Z0-9_]+\s*\(\s*(.*)\);?\s*?(?://[^\n]*)*$', r'\1', code)
def js_to_json(code):
    """Convert a JavaScript object/array literal into valid JSON text.

    Bare identifiers and single-quoted strings become double-quoted JSON
    strings; true/false/null and double-quoted strings pass through; a
    trailing comma before ']' is removed.
    """
    def fix_kv(m):
        v = m.group(0)
        if v in ('true', 'false', 'null'):
            return v
        if v.startswith('"'):
            return v
        if v.startswith("'"):
            v = v[1:-1]
            # Translate JS single-quote escaping into JSON double-quote escaping.
            v = re.sub(r"\\\\|\\'|\"", lambda m: {
                '\\\\': '\\\\',
                "\\'": "'",
                '"': '\\"',
            }[m.group(0)], v)
        return '"%s"' % v

    res = re.sub(r'''(?x)
        "(?:[^"\\]*(?:\\\\|\\['"nu]))*[^"\\]*"|
        '(?:[^'\\]*(?:\\\\|\\['"nu]))*[^'\\]*'|
        [a-zA-Z_][.a-zA-Z_0-9]*
        ''', fix_kv, code)
    res = re.sub(r',(\s*\])', lambda m: m.group(1), res)
    return res
def qualities(quality_ids):
    """ Get a numeric quality value out of a list of possible values """
    def q(qid):
        try:
            # Position in the list is the quality rank (higher index = better).
            return quality_ids.index(qid)
        except ValueError:
            # Unknown quality id ranks below everything known.
            return -1
    return q
1572 DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
def limit_length(s, length):
    """ Add ellipses to overly long strings """
    if s is None:
        return None
    ELLIPSES = '...'
    if len(s) > length:
        # Truncate so the result, including the ellipses, fits in `length`.
        return s[:length - len(ELLIPSES)] + ELLIPSES
    return s
def version_tuple(v):
    """Split a dotted/dashed version string into a tuple of ints."""
    components = re.split(r'[-.]', v)
    return tuple(map(int, components))
def is_outdated_version(version, limit, assume_new=True):
    """Return True if `version` is strictly older than `limit`.

    For empty or unparsable versions the answer falls back to
    `not assume_new` (i.e. by default an unknown version is trusted as new).
    """
    if not version:
        return not assume_new
    try:
        return version_tuple(version) < version_tuple(limit)
    except ValueError:
        return not assume_new
def ytdl_is_updateable():
    """ Returns if youtube-dl can be updated with -U """
    from zipimport import zipimporter

    loader = globals().get('__loader__')
    return isinstance(loader, zipimporter) or hasattr(sys, 'frozen')
def args_to_str(args):
    # Get a short string representation for a subprocess command
    quoted = [shlex_quote(a) for a in args]
    return ' '.join(quoted)
def mimetype2ext(mt):
    """Map a MIME type (e.g. 'video/x-mp4-fragmented') to a file extension.

    Unknown subtypes are returned verbatim (e.g. 'video/mp4' -> 'mp4').
    """
    _, _, res = mt.rpartition('/')

    return {
        # NOTE(review): the source extraction elided part of this mapping
        # table; only the entry below is visible — verify against upstream
        # and re-add any missing special cases.
        'x-mp4-fragmented': 'mp4',
    }.get(res, res)
def urlhandle_detect_ext(url_handle):
    """Guess the file extension for a urllib response object.

    Prefers the filename in the Content-Disposition header, falling back to
    the Content-Type MIME type.
    """
    try:
        url_handle.headers
        getheader = lambda h: url_handle.headers[h]
    except AttributeError:  # Python < 3
        getheader = url_handle.info().getheader

    cd = getheader('Content-Disposition')
    if cd:
        m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
        if m:
            e = determine_ext(m.group('filename'), default_ext=None)
            if e:
                return e

    return mimetype2ext(getheader('Content-Type'))
def age_restricted(content_limit, age_limit):
    """ Returns True iff the content should be blocked """

    if age_limit is None:  # No limit set
        return False
    if content_limit is None:
        return False  # Content available for everyone
    return age_limit < content_limit
def is_html(first_bytes):
    """ Detect whether a file contains HTML by examining its first bytes. """
    # Decode according to a recognized BOM, if any; otherwise assume UTF-8.
    BOMS = [
        (b'\xef\xbb\xbf', 'utf-8'),
        (b'\x00\x00\xfe\xff', 'utf-32-be'),
        (b'\xff\xfe\x00\x00', 'utf-32-le'),
        (b'\xff\xfe', 'utf-16-le'),
        (b'\xfe\xff', 'utf-16-be'),
    ]
    for bom, enc in BOMS:
        if first_bytes.startswith(bom):
            s = first_bytes[len(bom):].decode(enc, 'replace')
            break
    else:
        s = first_bytes.decode('utf-8', 'replace')

    return re.match(r'^\s*<', s)
def determine_protocol(info_dict):
    """Determine the download protocol for an info dict.

    An explicit 'protocol' key wins; otherwise the protocol is inferred from
    the URL prefix, then the file extension, then the URL scheme.
    """
    protocol = info_dict.get('protocol')
    if protocol is not None:
        return protocol

    url = info_dict['url']
    if url.startswith('rtmp'):
        return 'rtmp'
    elif url.startswith('mms'):
        return 'mms'
    elif url.startswith('rtsp'):
        return 'rtsp'

    ext = determine_ext(url)
    if ext == 'm3u8':
        return 'm3u8'
    elif ext == 'f4m':
        return 'f4m'

    return compat_urllib_parse_urlparse(url).scheme
def render_table(header_row, data):
    """ Render a list of rows, each as a list of values """
    rows = [header_row] + data
    # Widest cell per column determines that column's width.
    widths = [max(len(compat_str(cell)) for cell in column) for column in zip(*rows)]
    # Left-align every column but pad each (except the last) by one space.
    fmt = ' '.join('%-' + compat_str(w + 1) + 's' for w in widths[:-1]) + '%s'
    return '\n'.join(fmt % tuple(row) for row in rows)
1697 def _match_one(filter_part, dct):
1698 COMPARISON_OPERATORS = {
1706 operator_rex = re.compile(r'''(?x)\s*
1708 \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
1710 (?P<intval>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)|
1711 (?P<strval>(?![0-9.])[a-z0-9A-Z]*)
1714 ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
1715 m = operator_rex.search(filter_part)
1717 op = COMPARISON_OPERATORS[m.group('op')]
1718 if m.group('strval') is not None:
1719 if m.group('op') not in ('=', '!='):
1721 'Operator %s does not support string values!' % m.group('op'))
1722 comparison_value = m.group('strval')
1725 comparison_value = int(m.group('intval'))
1727 comparison_value = parse_filesize(m.group('intval'))
1728 if comparison_value is None:
1729 comparison_value = parse_filesize(m.group('intval') + 'B')
1730 if comparison_value is None:
1732 'Invalid integer value %r in filter part %r' % (
1733 m.group('intval'), filter_part))
1734 actual_value = dct.get(m.group('key'))
1735 if actual_value is None:
1736 return m.group('none_inclusive')
1737 return op(actual_value, comparison_value)
1740 '': lambda v: v is not None,
1741 '!': lambda v: v is None,
1743 operator_rex = re.compile(r'''(?x)\s*
1744 (?P<op>%s)\s*(?P<key>[a-z_]+)
1746 ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
1747 m = operator_rex.search(filter_part)
1749 op = UNARY_OPERATORS[m.group('op')]
1750 actual_value = dct.get(m.group('key'))
1751 return op(actual_value)
1753 raise ValueError('Invalid filter part %r' % filter_part)
def match_str(filter_str, dct):
    """ Filter a dictionary with a simple string syntax. Returns True (=passes filter) or false """

    # '&' separates sub-expressions; all of them must hold.
    return all(
        _match_one(filter_part, dct) for filter_part in filter_str.split('&'))
def match_filter_func(filter_str):
    """Build a match-filter callable from `filter_str`.

    The returned function yields None when the video passes the filter, or a
    human-readable "skipping" message otherwise (the convention expected by
    the downloader's match_filter hook).
    """
    def _match_func(info_dict):
        if match_str(filter_str, info_dict):
            return None
        else:
            video_title = info_dict.get('title', info_dict.get('id', 'video'))
            return '%s does not pass filter %s, skipping ..' % (video_title, filter_str)
    return _match_func
class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
    """ProxyHandler that honors a per-request 'Ytdl-request-proxy' header,
    overriding any default proxy for that single request."""

    def __init__(self, proxies=None):
        # Set default handlers
        for type in ('http', 'https'):
            setattr(self, '%s_open' % type,
                    lambda r, proxy='__noproxy__', type=type, meth=self.proxy_open:
                        meth(r, proxy, type))
        return compat_urllib_request.ProxyHandler.__init__(self, proxies)

    def proxy_open(self, req, proxy, type):
        req_proxy = req.headers.get('Ytdl-request-proxy')
        if req_proxy is not None:
            proxy = req_proxy
            # Strip the internal header before the request goes out.
            del req.headers['Ytdl-request-proxy']

        if proxy == '__noproxy__':
            return None  # No Proxy
        return compat_urllib_request.ProxyHandler.proxy_open(
            self, req, proxy, type)
def url_sanitize_consecutive_slashes(url):
    """Collapse runs of consecutive slashes in the path component of a URL.

    For example, both
        http://hostname/foo//bar/filename.html
    and
        http://hostname//foo/bar/filename.html
    become
        http://hostname/foo/bar/filename.html
    """
    parts = list(compat_urlparse.urlparse(url))
    parts[2] = re.sub(r'/{2,}', '/', parts[2])  # index 2 is the path
    return compat_urlparse.urlunparse(parts)