2 # -*- coding: utf-8 -*-
4 from __future__ import unicode_literals
31 import xml.etree.ElementTree
40 compat_socket_create_connection,
44 compat_urllib_parse_urlparse,
45 compat_urllib_request,
51 # This is not clearly defined otherwise
52 compiled_regex_type = type(re.compile(''))
55 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0 (Chrome)',
56 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
57 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
58 'Accept-Encoding': 'gzip, deflate',
59 'Accept-Language': 'en-us,en;q=0.5',
def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks.
    """
    try:
        pref = locale.getpreferredencoding()
        # Verify the reported codec is actually usable before trusting it.
        'TEST'.encode(pref)
    except Exception:
        pref = 'UTF-8'

    return pref
78 def write_json_file(obj, fn):
79 """ Encode obj as JSON and write it to fn, atomically if possible """
81 fn = encodeFilename(fn)
82 if sys.version_info < (3, 0) and sys.platform != 'win32':
83 encoding = get_filesystem_encoding()
84 # os.path.basename returns a bytes object, but NamedTemporaryFile
85 # will fail if the filename contains non ascii characters unless we
86 # use a unicode object
87 path_basename = lambda f: os.path.basename(fn).decode(encoding)
88 # the same for os.path.dirname
89 path_dirname = lambda f: os.path.dirname(fn).decode(encoding)
91 path_basename = os.path.basename
92 path_dirname = os.path.dirname
96 'prefix': path_basename(fn) + '.',
97 'dir': path_dirname(fn),
101 # In Python 2.x, json.dump expects a bytestream.
102 # In Python 3.x, it writes to a character stream
103 if sys.version_info < (3, 0):
111 tf = tempfile.NamedTemporaryFile(**args)
116 if sys.platform == 'win32':
117 # Need to remove existing file on Windows, else os.rename raises
118 # WindowsError or FileExistsError.
123 os.rename(tf.name, fn)
132 if sys.version_info >= (2, 7):
133 def find_xpath_attr(node, xpath, key, val):
134 """ Find the xpath xpath[@key=val] """
135 assert re.match(r'^[a-zA-Z-]+$', key)
136 assert re.match(r'^[a-zA-Z0-9@\s:._-]*$', val)
137 expr = xpath + "[@%s='%s']" % (key, val)
138 return node.find(expr)
140 def find_xpath_attr(node, xpath, key, val):
141 # Here comes the crazy part: In 2.6, if the xpath is a unicode,
142 # .//node does not match if a node is a direct child of . !
143 if isinstance(xpath, unicode):
144 xpath = xpath.encode('ascii')
146 for f in node.findall(xpath):
147 if f.attrib.get(key) == val:
151 # On python2.6 the xml.etree.ElementTree.Element methods don't support
152 # the namespace parameter
155 def xpath_with_ns(path, ns_map):
156 components = [c.split(':') for c in path.split('/')]
160 replaced.append(c[0])
163 replaced.append('{%s}%s' % (ns_map[ns], tag))
164 return '/'.join(replaced)
167 def xpath_text(node, xpath, name=None, fatal=False):
168 if sys.version_info < (2, 7): # Crazy 2.6
169 xpath = xpath.encode('ascii')
172 if n is None or n.text is None:
174 name = xpath if name is None else name
175 raise ExtractorError('Could not find XML element %s' % name)
def get_element_by_id(id, html):
    """Return the content of the tag with the specified ID in the passed HTML document"""
    # An id lookup is just an attribute lookup with attribute == 'id'.
    return get_element_by_attribute('id', id, html)
186 def get_element_by_attribute(attribute, value, html):
187 """Return the content of the tag with the specified attribute in the passed HTML document"""
189 m = re.search(r'''(?xs)
191 (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
193 (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
197 ''' % (re.escape(attribute), re.escape(value)), html)
201 res = m.group('content')
203 if res.startswith('"') or res.startswith("'"):
206 return unescapeHTML(res)
def clean_html(html):
    """Clean an HTML snippet into a readable string"""

    if html is None:  # Convenience for sanitizing descriptions etc.
        return html

    # Collapse real newlines; <br> and </p><p> boundaries become the
    # newlines of the cleaned text instead.
    html = html.replace('\n', ' ')
    html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
    html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
    # Strip any remaining tags wholesale.
    html = re.sub('<.*?>', '', html)
    # Replace html entities
    html = unescapeHTML(html)
    return html.strip()
226 def sanitize_open(filename, open_mode):
227 """Try to open the given filename, and slightly tweak it if this fails.
229 Attempts to open the given filename. If this fails, it tries to change
230 the filename slightly, step by step, until it's either able to open it
231 or it fails and raises a final exception, like the standard open()
234 It returns the tuple (stream, definitive_file_name).
238 if sys.platform == 'win32':
240 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
241 return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
242 stream = open(encodeFilename(filename), open_mode)
243 return (stream, filename)
244 except (IOError, OSError) as err:
245 if err.errno in (errno.EACCES,):
248 # In case of error, try to remove win32 forbidden chars
249 alt_filename = os.path.join(
250 re.sub('[/<>:"\\|\\\\?\\*]', '#', path_part)
251 for path_part in os.path.split(filename)
253 if alt_filename == filename:
256 # An exception here should be caught in the caller
257 stream = open(encodeFilename(filename), open_mode)
258 return (stream, alt_filename)
def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp"""
    # None is returned when the string cannot be parsed at all.
    timestamp = None
    timetuple = email.utils.parsedate_tz(timestr)
    if timetuple is not None:
        timestamp = email.utils.mktime_tz(timetuple)
    return timestamp
270 def sanitize_filename(s, restricted=False, is_id=False):
271 """Sanitizes a string so it could be used as part of a filename.
272 If restricted is set, use a stricter subset of allowed characters.
273 Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
275 def replace_insane(char):
276 if char == '?' or ord(char) < 32 or ord(char) == 127:
279 return '' if restricted else '\''
281 return '_-' if restricted else ' -'
282 elif char in '\\/|*<>':
284 if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
286 if restricted and ord(char) > 127:
290 result = ''.join(map(replace_insane, s))
292 while '__' in result:
293 result = result.replace('__', '_')
294 result = result.strip('_')
295 # Common case of "Foreign band name - English song title"
296 if restricted and result.startswith('-_'):
def orderedSet(iterable):
    """ Remove all duplicates from the input iterable """
    # Keeps first-seen order; O(n^2) membership test is fine for the
    # small sequences this is used on.
    seen = []
    for item in iterable:
        if item not in seen:
            seen.append(item)
    return seen
def _htmlentity_transform(entity):
    """Transforms an HTML entity to a character."""
    # Known non-numeric HTML entity
    if entity in compat_html_entities.name2codepoint:
        return compat_chr(compat_html_entities.name2codepoint[entity])

    # Numeric character reference: decimal (&#38;) or hexadecimal (&#x26;).
    # The hex alternative must allow the digits a-f: the previous pattern
    # r'#(x?[0-9]+)' stopped at the first non-decimal digit, so e.g.
    # '#x1f600' matched only '#x1' and decoded the wrong character.
    mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
    if mobj is not None:
        numstr = mobj.group(1)
        if numstr.startswith('x'):
            base = 16
            numstr = '0%s' % numstr
        else:
            base = 10
        return compat_chr(int(numstr, base))

    # Unknown entity in name, return its literal representation
    return ('&%s;' % entity)
335 assert type(s) == compat_str
338 r'&([^;]+);', lambda m: _htmlentity_transform(m.group(1)), s)
341 def encodeFilename(s, for_subprocess=False):
343 @param s The name of the file
346 assert type(s) == compat_str
348 # Python 3 has a Unicode API
349 if sys.version_info >= (3, 0):
352 if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
353 # Pass '' directly to use Unicode APIs on Windows 2000 and up
354 # (Detecting Windows NT 4 is tricky because 'major >= 4' would
355 # match Windows 9x series as well. Besides, NT 4 is obsolete.)
356 if not for_subprocess:
359 # For subprocess calls, encode with locale encoding
360 # Refer to http://stackoverflow.com/a/9951851/35070
361 encoding = preferredencoding()
363 encoding = sys.getfilesystemencoding()
366 return s.encode(encoding, 'ignore')
def encodeArgument(s):
    """Encode a subprocess argument the same way file names are encoded."""
    if isinstance(s, compat_str):
        return encodeFilename(s, True)
    # Legacy code that uses byte strings
    # Uncomment the following line after fixing all post processors
    # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
    return encodeFilename(s.decode('ascii'), True)
def decodeOption(optval):
    """Decode a command-line option value to compat_str (None passes through)."""
    if optval is None:
        return optval
    if isinstance(optval, bytes):
        optval = optval.decode(preferredencoding())

    assert isinstance(optval, compat_str)
    return optval
def formatSeconds(secs):
    """Render a duration in seconds as 'H:MM:SS', 'M:SS' or plain seconds."""
    if secs > 3600:
        return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
    elif secs > 60:
        return '%d:%02d' % (secs // 60, secs % 60)
    else:
        return '%d' % secs
397 def make_HTTPS_handler(params, **kwargs):
398 opts_no_check_certificate = params.get('nocheckcertificate', False)
399 if hasattr(ssl, 'create_default_context'): # Python >= 3.4 or 2.7.9
400 context = ssl.create_default_context(ssl.Purpose.CLIENT_AUTH)
401 if opts_no_check_certificate:
402 context.verify_mode = ssl.CERT_NONE
404 return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
407 # (create_default_context present but HTTPSHandler has no context=)
410 if sys.version_info < (3, 2):
413 class HTTPSConnectionV3(httplib.HTTPSConnection):
414 def __init__(self, *args, **kwargs):
415 httplib.HTTPSConnection.__init__(self, *args, **kwargs)
418 sock = socket.create_connection((self.host, self.port), self.timeout)
419 if getattr(self, '_tunnel_host', False):
423 self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_TLSv1)
425 self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_SSLv23)
427 return YoutubeDLHTTPSHandler(params, https_conn_class=HTTPSConnectionV3, **kwargs)
429 context = ssl.SSLContext(ssl.PROTOCOL_SSLv23)
430 context.verify_mode = (ssl.CERT_NONE
431 if opts_no_check_certificate
432 else ssl.CERT_REQUIRED)
433 context.set_default_verify_paths()
434 return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
437 class ExtractorError(Exception):
438 """Error during info extraction."""
440 def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
441 """ tb, if given, is the original traceback (so that it can be printed out).
442 If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
445 if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
447 if video_id is not None:
448 msg = video_id + ': ' + msg
450 msg += ' (caused by %r)' % cause
452 if ytdl_is_updateable():
453 update_cmd = 'type youtube-dl -U to update'
455 update_cmd = 'see https://yt-dl.org/update on how to update'
456 msg += '; please report this issue on https://yt-dl.org/bug .'
457 msg += ' Make sure you are using the latest version; %s.' % update_cmd
458 msg += ' Be sure to call youtube-dl with the --verbose flag and include its complete output.'
459 super(ExtractorError, self).__init__(msg)
462 self.exc_info = sys.exc_info() # preserve original exception
464 self.video_id = video_id
466 def format_traceback(self):
467 if self.traceback is None:
469 return ''.join(traceback.format_tb(self.traceback))
class UnsupportedError(ExtractorError):
    """Raised for URLs that no extractor knows how to handle."""

    def __init__(self, url):
        # expected=True: this is a normal condition, not a youtube-dl bug.
        super(UnsupportedError, self).__init__(
            'Unsupported URL: %s' % url, expected=True)
class RegexNotFoundError(ExtractorError):
    """Raised when an expected regular expression fails to match."""
    pass
class DownloadError(Exception):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """

    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        super(DownloadError, self).__init__(msg)
        # Preserved so callers can re-raise or inspect the root cause.
        self.exc_info = exc_info
class SameFileError(Exception):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
    pass
class PostProcessingError(Exception):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """

    def __init__(self, msg):
        # Message is kept on .msg for callers that display it.
        self.msg = msg
class MaxDownloadsReached(Exception):
    """Signals that the --max-downloads limit has been reached."""
    pass
class UnavailableVideoError(Exception):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """
    pass
class ContentTooShortError(Exception):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """

    def __init__(self, downloaded, expected):
        # Both sizes are byte counts.
        self.downloaded = downloaded
        self.expected = expected
548 def _create_http_connection(ydl_handler, http_class, is_https=False, *args, **kwargs):
549 hc = http_class(*args, **kwargs)
550 source_address = ydl_handler._params.get('source_address')
551 if source_address is not None:
552 sa = (source_address, 0)
553 if hasattr(hc, 'source_address'): # Python 2.7+
554 hc.source_address = sa
556 def _hc_connect(self, *args, **kwargs):
557 sock = compat_socket_create_connection(
558 (self.host, self.port), self.timeout, sa)
560 self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file)
563 hc.connect = functools.partial(_hc_connect, hc)
568 class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
569 """Handler for HTTP requests and responses.
571 This class, when installed with an OpenerDirector, automatically adds
572 the standard headers to every HTTP request and handles gzipped and
573 deflated responses from web servers. If compression is to be avoided in
574 a particular request, the original request in the program code only has
575 to include the HTTP header "Youtubedl-No-Compression", which will be
576 removed before making the real request.
578 Part of this code was copied from:
580 http://techknack.net/python-urllib2-handlers/
582 Andrew Rowls, the author of that code, agreed to release it to the
586 def __init__(self, params, *args, **kwargs):
587 compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
588 self._params = params
590 def http_open(self, req):
591 return self.do_open(functools.partial(
592 _create_http_connection, self, compat_http_client.HTTPConnection),
598 return zlib.decompress(data, -zlib.MAX_WBITS)
600 return zlib.decompress(data)
603 def addinfourl_wrapper(stream, headers, url, code):
604 if hasattr(compat_urllib_request.addinfourl, 'getcode'):
605 return compat_urllib_request.addinfourl(stream, headers, url, code)
606 ret = compat_urllib_request.addinfourl(stream, headers, url)
610 def http_request(self, req):
611 for h, v in std_headers.items():
612 if h not in req.headers:
614 if 'Youtubedl-no-compression' in req.headers:
615 if 'Accept-encoding' in req.headers:
616 del req.headers['Accept-encoding']
617 del req.headers['Youtubedl-no-compression']
618 if 'Youtubedl-user-agent' in req.headers:
619 if 'User-agent' in req.headers:
620 del req.headers['User-agent']
621 req.headers['User-agent'] = req.headers['Youtubedl-user-agent']
622 del req.headers['Youtubedl-user-agent']
624 if sys.version_info < (2, 7) and '#' in req.get_full_url():
625 # Python 2.6 is brain-dead when it comes to fragments
626 req._Request__original = req._Request__original.partition('#')[0]
627 req._Request__r_type = req._Request__r_type.partition('#')[0]
631 def http_response(self, req, resp):
634 if resp.headers.get('Content-encoding', '') == 'gzip':
635 content = resp.read()
636 gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
638 uncompressed = io.BytesIO(gz.read())
639 except IOError as original_ioerror:
640 # There may be junk add the end of the file
641 # See http://stackoverflow.com/q/4928560/35070 for details
642 for i in range(1, 1024):
644 gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
645 uncompressed = io.BytesIO(gz.read())
650 raise original_ioerror
651 resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
652 resp.msg = old_resp.msg
654 if resp.headers.get('Content-encoding', '') == 'deflate':
655 gz = io.BytesIO(self.deflate(resp.read()))
656 resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
657 resp.msg = old_resp.msg
660 https_request = http_request
661 https_response = http_response
664 class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
665 def __init__(self, params, https_conn_class=None, *args, **kwargs):
666 compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
667 self._https_conn_class = https_conn_class or compat_http_client.HTTPSConnection
668 self._params = params
670 def https_open(self, req):
671 return self.do_open(functools.partial(
672 _create_http_connection, self, self._https_conn_class, True),
676 def parse_iso8601(date_str, delimiter='T'):
677 """ Return a UNIX timestamp from the given date """
683 r'(\.[0-9]+)?(?:Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
686 timezone = datetime.timedelta()
688 date_str = date_str[:-len(m.group(0))]
689 if not m.group('sign'):
690 timezone = datetime.timedelta()
692 sign = 1 if m.group('sign') == '+' else -1
693 timezone = datetime.timedelta(
694 hours=sign * int(m.group('hours')),
695 minutes=sign * int(m.group('minutes')))
696 date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
697 dt = datetime.datetime.strptime(date_str, date_format) - timezone
698 return calendar.timegm(dt.timetuple())
701 def unified_strdate(date_str, day_first=True):
702 """Return a string with the date in the format YYYYMMDD"""
708 date_str = date_str.replace(',', ' ')
709 # %z (UTC offset) is only supported in python>=3.2
710 date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str)
711 # Remove AM/PM + timezone
712 date_str = re.sub(r'(?i)\s*(?:AM|PM)\s+[A-Z]+', '', date_str)
714 format_expressions = [
719 '%b %dst %Y %I:%M%p',
720 '%b %dnd %Y %I:%M%p',
721 '%b %dth %Y %I:%M%p',
726 '%Y-%m-%d %H:%M:%S.%f',
729 '%Y-%m-%dT%H:%M:%SZ',
730 '%Y-%m-%dT%H:%M:%S.%fZ',
731 '%Y-%m-%dT%H:%M:%S.%f0Z',
733 '%Y-%m-%dT%H:%M:%S.%f',
737 format_expressions.extend([
744 format_expressions.extend([
750 for expression in format_expressions:
752 upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
755 if upload_date is None:
756 timetuple = email.utils.parsedate_tz(date_str)
758 upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
def determine_ext(url, default_ext='unknown_video'):
    """Guess a file extension from a URL; fall back to default_ext."""
    if url is None:
        return default_ext
    # Drop the query string, then take everything after the last dot.
    guess = url.partition('?')[0].rpartition('.')[2]
    # Only purely alphanumeric tails look like real extensions.
    return guess if re.match(r'^[A-Za-z0-9]+$', guess) else default_ext
def subtitles_filename(filename, sub_lang, sub_format):
    """Derive the subtitle file name: <base>.<lang>.<format>."""
    base = filename.rsplit('.', 1)[0]
    return '.'.join((base, sub_lang, sub_format))
776 def date_from_str(date_str):
778 Return a datetime object from a string in the format YYYYMMDD or
779 (now|today)[+-][0-9](day|week|month|year)(s)?"""
780 today = datetime.date.today()
781 if date_str in ('now', 'today'):
783 if date_str == 'yesterday':
784 return today - datetime.timedelta(days=1)
785 match = re.match('(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
786 if match is not None:
787 sign = match.group('sign')
788 time = int(match.group('time'))
791 unit = match.group('unit')
792 # A bad aproximation?
800 delta = datetime.timedelta(**{unit: time})
802 return datetime.datetime.strptime(date_str, "%Y%m%d").date()
def hyphenate_date(date_str):
    """
    Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
    match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
    # Anything that is not exactly eight digits passes through unchanged.
    return '-'.join(match.groups()) if match is not None else date_str
815 class DateRange(object):
816 """Represents a time interval between two dates"""
818 def __init__(self, start=None, end=None):
819 """start and end must be strings in the format accepted by date"""
820 if start is not None:
821 self.start = date_from_str(start)
823 self.start = datetime.datetime.min.date()
825 self.end = date_from_str(end)
827 self.end = datetime.datetime.max.date()
828 if self.start > self.end:
829 raise ValueError('Date range: "%s" , the start date must be before the end date' % self)
833 """Returns a range that only contains the given day"""
836 def __contains__(self, date):
837 """Check if the date is in the range"""
838 if not isinstance(date, datetime.date):
839 date = date_from_str(date)
840 return self.start <= date <= self.end
843 return '%s - %s' % (self.start.isoformat(), self.end.isoformat())
847 """ Returns the platform name as a compat_str """
848 res = platform.platform()
849 if isinstance(res, bytes):
850 res = res.decode(preferredencoding())
852 assert isinstance(res, compat_str)
856 def _windows_write_string(s, out):
857 """ Returns True if the string was written using special methods,
858 False if it has yet to be written out."""
859 # Adapted from http://stackoverflow.com/a/3259271/35070
862 import ctypes.wintypes
870 fileno = out.fileno()
871 except AttributeError:
872 # If the output stream doesn't have a fileno, it's virtual
874 if fileno not in WIN_OUTPUT_IDS:
877 GetStdHandle = ctypes.WINFUNCTYPE(
878 ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
879 (b"GetStdHandle", ctypes.windll.kernel32))
880 h = GetStdHandle(WIN_OUTPUT_IDS[fileno])
882 WriteConsoleW = ctypes.WINFUNCTYPE(
883 ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
884 ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
885 ctypes.wintypes.LPVOID)((b"WriteConsoleW", ctypes.windll.kernel32))
886 written = ctypes.wintypes.DWORD(0)
888 GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)((b"GetFileType", ctypes.windll.kernel32))
889 FILE_TYPE_CHAR = 0x0002
890 FILE_TYPE_REMOTE = 0x8000
891 GetConsoleMode = ctypes.WINFUNCTYPE(
892 ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
893 ctypes.POINTER(ctypes.wintypes.DWORD))(
894 (b"GetConsoleMode", ctypes.windll.kernel32))
895 INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value
897 def not_a_console(handle):
898 if handle == INVALID_HANDLE_VALUE or handle is None:
900 return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR
901 or GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)
906 def next_nonbmp_pos(s):
908 return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
909 except StopIteration:
913 count = min(next_nonbmp_pos(s), 1024)
916 h, s, count if count else 2, ctypes.byref(written), None)
918 raise OSError('Failed to write string')
919 if not count: # We just wrote a non-BMP character
920 assert written.value == 2
923 assert written.value > 0
924 s = s[written.value:]
928 def write_string(s, out=None, encoding=None):
931 assert type(s) == compat_str
933 if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
934 if _windows_write_string(s, out):
937 if ('b' in getattr(out, 'mode', '') or
938 sys.version_info[0] < 3): # Python 2 lies about mode of sys.stderr
939 byt = s.encode(encoding or preferredencoding(), 'ignore')
941 elif hasattr(out, 'buffer'):
942 enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
943 byt = s.encode(enc, 'ignore')
944 out.buffer.write(byt)
def bytes_to_intlist(bs):
    """Turn a byte string into a list of integer byte values."""
    if not bs:
        return []
    if isinstance(bs[0], int):  # Python 3
        return list(bs)
    else:
        # Python 2 byte strings iterate as 1-char strings.
        return [ord(c) for c in bs]
def intlist_to_bytes(xs):
    """Pack a list of integer byte values back into a byte string."""
    if not xs:
        return b''
    return struct_pack('%dB' % len(xs), *xs)
965 # Cross-platform file locking
966 if sys.platform == 'win32':
967 import ctypes.wintypes
970 class OVERLAPPED(ctypes.Structure):
972 ('Internal', ctypes.wintypes.LPVOID),
973 ('InternalHigh', ctypes.wintypes.LPVOID),
974 ('Offset', ctypes.wintypes.DWORD),
975 ('OffsetHigh', ctypes.wintypes.DWORD),
976 ('hEvent', ctypes.wintypes.HANDLE),
979 kernel32 = ctypes.windll.kernel32
980 LockFileEx = kernel32.LockFileEx
981 LockFileEx.argtypes = [
982 ctypes.wintypes.HANDLE, # hFile
983 ctypes.wintypes.DWORD, # dwFlags
984 ctypes.wintypes.DWORD, # dwReserved
985 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
986 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
987 ctypes.POINTER(OVERLAPPED) # Overlapped
989 LockFileEx.restype = ctypes.wintypes.BOOL
990 UnlockFileEx = kernel32.UnlockFileEx
991 UnlockFileEx.argtypes = [
992 ctypes.wintypes.HANDLE, # hFile
993 ctypes.wintypes.DWORD, # dwReserved
994 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
995 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
996 ctypes.POINTER(OVERLAPPED) # Overlapped
998 UnlockFileEx.restype = ctypes.wintypes.BOOL
999 whole_low = 0xffffffff
1000 whole_high = 0x7fffffff
1002 def _lock_file(f, exclusive):
1003 overlapped = OVERLAPPED()
1004 overlapped.Offset = 0
1005 overlapped.OffsetHigh = 0
1006 overlapped.hEvent = 0
1007 f._lock_file_overlapped_p = ctypes.pointer(overlapped)
1008 handle = msvcrt.get_osfhandle(f.fileno())
1009 if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
1010 whole_low, whole_high, f._lock_file_overlapped_p):
1011 raise OSError('Locking file failed: %r' % ctypes.FormatError())
1013 def _unlock_file(f):
1014 assert f._lock_file_overlapped_p
1015 handle = msvcrt.get_osfhandle(f.fileno())
1016 if not UnlockFileEx(handle, 0,
1017 whole_low, whole_high, f._lock_file_overlapped_p):
1018 raise OSError('Unlocking file failed: %r' % ctypes.FormatError())
1023 def _lock_file(f, exclusive):
1024 fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)
1026 def _unlock_file(f):
1027 fcntl.flock(f, fcntl.LOCK_UN)
1030 class locked_file(object):
1031 def __init__(self, filename, mode, encoding=None):
1032 assert mode in ['r', 'a', 'w']
1033 self.f = io.open(filename, mode, encoding=encoding)
1036 def __enter__(self):
1037 exclusive = self.mode != 'r'
1039 _lock_file(self.f, exclusive)
1045 def __exit__(self, etype, value, traceback):
1047 _unlock_file(self.f)
1054 def write(self, *args):
1055 return self.f.write(*args)
1057 def read(self, *args):
1058 return self.f.read(*args)
def get_filesystem_encoding():
    """Return sys.getfilesystemencoding(), defaulting to 'utf-8' when unset."""
    enc = sys.getfilesystemencoding()
    if enc is None:
        return 'utf-8'
    return enc
1066 def shell_quote(args):
1068 encoding = get_filesystem_encoding()
1070 if isinstance(a, bytes):
1071 # We may get a filename encoded with 'encodeFilename'
1072 a = a.decode(encoding)
1073 quoted_args.append(pipes.quote(a))
1074 return ' '.join(quoted_args)
1077 def takewhile_inclusive(pred, seq):
1078 """ Like itertools.takewhile, but include the latest evaluated element
1079 (the first element so that Not pred(e)) """
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use. """

    # Data rides along JSON-encoded in the fragment, which servers ignore.
    payload = compat_urllib_parse.urlencode(
        {'__youtubedl_smuggle': json.dumps(data)})
    return '%s#%s' % (url, payload)
def unsmuggle_url(smug_url, default=None):
    """Inverse of smuggle_url(): return (url, data) or (url, default)."""
    if '#__youtubedl_smuggle' not in smug_url:
        return smug_url, default
    url, _, sdata = smug_url.rpartition('#')
    jsond = compat_parse_qs(sdata)['__youtubedl_smuggle'][0]
    data = json.loads(jsond)
    return url, data
1103 def format_bytes(bytes):
1106 if type(bytes) is str:
1107 bytes = float(bytes)
1111 exponent = int(math.log(bytes, 1024.0))
1112 suffix = ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB'][exponent]
1113 converted = float(bytes) / float(1024 ** exponent)
1114 return '%.2f%s' % (converted, suffix)
1117 def parse_filesize(s):
1121 # The lower-case forms are of course incorrect and inofficial,
1122 # but we support those too
1160 units_re = '|'.join(re.escape(u) for u in _UNIT_TABLE)
1162 r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)' % units_re, s)
1166 num_str = m.group('num').replace(',', '.')
1167 mult = _UNIT_TABLE[m.group('unit')]
1168 return int(float(num_str) * mult)
1171 def get_term_width():
1172 columns = compat_getenv('COLUMNS', None)
1177 sp = subprocess.Popen(
1179 stdout=subprocess.PIPE, stderr=subprocess.PIPE)
1180 out, err = sp.communicate()
1181 return int(out.split()[1])
def month_by_name(name):
    """ Return the number of a month by (locale-independently) English name """
    months = (
        'January', 'February', 'March', 'April', 'May', 'June',
        'July', 'August', 'September', 'October', 'November', 'December')
    try:
        return months.index(name) + 1
    except ValueError:
        # Unknown (e.g. localized) month names yield None.
        return None
def fix_xml_ampersands(xml_str):
    """Replace all the '&' by '&amp;' in XML"""
    # Negative lookahead keeps already-escaped entities and numeric
    # character references (decimal and hex) untouched.
    return re.sub(
        r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
        '&amp;',
        xml_str)
1207 def setproctitle(title):
1208 assert isinstance(title, compat_str)
1210 libc = ctypes.cdll.LoadLibrary("libc.so.6")
1213 title_bytes = title.encode('utf-8')
1214 buf = ctypes.create_string_buffer(len(title_bytes))
1215 buf.value = title_bytes
1217 libc.prctl(15, buf, 0, 0, 0)
1218 except AttributeError:
1219 return # Strange libc, just skip this
def remove_start(s, start):
    """Return s with the leading prefix start removed when present."""
    return s[len(start):] if s.startswith(start) else s
def remove_end(s, end):
    """Return s with the trailing suffix end removed when present.

    The explicit truthiness check guards the empty-suffix case: every
    string endswith(''), and s[:-len('')] is s[:-0] == '', which would
    wrongly wipe the whole string.
    """
    if end and s.endswith(end):
        return s[:-len(end)]
    return s
def url_basename(url):
    """Return the final path component of url (query and fragment excluded)."""
    parsed = compat_urlparse.urlparse(url)
    return parsed.path.strip('/').split('/')[-1]
class HEADRequest(compat_urllib_request.Request):
    """A urllib Request that issues an HTTP HEAD instead of a GET."""

    def get_method(self):
        return 'HEAD'
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    """int(v) * invscale // scale, or default when v is None.

    When get_attr is given, the value is first fetched from that
    attribute of v.
    """
    if get_attr and v is not None:
        v = getattr(v, get_attr, None)
    if v is None:
        return default
    return int(v) * invscale // scale
def str_or_none(v, default=None):
    """compat_str(v), or default when v is None."""
    if v is None:
        return default
    return compat_str(v)
def str_to_int(int_str):
    """ A more relaxed version of int_or_none """
    if int_str is None:
        return None
    # Drop separators and a leading '+' before the conversion.
    int_str = re.sub(r'[,\.\+]', '', int_str)
    return int(int_str)
def float_or_none(v, scale=1, invscale=1, default=None):
    """float(v) * invscale / scale, or default when v is None."""
    if v is None:
        return default
    return float(v) * invscale / scale
1269 def parse_duration(s):
1278 (?P<only_mins>[0-9.]+)\s*(?:mins?|minutes?)\s*|
1279 (?P<only_hours>[0-9.]+)\s*(?:hours?)|
1282 (?:(?P<hours>[0-9]+)\s*(?:[:h]|hours?)\s*)?
1283 (?P<mins>[0-9]+)\s*(?:[:m]|mins?|minutes?)\s*
1285 (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*(?:s|secs?|seconds?)?
1290 if m.group('only_mins'):
1291 return float_or_none(m.group('only_mins'), invscale=60)
1292 if m.group('only_hours'):
1293 return float_or_none(m.group('only_hours'), invscale=60 * 60)
1295 res += int(m.group('secs'))
1297 res += int(m.group('mins')) * 60
1298 if m.group('hours'):
1299 res += int(m.group('hours')) * 60 * 60
1301 res += float(m.group('ms'))
def prepend_extension(filename, ext):
    """Insert ext before the existing extension: a.mp4 -> a.<ext>.mp4."""
    base, tail = os.path.splitext(filename)
    return '%s.%s%s' % (base, ext, tail)
def check_executable(exe, args=[]):
    """ Checks if the given binary is installed somewhere in PATH, and returns its name.
    args can be a list of arguments for a short output (like -version) """
    try:
        # Launch once and discard output; failure to spawn means "absent".
        subprocess.Popen([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
    except OSError:
        return False
    return exe
1320 def get_exe_version(exe, args=['--version'],
1321 version_re=None, unrecognized='present'):
1322 """ Returns the version of the specified executable,
1323 or False if the executable is not present """
1325 out, _ = subprocess.Popen(
1327 stdout=subprocess.PIPE, stderr=subprocess.STDOUT).communicate()
1330 if isinstance(out, bytes): # Python 2.x
1331 out = out.decode('ascii', 'ignore')
1332 return detect_exe_version(out, version_re, unrecognized)
1335 def detect_exe_version(output, version_re=None, unrecognized='present'):
1336 assert isinstance(output, compat_str)
1337 if version_re is None:
1338 version_re = r'version\s+([-0-9._a-zA-Z]+)'
1339 m = re.search(version_re, output)
1346 class PagedList(object):
1348 # This is only useful for tests
1349 return len(self.getslice())
class OnDemandPagedList(PagedList):
    # Paged list whose pages are fetched on demand by calling
    # pagefunc(pagenum); each page holds at most `pagesize` entries.
    def __init__(self, pagefunc, pagesize):
        self._pagefunc = pagefunc
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        # Walk pages beginning with the one containing `start`.
        for pagenum in itertools.count(start // self._pagesize):
            # Absolute indices covered by this page: [firstid, nextfirstid).
            firstid = pagenum * self._pagesize
            nextfirstid = pagenum * self._pagesize + self._pagesize
            if start >= nextfirstid:
            page_results = list(self._pagefunc(pagenum))
            # In-page offsets of the requested slice (only on boundary pages).
                start % self._pagesize
                if firstid <= start < nextfirstid
                ((end - 1) % self._pagesize) + 1
                if (end is not None and firstid <= end <= nextfirstid)
            if startv != 0 or endv is not None:
                page_results = page_results[startv:endv]
            res.extend(page_results)

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            if len(page_results) + startv < self._pagesize:

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == nextfirstid:
class InAdvancePagedList(PagedList):
    # Paged list where the total page count is known up front, so the
    # exact page range to fetch can be computed before iterating.
    def __init__(self, pagefunc, pagecount, pagesize):
        self._pagefunc = pagefunc
        self._pagecount = pagecount
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        # First page containing `start`.
        start_page = start // self._pagesize
        # Last page to visit (exclusive), bounded by the known page count.
            self._pagecount if end is None else (end // self._pagesize + 1))
        # Leading items on the first page that precede `start`.
        skip_elems = start - start_page * self._pagesize
        # Remaining item budget, or None when the slice is open-ended.
        only_more = None if end is None else end - start
        for pagenum in range(start_page, end_page):
            page = list(self._pagefunc(pagenum))
                page = page[skip_elems:]
            if only_more is not None:
                if len(page) < only_more:
                    only_more -= len(page)
                    page = page[:only_more]
def uppercase_escape(s):
    # Decode literal \UXXXXXXXX escape sequences in `s` into the
    # characters they denote, leaving everything else untouched.
    unicode_escape = codecs.getdecoder('unicode_escape')
        r'\\U[0-9a-fA-F]{8}',
        lambda m: unicode_escape(m.group(0))[0],
def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986."""
    # Python 2's quote() needs a byte string; the `and` keeps the
    # `unicode` name from being evaluated on Python 3.
    if sys.version_info < (3, 0) and isinstance(s, unicode):
        s = s.encode('utf-8')
    safe_chars = b"%/;:@&=+$,!~*'()?#[]"
    return compat_urllib_parse.quote(s, safe_chars)
def escape_url(url):
    """Escape URL as suggested by RFC 3986"""
    url_parsed = compat_urllib_parse_urlparse(url)
    # Re-escape each component separately; the netloc is not
    # re-escaped here.
    return url_parsed._replace(
        path=escape_rfc3986(url_parsed.path),
        params=escape_rfc3986(url_parsed.params),
        query=escape_rfc3986(url_parsed.query),
        fragment=escape_rfc3986(url_parsed.fragment)
    # NOTE(review): appears to probe whether struct.pack accepts a text
    # (str) format string — the surrounding try/except lines are not shown.
    struct.pack('!I', 0)
    # In Python 2.6 (and some 2.7 versions), struct requires a bytes argument
    def struct_pack(spec, *args):
        # Encode a text format spec to bytes before delegating.
        if isinstance(spec, compat_str):
            spec = spec.encode('ascii')
        return struct.pack(spec, *args)

    def struct_unpack(spec, *args):
        # Same bytes-spec workaround for unpacking.
        if isinstance(spec, compat_str):
            spec = spec.encode('ascii')
        return struct.unpack(spec, *args)
    # Pythons whose struct accepts str specs: use the module directly.
    struct_pack = struct.pack
    struct_unpack = struct.unpack
def read_batch_urls(batch_fd):
    # Read URLs from a file-like object, one per line, normalizing
    # encoding and dropping BOMs and comment lines.
        if not isinstance(url, compat_str):
            url = url.decode('utf-8', 'replace')
        # UTF-8 BOM as it appears when the file was decoded as Latin-1.
        BOM_UTF8 = '\xef\xbb\xbf'
        if url.startswith(BOM_UTF8):
            url = url[len(BOM_UTF8):]
        # Lines starting with '#', ';' or ']' are treated as comments.
        if url.startswith(('#', ';', ']')):

    # Close the file descriptor even if iteration raises.
    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]
def urlencode_postdata(*args, **kargs):
    """URL-encode POST data and return it as ASCII bytes."""
    encoded = compat_urllib_parse.urlencode(*args, **kargs)
    return encoded.encode('ascii')
    # Element.iter exists on Python >= 2.7; fall back to findall below.
    etree_iter = xml.etree.ElementTree.Element.iter
except AttributeError:  # Python <=2.6
    etree_iter = lambda n: n.findall('.//*')
    # TreeBuilder variant that silently ignores DOCTYPE declarations.
    class TreeBuilder(xml.etree.ElementTree.TreeBuilder):
        def doctype(self, name, pubid, system):
            pass  # Ignore doctypes

    parser = xml.etree.ElementTree.XMLParser(target=TreeBuilder())
    # Pass the custom parser only on Python >= 2.7.
    kwargs = {'parser': parser} if sys.version_info >= (2, 7) else {}
    tree = xml.etree.ElementTree.XML(s.encode('utf-8'), **kwargs)
    # Fix up XML parser in Python 2.x
    if sys.version_info < (3, 0):
        for n in etree_iter(tree):
            if n.text is not None:
                if not isinstance(n.text, compat_str):
                    n.text = n.text.decode('utf-8')
def parse_age_limit(s):
    """Parse an age-limit string like '18' or '18+' into an int.

    Returns None when `s` is None; for strings that are not a bare
    one-or-two-digit age, falls back to the US_RATINGS lookup table.
    """
    if s is None:
        # Guard: re.match would raise TypeError on a None pattern target.
        return None
    m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
    return int(m.group('age')) if m else US_RATINGS.get(s, None)
def strip_jsonp(code):
    # Strip a JSONP wrapper `callback(...);` (plus optional trailing
    # `//` comments), leaving only the payload inside the parentheses.
        r'(?s)^[a-zA-Z0-9_]+\s*\(\s*(.*)\);?\s*?(?://[^\n]*)*$', r'\1', code)
def js_to_json(code):
    # Convert a JavaScript object literal into (approximately) valid JSON:
    # re-quote keys/strings and drop trailing commas.
        if v in ('true', 'false', 'null'):
        if v.startswith('"'):
        if v.startswith("'"):
        # Convert a single-quoted string: unescape \' and escape bare ".
        v = re.sub(r"\\\\|\\'|\"", lambda m: {
    # Tokenize: double-quoted strings, single-quoted strings, bare words.
    res = re.sub(r'''(?x)
        "(?:[^"\\]*(?:\\\\|\\")?)*"|
        '(?:[^'\\]*(?:\\\\|\\')?)*'|
        [a-zA-Z_][a-zA-Z_0-9]*
    # Remove a trailing comma before a closing bracket.
    res = re.sub(r',(\s*\])', lambda m: m.group(1), res)
def qualities(quality_ids):
    """ Get a numeric quality value out of a list of possible values """
    # The returned callable maps a quality id to its position in
    # `quality_ids` (higher index = better quality).
        return quality_ids.index(qid)
# Default output filename template: "<title>-<id>.<ext>".
DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
def limit_length(s, length):
    """ Add ellipses to overly long strings """
    # Truncate so the result (including the ellipsis) fits in `length`.
        return s[:length - len(ELLIPSES)] + ELLIPSES
def version_tuple(v):
    """Split a version string on '.' or '-' into a tuple of ints."""
    parts = re.split(r'[-.]', v)
    return tuple(map(int, parts))
def is_outdated_version(version, limit, assume_new=True):
    # Compare dotted version strings; `assume_new` decides the answer
    # when `version` is missing or unparseable.
        return not assume_new
        return version_tuple(version) < version_tuple(limit)
        return not assume_new
def ytdl_is_updateable():
    """ Returns if youtube-dl can be updated with -U """
    from zipimport import zipimporter
    # Updateable when running from a zipimport bundle or a frozen binary.
    loader = globals().get('__loader__')
    if isinstance(loader, zipimporter):
        return True
    return hasattr(sys, 'frozen')
def args_to_str(args):
    # Render a subprocess command list as a single shell-quoted string
    # (intended for short, human-readable display).
    quoted = [shlex_quote(part) for part in args]
    return ' '.join(quoted)
def urlhandle_detect_ext(url_handle):
    # Guess a file extension from the response's Content-Type header.
        getheader = lambda h: url_handle.headers[h]
    except AttributeError:  # Python < 3
        getheader = url_handle.info().getheader

    # E.g. "video/mp4" -> "mp4".
    return getheader('Content-Type').split("/")[1]
def age_restricted(content_limit, age_limit):
    """ Returns True iff the content should be blocked """
    # Defect fix: the `age_limit is None` branch had no body, so an
    # unset limit fell through instead of allowing the content.
    if age_limit is None:  # No limit set
        return False
    if content_limit is None:
        return False  # Content available for everyone
    return age_limit < content_limit