2 # -*- coding: utf-8 -*-
4 from __future__ import unicode_literals
31 import xml.etree.ElementTree
40 compat_socket_create_connection,
44 compat_urllib_parse_urlparse,
45 compat_urllib_request,
51 # This is not clearly defined otherwise
52 compiled_regex_type = type(re.compile(''))
55 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0 (Chrome)',
56 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
57 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
58 'Accept-Encoding': 'gzip, deflate',
59 'Accept-Language': 'en-us,en;q=0.5',
63 def preferredencoding():
64 """Get preferred encoding.
66 Returns the best encoding scheme for the system, based on
67 locale.getpreferredencoding() and some further tweaks.
70 pref = locale.getpreferredencoding()
78 def write_json_file(obj, fn):
79 """ Encode obj as JSON and write it to fn, atomically if possible """
81 fn = encodeFilename(fn)
82 if sys.version_info < (3, 0) and sys.platform != 'win32':
83 encoding = get_filesystem_encoding()
84 # os.path.basename returns a bytes object, but NamedTemporaryFile
85 # will fail if the filename contains non ascii characters unless we
86 # use a unicode object
87 path_basename = lambda f: os.path.basename(fn).decode(encoding)
88 # the same for os.path.dirname
89 path_dirname = lambda f: os.path.dirname(fn).decode(encoding)
91 path_basename = os.path.basename
92 path_dirname = os.path.dirname
96 'prefix': path_basename(fn) + '.',
97 'dir': path_dirname(fn),
101 # In Python 2.x, json.dump expects a bytestream.
102 # In Python 3.x, it writes to a character stream
103 if sys.version_info < (3, 0):
111 tf = tempfile.NamedTemporaryFile(**args)
116 if sys.platform == 'win32':
117 # Need to remove existing file on Windows, else os.rename raises
118 # WindowsError or FileExistsError.
123 os.rename(tf.name, fn)
132 if sys.version_info >= (2, 7):
133 def find_xpath_attr(node, xpath, key, val):
134 """ Find the xpath xpath[@key=val] """
135 assert re.match(r'^[a-zA-Z-]+$', key)
136 assert re.match(r'^[a-zA-Z0-9@\s:._-]*$', val)
137 expr = xpath + "[@%s='%s']" % (key, val)
138 return node.find(expr)
140 def find_xpath_attr(node, xpath, key, val):
141 # Here comes the crazy part: In 2.6, if the xpath is a unicode,
142 # .//node does not match if a node is a direct child of . !
143 if isinstance(xpath, unicode):
144 xpath = xpath.encode('ascii')
146 for f in node.findall(xpath):
147 if f.attrib.get(key) == val:
151 # On python2.6 the xml.etree.ElementTree.Element methods don't support
152 # the namespace parameter
def xpath_with_ns(path, ns_map):
    """Expand namespace prefixes in an XPath expression.

    Every '/'-separated step of the form 'prefix:tag' is rewritten to
    '{uri}tag', where uri is ns_map[prefix]. Steps without a ':' are kept
    unchanged.

    Raises KeyError for a prefix missing from ns_map and ValueError for a
    step containing more than one ':'.
    """
    def expand(step):
        pieces = step.split(':')
        if len(pieces) == 1:
            return pieces[0]
        prefix, tag = pieces
        return '{%s}%s' % (ns_map[prefix], tag)

    return '/'.join(expand(step) for step in path.split('/'))
167 def xpath_text(node, xpath, name=None, fatal=False):
168 if sys.version_info < (2, 7): # Crazy 2.6
169 xpath = xpath.encode('ascii')
172 if n is None or n.text is None:
174 name = xpath if name is None else name
175 raise ExtractorError('Could not find XML element %s' % name)
def get_element_by_id(id, html):
    """Return the content of the tag whose "id" attribute equals id.

    Thin convenience wrapper: delegates to get_element_by_attribute with
    the attribute name fixed to "id" and returns whatever it returns.
    """
    attribute_name = "id"
    return get_element_by_attribute(attribute_name, id, html)
186 def get_element_by_attribute(attribute, value, html):
187 """Return the content of the tag with the specified attribute in the passed HTML document"""
189 m = re.search(r'''(?xs)
191 (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
193 (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
197 ''' % (re.escape(attribute), re.escape(value)), html)
201 res = m.group('content')
203 if res.startswith('"') or res.startswith("'"):
206 return unescapeHTML(res)
209 def clean_html(html):
210 """Clean an HTML snippet into a readable string"""
212 if html is None: # Convenience for sanitizing descriptions etc.
216 html = html.replace('\n', ' ')
217 html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
218 html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
220 html = re.sub('<.*?>', '', html)
221 # Replace html entities
222 html = unescapeHTML(html)
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails with anything other
    than EACCES, the characters forbidden in win32 file names are replaced
    by '#' in the file-name component and the tweaked name is opened
    instead. If the tweak does not change the name, the original exception
    is re-raised, like the standard open() would do.

    The special filename '-' selects standard output (switched to binary
    mode on win32); its binary buffer is returned when it has one.

    It returns the tuple (stream, definitive_file_name).
    """
    try:
        if filename == '-':
            if sys.platform == 'win32':
                import msvcrt
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
            return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
        stream = open(encodeFilename(filename), open_mode)
        return (stream, filename)
    except (IOError, OSError) as err:
        # Permission problems cannot be cured by renaming
        if err.errno in (errno.EACCES,):
            raise

        # In case of error, try to remove win32 forbidden chars. Only the
        # last path component is rewritten so that the directory part
        # (including its separators) keeps pointing at the same place.
        head, tail = os.path.split(filename)
        alt_filename = os.path.join(
            head, re.sub('[/<>:"\\|\\\\?\\*]', '#', tail))
        if alt_filename == filename:
            raise
        # An exception here should be caught in the caller.
        # Open the tweaked name: retrying the original one could only
        # fail again.
        stream = open(encodeFilename(alt_filename), open_mode)
        return (stream, alt_filename)
def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp.

    Returns None when email.utils.parsedate_tz cannot parse timestr.
    """
    parsed = email.utils.parsedate_tz(timestr)
    if parsed is None:
        return None
    return email.utils.mktime_tz(parsed)
270 def sanitize_filename(s, restricted=False, is_id=False):
271 """Sanitizes a string so it could be used as part of a filename.
272 If restricted is set, use a stricter subset of allowed characters.
273 Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
275 def replace_insane(char):
276 if char == '?' or ord(char) < 32 or ord(char) == 127:
279 return '' if restricted else '\''
281 return '_-' if restricted else ' -'
282 elif char in '\\/|*<>':
284 if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
286 if restricted and ord(char) > 127:
291 s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)
292 result = ''.join(map(replace_insane, s))
294 while '__' in result:
295 result = result.replace('__', '_')
296 result = result.strip('_')
297 # Common case of "Foreign band name - English song title"
298 if restricted and result.startswith('-_'):
305 def orderedSet(iterable):
306 """ Remove all duplicates from the input iterable """
def _htmlentity_transform(entity):
    """Transforms an HTML entity to a character.

    entity is the text found between '&' and ';'. Named entities are
    looked up in compat_html_entities.name2codepoint; numeric entities may
    be decimal ('#123') or hexadecimal ('#x7b'). Anything else is returned
    in its literal '&...;' form.
    """
    # Known non-numeric HTML entity
    if entity in compat_html_entities.name2codepoint:
        return compat_chr(compat_html_entities.name2codepoint[entity])

    # Hexadecimal references need the a-f digits as well; matching only
    # [0-9] would silently truncate e.g. '#x2F' to 0x2.
    mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
    if mobj is not None:
        numstr = mobj.group(1)
        if numstr.startswith('x'):
            base = 16
            numstr = '0%s' % numstr
        else:
            base = 10
        return compat_chr(int(numstr, base))

    # Unknown entity in name, return its literal representation
    return ('&%s;' % entity)
337 assert type(s) == compat_str
340 r'&([^;]+);', lambda m: _htmlentity_transform(m.group(1)), s)
343 def encodeFilename(s, for_subprocess=False):
345 @param s The name of the file
348 assert type(s) == compat_str
350 # Python 3 has a Unicode API
351 if sys.version_info >= (3, 0):
354 if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
355 # Pass '' directly to use Unicode APIs on Windows 2000 and up
356 # (Detecting Windows NT 4 is tricky because 'major >= 4' would
357 # match Windows 9x series as well. Besides, NT 4 is obsolete.)
358 if not for_subprocess:
361 # For subprocess calls, encode with locale encoding
362 # Refer to http://stackoverflow.com/a/9951851/35070
363 encoding = preferredencoding()
365 encoding = sys.getfilesystemencoding()
368 return s.encode(encoding, 'ignore')
371 def encodeArgument(s):
372 if not isinstance(s, compat_str):
373 # Legacy code that uses byte strings
374 # Uncomment the following line after fixing all post processors
375 # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
376 s = s.decode('ascii')
377 return encodeFilename(s, True)
380 def decodeOption(optval):
383 if isinstance(optval, bytes):
384 optval = optval.decode(preferredencoding())
386 assert isinstance(optval, compat_str)
def formatSeconds(secs):
    """Format a number of seconds as 'H:MM:SS', 'M:SS' or plain seconds.

    The widest unit that is non-zero decides the layout; the boundaries are
    inclusive, so exactly one hour is '1:00:00' and exactly one minute is
    '1:00'.
    """
    if secs >= 3600:
        return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
    elif secs >= 60:
        return '%d:%02d' % (secs // 60, secs % 60)
    else:
        return '%d' % secs
def make_HTTPS_handler(params, **kwargs):
    """Build the YoutubeDLHTTPSHandler appropriate for the running Python.

    params is the options mapping; its 'nocheckcertificate' entry (default
    False) disables certificate verification. Extra keyword arguments are
    forwarded to YoutubeDLHTTPSHandler.
    """
    opts_no_check_certificate = params.get('nocheckcertificate', False)
    if hasattr(ssl, 'create_default_context'):  # Python >= 3.4 or 2.7.9
        # We are the client and authenticate the *server*, hence
        # SERVER_AUTH. A CLIENT_AUTH context is meant for server-side
        # sockets and does not verify the peer certificate by default.
        context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH)
        if opts_no_check_certificate:
            # check_hostname must be cleared first; CERT_NONE is rejected
            # while hostname checking is still enabled.
            context.check_hostname = False
            context.verify_mode = ssl.CERT_NONE
        try:
            return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
        except TypeError:
            # Python 2.7.8
            # (create_default_context present but HTTPSHandler has no context=)
            pass

    if sys.version_info < (3, 2):
        import httplib

        class HTTPSConnectionV3(httplib.HTTPSConnection):
            def __init__(self, *args, **kwargs):
                httplib.HTTPSConnection.__init__(self, *args, **kwargs)

            def connect(self):
                sock = socket.create_connection((self.host, self.port), self.timeout)
                if getattr(self, '_tunnel_host', False):
                    self.sock = sock
                    self._tunnel()
                # Prefer TLSv1, fall back to protocol negotiation
                try:
                    self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_TLSv1)
                except ssl.SSLError:
                    self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_SSLv23)

        return YoutubeDLHTTPSHandler(params, https_conn_class=HTTPSConnectionV3, **kwargs)
    else:  # Python < 3.4
        context = ssl.SSLContext(ssl.PROTOCOL_SSLv23)
        context.verify_mode = (ssl.CERT_NONE
                               if opts_no_check_certificate
                               else ssl.CERT_REQUIRED)
        context.set_default_verify_paths()
        return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
439 class ExtractorError(Exception):
440 """Error during info extraction."""
442 def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
443 """ tb, if given, is the original traceback (so that it can be printed out).
444 If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
447 if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
449 if video_id is not None:
450 msg = video_id + ': ' + msg
452 msg += ' (caused by %r)' % cause
454 if ytdl_is_updateable():
455 update_cmd = 'type youtube-dl -U to update'
457 update_cmd = 'see https://yt-dl.org/update on how to update'
458 msg += '; please report this issue on https://yt-dl.org/bug .'
459 msg += ' Make sure you are using the latest version; %s.' % update_cmd
460 msg += ' Be sure to call youtube-dl with the --verbose flag and include its complete output.'
461 super(ExtractorError, self).__init__(msg)
464 self.exc_info = sys.exc_info() # preserve original exception
466 self.video_id = video_id
468 def format_traceback(self):
469 if self.traceback is None:
471 return ''.join(traceback.format_tb(self.traceback))
474 class UnsupportedError(ExtractorError):
475 def __init__(self, url):
476 super(UnsupportedError, self).__init__(
477 'Unsupported URL: %s' % url, expected=True)
481 class RegexNotFoundError(ExtractorError):
482 """Error when a regex didn't match"""
class DownloadError(Exception):
    """Download Error exception.

    May be thrown by FileDownloader objects when they are not configured to
    continue on errors. The exception message is the msg passed to the
    constructor.
    """

    def __init__(self, msg, exc_info=None):
        """Create the error with message msg.

        exc_info, if given, is the original exception that caused the
        trouble (as returned by sys.exc_info()); it is kept on the
        instance as self.exc_info.
        """
        Exception.__init__(self, msg)
        self.exc_info = exc_info
500 class SameFileError(Exception):
501 """Same File exception.
503 This exception will be thrown by FileDownloader objects if they detect
504 multiple files would have to be downloaded to the same file on disk.
509 class PostProcessingError(Exception):
510 """Post Processing exception.
512 This exception may be raised by PostProcessor's .run() method to
513 indicate an error in the postprocessing task.
516 def __init__(self, msg):
520 class MaxDownloadsReached(Exception):
521 """ --max-downloads limit has been reached. """
525 class UnavailableVideoError(Exception):
526 """Unavailable Format exception.
528 This exception will be thrown when a video is requested
529 in a format that is not available for that video.
534 class ContentTooShortError(Exception):
535 """Content Too Short exception.
537 This exception may be raised by FileDownloader objects when a file they
538 download is too small for what the server announced first, indicating
539 the connection was probably interrupted.
545 def __init__(self, downloaded, expected):
546 self.downloaded = downloaded
547 self.expected = expected
550 def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
551 hc = http_class(*args, **kwargs)
552 source_address = ydl_handler._params.get('source_address')
553 if source_address is not None:
554 sa = (source_address, 0)
555 if hasattr(hc, 'source_address'): # Python 2.7+
556 hc.source_address = sa
558 def _hc_connect(self, *args, **kwargs):
559 sock = compat_socket_create_connection(
560 (self.host, self.port), self.timeout, sa)
562 self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file)
565 hc.connect = functools.partial(_hc_connect, hc)
570 class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
571 """Handler for HTTP requests and responses.
573 This class, when installed with an OpenerDirector, automatically adds
574 the standard headers to every HTTP request and handles gzipped and
575 deflated responses from web servers. If compression is to be avoided in
576 a particular request, the original request in the program code only has
577 to include the HTTP header "Youtubedl-No-Compression", which will be
578 removed before making the real request.
580 Part of this code was copied from:
582 http://techknack.net/python-urllib2-handlers/
584 Andrew Rowls, the author of that code, agreed to release it to the
588 def __init__(self, params, *args, **kwargs):
589 compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
590 self._params = params
592 def http_open(self, req):
593 return self.do_open(functools.partial(
594 _create_http_connection, self, compat_http_client.HTTPConnection, False),
600 return zlib.decompress(data, -zlib.MAX_WBITS)
602 return zlib.decompress(data)
605 def addinfourl_wrapper(stream, headers, url, code):
606 if hasattr(compat_urllib_request.addinfourl, 'getcode'):
607 return compat_urllib_request.addinfourl(stream, headers, url, code)
608 ret = compat_urllib_request.addinfourl(stream, headers, url)
612 def http_request(self, req):
613 for h, v in std_headers.items():
614 if h not in req.headers:
616 if 'Youtubedl-no-compression' in req.headers:
617 if 'Accept-encoding' in req.headers:
618 del req.headers['Accept-encoding']
619 del req.headers['Youtubedl-no-compression']
620 if 'Youtubedl-user-agent' in req.headers:
621 if 'User-agent' in req.headers:
622 del req.headers['User-agent']
623 req.headers['User-agent'] = req.headers['Youtubedl-user-agent']
624 del req.headers['Youtubedl-user-agent']
626 if sys.version_info < (2, 7) and '#' in req.get_full_url():
627 # Python 2.6 is brain-dead when it comes to fragments
628 req._Request__original = req._Request__original.partition('#')[0]
629 req._Request__r_type = req._Request__r_type.partition('#')[0]
633 def http_response(self, req, resp):
636 if resp.headers.get('Content-encoding', '') == 'gzip':
637 content = resp.read()
638 gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
640 uncompressed = io.BytesIO(gz.read())
641 except IOError as original_ioerror:
642 # There may be junk at the end of the file
643 # See http://stackoverflow.com/q/4928560/35070 for details
644 for i in range(1, 1024):
646 gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
647 uncompressed = io.BytesIO(gz.read())
652 raise original_ioerror
653 resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
654 resp.msg = old_resp.msg
656 if resp.headers.get('Content-encoding', '') == 'deflate':
657 gz = io.BytesIO(self.deflate(resp.read()))
658 resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
659 resp.msg = old_resp.msg
662 https_request = http_request
663 https_response = http_response
666 class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
667 def __init__(self, params, https_conn_class=None, *args, **kwargs):
668 compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
669 self._https_conn_class = https_conn_class or compat_http_client.HTTPSConnection
670 self._params = params
672 def https_open(self, req):
673 return self.do_open(functools.partial(
674 _create_http_connection, self, self._https_conn_class, True),
678 def parse_iso8601(date_str, delimiter='T'):
679 """ Return a UNIX timestamp from the given date """
685 r'(\.[0-9]+)?(?:Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
688 timezone = datetime.timedelta()
690 date_str = date_str[:-len(m.group(0))]
691 if not m.group('sign'):
692 timezone = datetime.timedelta()
694 sign = 1 if m.group('sign') == '+' else -1
695 timezone = datetime.timedelta(
696 hours=sign * int(m.group('hours')),
697 minutes=sign * int(m.group('minutes')))
698 date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
699 dt = datetime.datetime.strptime(date_str, date_format) - timezone
700 return calendar.timegm(dt.timetuple())
703 def unified_strdate(date_str, day_first=True):
704 """Return a string with the date in the format YYYYMMDD"""
710 date_str = date_str.replace(',', ' ')
711 # %z (UTC offset) is only supported in python>=3.2
712 date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str)
713 # Remove AM/PM + timezone
714 date_str = re.sub(r'(?i)\s*(?:AM|PM)\s+[A-Z]+', '', date_str)
716 format_expressions = [
721 '%b %dst %Y %I:%M%p',
722 '%b %dnd %Y %I:%M%p',
723 '%b %dth %Y %I:%M%p',
729 '%Y-%m-%d %H:%M:%S.%f',
732 '%Y-%m-%dT%H:%M:%SZ',
733 '%Y-%m-%dT%H:%M:%S.%fZ',
734 '%Y-%m-%dT%H:%M:%S.%f0Z',
736 '%Y-%m-%dT%H:%M:%S.%f',
740 format_expressions.extend([
747 format_expressions.extend([
753 for expression in format_expressions:
755 upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
758 if upload_date is None:
759 timetuple = email.utils.parsedate_tz(date_str)
761 upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
def determine_ext(url, default_ext='unknown_video'):
    """Guess the file extension from a URL.

    The query string is ignored; the text after the last '.' of the rest is
    returned when it is purely alphanumeric. default_ext is returned when
    url is None, when there is no '.' at all, or when the candidate
    contains other characters (for instance a path separator).
    """
    if url is None:
        return default_ext
    _, dot, guess = url.partition('?')[0].rpartition('.')
    # Without a dot rpartition hands back the whole string as the "guess";
    # that is not an extension.
    if dot and re.match(r'^[A-Za-z0-9]+$', guess):
        return guess
    return default_ext
def subtitles_filename(filename, sub_lang, sub_format):
    """Build the subtitle file name that goes with filename.

    Everything after the last '.' of filename is dropped and replaced by
    '<sub_lang>.<sub_format>'.
    """
    stem = filename.rsplit('.', 1)[0]
    return stem + '.' + sub_lang + '.' + sub_format
779 def date_from_str(date_str):
781 Return a datetime object from a string in the format YYYYMMDD or
782 (now|today)[+-][0-9](day|week|month|year)(s)?"""
783 today = datetime.date.today()
784 if date_str in ('now', 'today'):
786 if date_str == 'yesterday':
787 return today - datetime.timedelta(days=1)
788 match = re.match('(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
789 if match is not None:
790 sign = match.group('sign')
791 time = int(match.group('time'))
794 unit = match.group('unit')
795 # A bad approximation?
803 delta = datetime.timedelta(**{unit: time})
805 return datetime.datetime.strptime(date_str, "%Y%m%d").date()
def hyphenate_date(date_str):
    """
    Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format.

    Strings that do not consist of exactly eight digits are returned
    unchanged."""
    parts = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
    if parts is None:
        return date_str
    return '-'.join(parts.groups())
818 class DateRange(object):
819 """Represents a time interval between two dates"""
821 def __init__(self, start=None, end=None):
822 """start and end must be strings in the format accepted by date"""
823 if start is not None:
824 self.start = date_from_str(start)
826 self.start = datetime.datetime.min.date()
828 self.end = date_from_str(end)
830 self.end = datetime.datetime.max.date()
831 if self.start > self.end:
832 raise ValueError('Date range: "%s" , the start date must be before the end date' % self)
836 """Returns a range that only contains the given day"""
839 def __contains__(self, date):
840 """Check if the date is in the range"""
841 if not isinstance(date, datetime.date):
842 date = date_from_str(date)
843 return self.start <= date <= self.end
846 return '%s - %s' % (self.start.isoformat(), self.end.isoformat())
850 """ Returns the platform name as a compat_str """
851 res = platform.platform()
852 if isinstance(res, bytes):
853 res = res.decode(preferredencoding())
855 assert isinstance(res, compat_str)
859 def _windows_write_string(s, out):
860 """ Returns True if the string was written using special methods,
861 False if it has yet to be written out."""
862 # Adapted from http://stackoverflow.com/a/3259271/35070
865 import ctypes.wintypes
873 fileno = out.fileno()
874 except AttributeError:
875 # If the output stream doesn't have a fileno, it's virtual
877 if fileno not in WIN_OUTPUT_IDS:
880 GetStdHandle = ctypes.WINFUNCTYPE(
881 ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
882 (b"GetStdHandle", ctypes.windll.kernel32))
883 h = GetStdHandle(WIN_OUTPUT_IDS[fileno])
885 WriteConsoleW = ctypes.WINFUNCTYPE(
886 ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
887 ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
888 ctypes.wintypes.LPVOID)((b"WriteConsoleW", ctypes.windll.kernel32))
889 written = ctypes.wintypes.DWORD(0)
891 GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)((b"GetFileType", ctypes.windll.kernel32))
892 FILE_TYPE_CHAR = 0x0002
893 FILE_TYPE_REMOTE = 0x8000
894 GetConsoleMode = ctypes.WINFUNCTYPE(
895 ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
896 ctypes.POINTER(ctypes.wintypes.DWORD))(
897 (b"GetConsoleMode", ctypes.windll.kernel32))
898 INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value
900 def not_a_console(handle):
901 if handle == INVALID_HANDLE_VALUE or handle is None:
903 return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR
904 or GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)
909 def next_nonbmp_pos(s):
911 return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
912 except StopIteration:
916 count = min(next_nonbmp_pos(s), 1024)
919 h, s, count if count else 2, ctypes.byref(written), None)
921 raise OSError('Failed to write string')
922 if not count: # We just wrote a non-BMP character
923 assert written.value == 2
926 assert written.value > 0
927 s = s[written.value:]
931 def write_string(s, out=None, encoding=None):
934 assert type(s) == compat_str
936 if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
937 if _windows_write_string(s, out):
940 if ('b' in getattr(out, 'mode', '') or
941 sys.version_info[0] < 3): # Python 2 lies about mode of sys.stderr
942 byt = s.encode(encoding or preferredencoding(), 'ignore')
944 elif hasattr(out, 'buffer'):
945 enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
946 byt = s.encode(enc, 'ignore')
947 out.buffer.write(byt)
953 def bytes_to_intlist(bs):
956 if isinstance(bs[0], int): # Python 3
959 return [ord(c) for c in bs]
962 def intlist_to_bytes(xs):
965 return struct_pack('%dB' % len(xs), *xs)
968 # Cross-platform file locking
969 if sys.platform == 'win32':
970 import ctypes.wintypes
973 class OVERLAPPED(ctypes.Structure):
975 ('Internal', ctypes.wintypes.LPVOID),
976 ('InternalHigh', ctypes.wintypes.LPVOID),
977 ('Offset', ctypes.wintypes.DWORD),
978 ('OffsetHigh', ctypes.wintypes.DWORD),
979 ('hEvent', ctypes.wintypes.HANDLE),
982 kernel32 = ctypes.windll.kernel32
983 LockFileEx = kernel32.LockFileEx
984 LockFileEx.argtypes = [
985 ctypes.wintypes.HANDLE, # hFile
986 ctypes.wintypes.DWORD, # dwFlags
987 ctypes.wintypes.DWORD, # dwReserved
988 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
989 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
990 ctypes.POINTER(OVERLAPPED) # Overlapped
992 LockFileEx.restype = ctypes.wintypes.BOOL
993 UnlockFileEx = kernel32.UnlockFileEx
994 UnlockFileEx.argtypes = [
995 ctypes.wintypes.HANDLE, # hFile
996 ctypes.wintypes.DWORD, # dwReserved
997 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
998 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
999 ctypes.POINTER(OVERLAPPED) # Overlapped
1001 UnlockFileEx.restype = ctypes.wintypes.BOOL
1002 whole_low = 0xffffffff
1003 whole_high = 0x7fffffff
1005 def _lock_file(f, exclusive):
1006 overlapped = OVERLAPPED()
1007 overlapped.Offset = 0
1008 overlapped.OffsetHigh = 0
1009 overlapped.hEvent = 0
1010 f._lock_file_overlapped_p = ctypes.pointer(overlapped)
1011 handle = msvcrt.get_osfhandle(f.fileno())
1012 if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
1013 whole_low, whole_high, f._lock_file_overlapped_p):
1014 raise OSError('Locking file failed: %r' % ctypes.FormatError())
1016 def _unlock_file(f):
1017 assert f._lock_file_overlapped_p
1018 handle = msvcrt.get_osfhandle(f.fileno())
1019 if not UnlockFileEx(handle, 0,
1020 whole_low, whole_high, f._lock_file_overlapped_p):
1021 raise OSError('Unlocking file failed: %r' % ctypes.FormatError())
1026 def _lock_file(f, exclusive):
1027 fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)
1029 def _unlock_file(f):
1030 fcntl.flock(f, fcntl.LOCK_UN)
1033 class locked_file(object):
1034 def __init__(self, filename, mode, encoding=None):
1035 assert mode in ['r', 'a', 'w']
1036 self.f = io.open(filename, mode, encoding=encoding)
1039 def __enter__(self):
1040 exclusive = self.mode != 'r'
1042 _lock_file(self.f, exclusive)
1048 def __exit__(self, etype, value, traceback):
1050 _unlock_file(self.f)
1057 def write(self, *args):
1058 return self.f.write(*args)
1060 def read(self, *args):
1061 return self.f.read(*args)
def get_filesystem_encoding():
    """Return sys.getfilesystemencoding(), or 'utf-8' when that is None."""
    fs_encoding = sys.getfilesystemencoding()
    if fs_encoding is None:
        return 'utf-8'
    return fs_encoding
1069 def shell_quote(args):
1071 encoding = get_filesystem_encoding()
1073 if isinstance(a, bytes):
1074 # We may get a filename encoded with 'encodeFilename'
1075 a = a.decode(encoding)
1076 quoted_args.append(pipes.quote(a))
1077 return ' '.join(quoted_args)
1080 def takewhile_inclusive(pred, seq):
1081 """ Like itertools.takewhile, but include the latest evaluated element
1082 (the first element so that Not pred(e)) """
1089 def smuggle_url(url, data):
1090 """ Pass additional data in a URL for internal use. """
1092 sdata = compat_urllib_parse.urlencode(
1093 {'__youtubedl_smuggle': json.dumps(data)})
1094 return url + '#' + sdata
1097 def unsmuggle_url(smug_url, default=None):
1098 if '#__youtubedl_smuggle' not in smug_url:
1099 return smug_url, default
1100 url, _, sdata = smug_url.rpartition('#')
1101 jsond = compat_parse_qs(sdata)['__youtubedl_smuggle'][0]
1102 data = json.loads(jsond)
def format_bytes(bytes):
    """Format a byte count with a binary unit suffix, e.g. '1.50KiB'.

    bytes may be a number, a numeric string, or None (rendered as 'N/A').
    The unit is clamped to the range B..YiB, so values below one byte are
    shown in B and values beyond the table are shown in YiB. Negative
    values keep their sign and are scaled by their magnitude.
    """
    if bytes is None:
        return 'N/A'
    if isinstance(bytes, str):
        bytes = float(bytes)
    suffixes = ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB']
    magnitude = abs(bytes)
    if magnitude < 1.0:
        # Covers zero (log undefined) and fractions (negative exponent,
        # which would otherwise index the table from the end).
        exponent = 0
    else:
        exponent = min(int(math.log(magnitude, 1024.0)), len(suffixes) - 1)
    converted = float(bytes) / float(1024 ** exponent)
    return '%.2f%s' % (converted, suffixes[exponent])
1120 def parse_filesize(s):
1124 # The lower-case forms are of course incorrect and unofficial,
1125 # but we support those too
1163 units_re = '|'.join(re.escape(u) for u in _UNIT_TABLE)
1165 r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)' % units_re, s)
1169 num_str = m.group('num').replace(',', '.')
1170 mult = _UNIT_TABLE[m.group('unit')]
1171 return int(float(num_str) * mult)
1174 def get_term_width():
1175 columns = compat_getenv('COLUMNS', None)
1180 sp = subprocess.Popen(
1182 stdout=subprocess.PIPE, stderr=subprocess.PIPE)
1183 out, err = sp.communicate()
1184 return int(out.split()[1])
def month_by_name(name):
    """ Return the number of a month by (locale-independently) English name

    The number is 1-based (January is 1); None is returned when name is not
    one of the twelve full English month names.
    """
    months = (
        'January', 'February', 'March', 'April', 'May', 'June',
        'July', 'August', 'September', 'October', 'November', 'December')
    for number, month in enumerate(months, 1):
        if month == name:
            return number
    return None
1202 def fix_xml_ampersands(xml_str):
1203 """Replace all the '&' by '&amp;' in XML"""
1205 r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
def setproctitle(title):
    """Set the process name via libc's prctl, on a best-effort basis.

    title must be a compat_str. Nothing happens when "libc.so.6" cannot be
    loaded or when the loaded library has no prctl symbol.
    """
    assert isinstance(title, compat_str)
    try:
        libc = ctypes.cdll.LoadLibrary("libc.so.6")
    except OSError:
        return
    title_bytes = title.encode('utf-8')
    # Initialising the buffer from the bytes makes it one byte longer than
    # the title, so prctl always receives a NUL-terminated string. A buffer
    # of exactly len(title_bytes) has no room for the terminator.
    buf = ctypes.create_string_buffer(title_bytes)
    try:
        # 15 is PR_SET_NAME
        libc.prctl(15, buf, 0, 0, 0)
    except AttributeError:
        return  # Strange libc, just skip this
def remove_start(s, start):
    """Return s without the leading start, or s unchanged if it does not begin with it."""
    return s[len(start):] if s.startswith(start) else s
def remove_end(s, end):
    """Return s without the trailing end, or s unchanged if it does not end with it.

    An empty end leaves s untouched: every string "ends with" '', and
    slicing with s[:-0] would otherwise return an empty string.
    """
    if end and s.endswith(end):
        return s[:-len(end)]
    return s
def url_basename(url):
    """Return the last '/'-separated segment of the URL's path.

    Leading and trailing slashes of the path are ignored; query string and
    fragment are not part of the path and therefore never returned.
    """
    trimmed_path = compat_urlparse.urlparse(url).path.strip('/')
    segments = trimmed_path.split('/')
    return segments[-1]
1242 class HEADRequest(compat_urllib_request.Request):
1243 def get_method(self):
1247 def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
1250 v = getattr(v, get_attr, None)
1253 return default if v is None else (int(v) * invscale // scale)
def str_or_none(v, default=None):
    """Return v converted with compat_str, or default when v is None."""
    if v is None:
        return default
    return compat_str(v)
1260 def str_to_int(int_str):
1261 """ A more relaxed version of int_or_none """
1264 int_str = re.sub(r'[,\.\+]', '', int_str)
def float_or_none(v, scale=1, invscale=1, default=None):
    """Return float(v) * invscale / scale, or default when v is None."""
    if v is None:
        return default
    return float(v) * invscale / scale
1272 def parse_duration(s):
1273 if not isinstance(s, basestring if sys.version_info < (3, 0) else compat_str):
1281 (?P<only_mins>[0-9.]+)\s*(?:mins?|minutes?)\s*|
1282 (?P<only_hours>[0-9.]+)\s*(?:hours?)|
1285 (?:(?P<hours>[0-9]+)\s*(?:[:h]|hours?)\s*)?
1286 (?P<mins>[0-9]+)\s*(?:[:m]|mins?|minutes?)\s*
1288 (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*(?:s|secs?|seconds?)?
1293 if m.group('only_mins'):
1294 return float_or_none(m.group('only_mins'), invscale=60)
1295 if m.group('only_hours'):
1296 return float_or_none(m.group('only_hours'), invscale=60 * 60)
1298 res += int(m.group('secs'))
1300 res += int(m.group('mins')) * 60
1301 if m.group('hours'):
1302 res += int(m.group('hours')) * 60 * 60
1304 res += float(m.group('ms'))
def prepend_extension(filename, ext):
    """Insert ext in front of filename's real extension.

    'abc.ext' with ext 'temp' becomes 'abc.temp.ext'; a name without an
    extension simply gets '.temp' appended.
    """
    stem, suffix = os.path.splitext(filename)
    return '{0}.{1}{2}'.format(stem, ext, suffix)
1313 def check_executable(exe, args=[]):
1314 """ Checks if the given binary is installed somewhere in PATH, and returns its name.
1315 args can be a list of arguments for a short output (like -version) """
1317 subprocess.Popen([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
1323 def get_exe_version(exe, args=['--version'],
1324 version_re=None, unrecognized='present'):
1325 """ Returns the version of the specified executable,
1326 or False if the executable is not present """
1328 out, _ = subprocess.Popen(
1330 stdout=subprocess.PIPE, stderr=subprocess.STDOUT).communicate()
1333 if isinstance(out, bytes): # Python 2.x
1334 out = out.decode('ascii', 'ignore')
1335 return detect_exe_version(out, version_re, unrecognized)
1338 def detect_exe_version(output, version_re=None, unrecognized='present'):
1339 assert isinstance(output, compat_str)
1340 if version_re is None:
1341 version_re = r'version\s+([-0-9._a-zA-Z]+)'
1342 m = re.search(version_re, output)
1349 class PagedList(object):
1351 # This is only useful for tests
1352 return len(self.getslice())
class OnDemandPagedList(PagedList):
    """ Paged list that requests pages one at a time through
    pagefunc(pagenum) and stops once a page is not full """

    def __init__(self, pagefunc, pagesize):
        self._pagefunc = pagefunc
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        """ Return the entries with index in [start, end) as a list;
        with end=None, read up to and including the first non-full page """
        res = []
        for pagenum in itertools.count(start // self._pagesize):
            firstid = pagenum * self._pagesize
            nextfirstid = pagenum * self._pagesize + self._pagesize
            if start >= nextfirstid:
                continue

            page_results = list(self._pagefunc(pagenum))

            # Offset of the first wanted entry inside this page
            startv = (
                start % self._pagesize
                if firstid <= start < nextfirstid
                else 0)

            # Offset one past the last wanted entry, if end lies in this page
            endv = (
                ((end - 1) % self._pagesize) + 1
                if (end is not None and firstid <= end <= nextfirstid)
                else None)

            if startv != 0 or endv is not None:
                page_results = page_results[startv:endv]
            res.extend(page_results)

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            if len(page_results) + startv < self._pagesize:
                break

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == nextfirstid:
                break
        return res
class InAdvancePagedList(PagedList):
    """ Paged list for sources whose number of pages (pagecount) is known
    before any page is requested through pagefunc(pagenum) """

    def __init__(self, pagefunc, pagecount, pagesize):
        self._pagefunc = pagefunc
        self._pagecount = pagecount
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        """ Return the entries with index in [start, end) as a list;
        with end=None, read every page up to pagecount """
        res = []
        start_page = start // self._pagesize
        end_page = (
            self._pagecount if end is None else (end // self._pagesize + 1))
        # Entries to drop from the front of the first requested page
        skip_elems = start - start_page * self._pagesize
        # Entries still wanted; None means no upper bound
        only_more = None if end is None else end - start
        for pagenum in range(start_page, end_page):
            page = list(self._pagefunc(pagenum))
            if skip_elems:
                page = page[skip_elems:]
                skip_elems = None
            if only_more is not None:
                if len(page) < only_more:
                    only_more -= len(page)
                else:
                    # This page completes the requested range
                    page = page[:only_more]
                    res.extend(page)
                    break
            res.extend(page)
        return res
def uppercase_escape(s):
    """ Replace each escape written as a backslash, an uppercase U and
    eight hex digits with the character it denotes; other text is kept """
    unicode_escape = codecs.getdecoder('unicode_escape')
    return re.sub(
        r'\\U[0-9a-fA-F]{8}',
        # The decoder returns (text, consumed_length); keep the text
        lambda m: unicode_escape(m.group(0))[0],
        s)
def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986"""
    running_python2 = sys.version_info < (3, 0)
    if running_python2 and isinstance(s, unicode):
        # On Python 2, hand quote() UTF-8 bytes instead of a unicode object
        s = s.encode('utf-8')
    # Characters listed here are passed through unquoted
    safe_chars = b"%/;:@&=+$,!~*'()?#[]"
    return compat_urllib_parse.quote(s, safe_chars)
def escape_url(url):
    """Escape URL as suggested by RFC 3986

    The path, params, query and fragment components are passed through
    escape_rfc3986(); the remaining components are kept as parsed, and
    the result is reassembled into a URL string.
    """
    url_parsed = compat_urllib_parse_urlparse(url)
    return url_parsed._replace(
        path=escape_rfc3986(url_parsed.path),
        params=escape_rfc3986(url_parsed.params),
        query=escape_rfc3986(url_parsed.query),
        fragment=escape_rfc3986(url_parsed.fragment)
    ).geturl()
# Probe whether struct accepts a text format string on this interpreter and
# define struct_pack / struct_unpack accordingly.
try:
    struct.pack('!I', 0)
except TypeError:
    # In Python 2.6 (and some 2.7 versions), struct requires a bytes argument
    def struct_pack(spec, *args):
        """ struct.pack() that also accepts a text format string """
        if isinstance(spec, compat_str):
            spec = spec.encode('ascii')
        return struct.pack(spec, *args)

    def struct_unpack(spec, *args):
        """ struct.unpack() that also accepts a text format string """
        if isinstance(spec, compat_str):
            spec = spec.encode('ascii')
        return struct.unpack(spec, *args)
else:
    # Text format strings work natively; use the stdlib functions directly
    struct_pack = struct.pack
    struct_unpack = struct.unpack
def read_batch_urls(batch_fd):
    """ Read URLs from the file-like object batch_fd, one per line.

    Lines are decoded as UTF-8 when they are not text already, a leading
    byte order mark is removed and surrounding whitespace is stripped.
    Empty lines and lines starting with '#', ';' or ']' are skipped.
    batch_fd is closed before returning the list of URLs. """
    # '\xef\xbb\xbf' is a UTF-8 BOM whose three bytes were each read as one
    # character; '\ufeff' is the BOM after a proper UTF-8 decode (which is
    # what the decode() below produces and str.strip() does not remove).
    BOMS = ('\xef\xbb\xbf', '\ufeff')

    def fixup(url):
        if not isinstance(url, compat_str):
            url = url.decode('utf-8', 'replace')
        for bom in BOMS:
            if url.startswith(bom):
                url = url[len(bom):]
        url = url.strip()
        if url.startswith(('#', ';', ']')):
            return False
        return url

    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]
def urlencode_postdata(*args, **kargs):
    """ URL-encode the given arguments with compat_urllib_parse.urlencode()
    and return the result encoded to ASCII bytes """
    encoded = compat_urllib_parse.urlencode(*args, **kargs)
    return encoded.encode('ascii')
# etree_iter(node) iterates over the elements of an ElementTree node; use
# Element.iter where it exists and fall back to findall() otherwise.
try:
    etree_iter = xml.etree.ElementTree.Element.iter
except AttributeError:  # Python <=2.6
    etree_iter = lambda n: n.findall('.//*')
def parse_xml(s):
    """ Parse the XML document in the text string s and return its root
    element; doctype declarations are ignored """
    class TreeBuilder(xml.etree.ElementTree.TreeBuilder):
        def doctype(self, name, pubid, system):
            pass  # Ignore doctypes

    parser = xml.etree.ElementTree.XMLParser(target=TreeBuilder())
    # The custom parser is only passed on Python 2.7 and newer
    kwargs = {'parser': parser} if sys.version_info >= (2, 7) else {}
    tree = xml.etree.ElementTree.XML(s.encode('utf-8'), **kwargs)
    # Fix up XML parser in Python 2.x
    if sys.version_info < (3, 0):
        for n in etree_iter(tree):
            if n.text is not None:
                if not isinstance(n.text, compat_str):
                    n.text = n.text.decode('utf-8')
    return tree
def parse_age_limit(s):
    """ Convert an age rating string to an integer age.

    Accepts one or two digits with an optional trailing '+' (e.g. '18+');
    any other string is looked up in US_RATINGS.  Returns None for None
    and for strings that are not recognized. """
    if s is None:
        return None
    m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
    return int(m.group('age')) if m else US_RATINGS.get(s, None)
def strip_jsonp(code):
    """ Remove a JSONP wrapper 'callback(...)' around a payload.

    The callback name may contain letters, digits, '_', '.' and '$' (so
    namespaced callbacks such as 'ps.embedHandler' are handled).  An
    optional ';' and trailing '//' comments after the closing parenthesis
    are dropped.  Text that is not wrapped this way is returned unchanged. """
    return re.sub(
        r'(?s)^[a-zA-Z0-9_.$]+\s*\(\s*(.*)\);?\s*?(?://[^\n]*)*$', r'\1', code)
def js_to_json(code):
    """ Convert a JavaScript object literal to JSON text.

    Single-quoted strings are rewritten with double quotes, bare
    identifiers are quoted (true, false and null are kept), double-quoted
    strings are left as they are, and a comma directly before a closing
    ']' is removed. """
    def fix_kv(m):
        v = m.group(0)
        if v in ('true', 'false', 'null'):
            return v
        if v.startswith('"'):
            return v
        if v.startswith("'"):
            # Drop the surrounding quotes, then re-escape for a
            # double-quoted string
            v = v[1:-1]
            v = re.sub(r"\\\\|\\'|\"", lambda m: {
                '\\\\': '\\\\',
                "\\'": "'",
                '"': '\\"',
            }[m.group(0)], v)
        return '"%s"' % v

    res = re.sub(r'''(?x)
        "(?:[^"\\]*(?:\\\\|\\")?)*"|
        '(?:[^'\\]*(?:\\\\|\\')?)*'|
        [a-zA-Z_][a-zA-Z_0-9]*
        ''', fix_kv, code)
    res = re.sub(r',(\s*\])', lambda m: m.group(1), res)
    return res
def qualities(quality_ids):
    """ Get a numeric quality value out of a list of possible values.

    Returns a function mapping a quality id to its index in quality_ids,
    or to -1 when the id is not in the list. """
    def q(qid):
        try:
            return quality_ids.index(qid)
        except ValueError:
            return -1
    return q
# Default output template: a %-format string with the mapping keys
# title, id and ext.
# NOTE(review): presumably filled from a video's metadata dict - confirm
# against the callers.
DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
def limit_length(s, length):
    """ Add ellipses to overly long strings.

    Returns None for None.  A string longer than length is cut so that,
    including the appended '...', it is exactly length characters long;
    shorter strings are returned unchanged. """
    if s is None:
        return None
    ELLIPSES = '...'
    if len(s) > length:
        return s[:length - len(ELLIPSES)] + ELLIPSES
    return s
def version_tuple(v):
    """ Split the version string v at '.' and '-' and return its parts
    as a tuple of integers """
    components = re.split(r'[-.]', v)
    return tuple(map(int, components))
def is_outdated_version(version, limit, assume_new=True):
    """ Return True if version is older than limit.

    When version is empty, or either string cannot be converted by
    version_tuple() (ValueError), the answer is 'not assume_new'. """
    if not version:
        return not assume_new
    try:
        return version_tuple(version) < version_tuple(limit)
    except ValueError:
        return not assume_new
def ytdl_is_updateable():
    """ Returns if youtube-dl can be updated with -U """
    from zipimport import zipimporter

    # Either this module was loaded by a zipimporter, or sys has a
    # 'frozen' attribute
    loaded_by_zipimporter = isinstance(globals().get('__loader__'), zipimporter)
    return loaded_by_zipimporter or hasattr(sys, 'frozen')
def args_to_str(args):
    """ Return a short, single-line string for a subprocess command:
    every argument is passed through shlex_quote and joined by spaces """
    quoted_args = [shlex_quote(a) for a in args]
    return ' '.join(quoted_args)
def urlhandle_detect_ext(url_handle):
    """ Derive a file extension from the Content-Type header of url_handle.

    Returns the MIME subtype (the text after the first '/'), with any
    parameters such as '; charset=utf-8' removed. """
    try:
        # Probe for the attribute; handles without it use the older API
        url_handle.headers
        getheader = lambda h: url_handle.headers[h]
    except AttributeError:  # Python < 3
        getheader = url_handle.info().getheader

    # Cut off media type parameters before taking the subtype
    mimetype = getheader('Content-Type').split(';')[0].strip()
    return mimetype.split("/")[1]
def age_restricted(content_limit, age_limit):
    """ Returns True iff the content should be blocked.

    content_limit is the minimum age set on the content and age_limit the
    configured limit; None for either one means nothing is blocked. """

    if age_limit is None:  # No limit set
        return False
    if content_limit is None:
        return False  # Content available for everyone
    return age_limit < content_limit