2 # -*- coding: utf-8 -*-
4 from __future__ import unicode_literals
31 import xml.etree.ElementTree
40 compat_socket_create_connection,
44 compat_urllib_parse_urlparse,
45 compat_urllib_request,
# There is no clearly defined name for the type of a compiled regular
# expression, so obtain it from an actual compiled pattern.
compiled_regex_type = type(re.compile(''))
55 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0 (Chrome)',
56 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
57 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
58 'Accept-Encoding': 'gzip, deflate',
59 'Accept-Language': 'en-us,en;q=0.5',
def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks: when the locale
    cannot be queried, or names a codec that cannot encode text, 'UTF-8'
    is returned instead.
    """
    try:
        pref = locale.getpreferredencoding()
        # Make sure the reported codec exists and is usable for encoding.
        'TEST'.encode(pref)
    except Exception:
        pref = 'UTF-8'

    return pref
def write_json_file(obj, fn):
    """ Encode obj as JSON and write it to fn, atomically if possible

    The data is first dumped into a temporary file created next to the
    target and then renamed over it.  If anything goes wrong the temporary
    file is removed and the original exception is re-raised.
    """

    fn = encodeFilename(fn)
    if sys.version_info < (3, 0) and sys.platform != 'win32':
        encoding = get_filesystem_encoding()

        # os.path.basename returns a bytes object, but NamedTemporaryFile
        # will fail if the filename contains non ascii characters unless we
        # use a unicode object
        def path_basename(f):
            return os.path.basename(f).decode(encoding)

        # the same for os.path.dirname
        def path_dirname(f):
            return os.path.dirname(f).decode(encoding)
    else:
        path_basename = os.path.basename
        path_dirname = os.path.dirname

    args = {
        'suffix': '.tmp',
        'prefix': path_basename(fn) + '.',
        'dir': path_dirname(fn),
        # The file must survive close() so that it can be renamed into place
        'delete': False,
    }

    # In Python 2.x, json.dump expects a bytestream.
    # In Python 3.x, it writes to a character stream
    if sys.version_info < (3, 0):
        args['mode'] = 'wb'
    else:
        args.update({
            'mode': 'w',
            'encoding': 'utf-8',
        })

    tf = tempfile.NamedTemporaryFile(**args)

    try:
        with tf:
            json.dump(obj, tf)
        if sys.platform == 'win32':
            # Need to remove existing file on Windows, else os.rename raises
            # WindowsError or FileExistsError.
            try:
                os.unlink(fn)
            except OSError:
                pass
        os.rename(tf.name, fn)
    except BaseException:
        # Best-effort cleanup of the temporary file, then propagate
        try:
            os.remove(tf.name)
        except OSError:
            pass
        raise
if sys.version_info >= (2, 7):
    def find_xpath_attr(node, xpath, key, val):
        """ Find the xpath xpath[@key=val] """
        # key and val are interpolated into the expression, so only a
        # conservative character set is accepted
        assert re.match(r'^[a-zA-Z-]+$', key)
        assert re.match(r'^[a-zA-Z0-9@\s:._-]*$', val)
        return node.find(xpath + "[@%s='%s']" % (key, val))
else:
    def find_xpath_attr(node, xpath, key, val):
        """ Find the first element matching xpath whose attribute key equals val """
        # Here comes the crazy part: In 2.6, if the xpath is a unicode,
        # .//node does not match if a node is a direct child of . !
        if isinstance(xpath, unicode):
            xpath = xpath.encode('ascii')

        candidates = (
            el for el in node.findall(xpath) if el.attrib.get(key) == val)
        return next(candidates, None)
151 # On python2.6 the xml.etree.ElementTree.Element methods don't support
152 # the namespace parameter
def xpath_with_ns(path, ns_map):
    """ Expand every prefix:tag step of path to {namespace}tag using ns_map """
    def expand(step):
        parts = step.split(':')
        if len(parts) == 1:
            return parts[0]
        prefix, tag = parts
        return '{%s}%s' % (ns_map[prefix], tag)

    return '/'.join(expand(step) for step in path.split('/'))
def xpath_text(node, xpath, name=None, fatal=False):
    """ Return the text of the first element matching xpath.

    When no element matches, or it has no text, None is returned unless
    fatal is set, in which case ExtractorError is raised (name, if given,
    is used in the message instead of the xpath).
    """
    if sys.version_info < (2, 7):  # Crazy 2.6
        xpath = xpath.encode('ascii')

    element = node.find(xpath)
    text = None if element is None else element.text
    if text is not None:
        return text
    if not fatal:
        return None
    if name is None:
        name = xpath
    raise ExtractorError('Could not find XML element %s' % name)
def get_element_by_id(id, html):
    """Return the content of the tag whose id attribute equals the given ID in the passed HTML document"""
    return get_element_by_attribute(attribute='id', value=id, html=html)
186 def get_element_by_attribute(attribute, value, html):
187 """Return the content of the tag with the specified attribute in the passed HTML document"""
189 m = re.search(r'''(?xs)
191 (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
193 (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
197 ''' % (re.escape(attribute), re.escape(value)), html)
201 res = m.group('content')
203 if res.startswith('"') or res.startswith("'"):
206 return unescapeHTML(res)
def clean_html(html):
    """Clean an HTML snippet into a readable string"""

    if html is None:  # Convenience for sanitizing descriptions etc.
        return html

    text = html.replace('\n', ' ')
    substitutions = (
        # Newline vs <br />
        (r'\s*<\s*br\s*/?\s*>\s*', '\n'),
        (r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n'),
        # Strip html tags
        ('<.*?>', ''),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    # Replace html entities
    return unescapeHTML(text).strip()
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    It returns the tuple (stream, definitive_file_name).
    """
    try:
        if filename == '-':
            if sys.platform == 'win32':
                import msvcrt
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
            return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
        stream = open(encodeFilename(filename), open_mode)
        return (stream, filename)
    except (IOError, OSError) as err:
        if err.errno in (errno.EACCES,):
            raise

        # In case of error, try to remove win32 forbidden chars.
        # os.path.join takes the components as separate arguments, so the
        # sanitized parts have to be unpacked.
        alt_filename = os.path.join(*[
            re.sub('[/<>:"\\|\\\\?\\*]', '#', path_part)
            for path_part in os.path.split(filename)
        ])
        if alt_filename == filename:
            raise
        # An exception here should be caught in the caller.
        # Open the sanitized name, not the one that has just failed.
        stream = open(encodeFilename(alt_filename), open_mode)
        return (stream, alt_filename)
def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp

    Returns None when the string cannot be parsed.
    """
    parsed = email.utils.parsedate_tz(timestr)
    if parsed is None:
        return None
    return email.utils.mktime_tz(parsed)
def sanitize_filename(s, restricted=False, is_id=False):
    """Sanitizes a string so it could be used as part of a filename.
    If restricted is set, use a stricter subset of allowed characters.
    Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
    """
    def _clean(ch):
        code = ord(ch)
        if ch == '?' or code < 32 or code == 127:
            return ''
        if ch == '"':
            return '' if restricted else '\''
        if ch == ':':
            return '_-' if restricted else ' -'
        if ch in '\\/|*<>':
            return '_'
        if restricted and (
                ch in '!&\'()[]{}$;`^,#' or ch.isspace() or code > 127):
            return '_'
        return ch

    # Handle timestamps
    s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)
    result = ''.join(_clean(ch) for ch in s)
    if is_id:
        return result

    while '__' in result:
        result = result.replace('__', '_')
    result = result.strip('_')
    # Common case of "Foreign band name - English song title"
    if restricted and result.startswith('-_'):
        result = result[2:]
    return result or '_'
def orderedSet(iterable):
    """ Remove all duplicates from the input iterable, keeping first occurrences in order """
    unique = []
    for item in iterable:
        if item in unique:
            continue
        unique.append(item)
    return unique
def _htmlentity_transform(entity):
    """Transforms an HTML entity to a character.

    entity is the text between '&' and ';'.  Named entities and decimal or
    hexadecimal numeric references are decoded; anything else is returned
    in its literal '&...;' form.
    """
    # Known non-numeric HTML entity
    if entity in compat_html_entities.name2codepoint:
        return compat_chr(compat_html_entities.name2codepoint[entity])

    # Hexadecimal references may contain a-f, and the whole entity has to
    # match so that e.g. '#x2F' is not decoded from its '#x2' prefix.
    mobj = re.match(r'#([xX][0-9a-fA-F]+|[0-9]+)$', entity)
    if mobj is not None:
        numstr = mobj.group(1)
        if numstr[0] in 'xX':
            return compat_chr(int(numstr[1:], 16))
        return compat_chr(int(numstr, 10))

    # Unknown entity in name, return its literal representation
    return ('&%s;' % entity)
337 assert type(s) == compat_str
340 r'&([^;]+);', lambda m: _htmlentity_transform(m.group(1)), s)
def encodeFilename(s, for_subprocess=False):
    """
    @param s The name of the file
    """

    assert type(s) == compat_str

    # Python 3 has a Unicode API
    if sys.version_info >= (3, 0):
        return s

    # Windows 2000 and up accept unicode filenames directly.
    # (Detecting Windows NT 4 is tricky because 'major >= 4' would
    # match Windows 9x series as well. Besides, NT 4 is obsolete.)
    unicode_windows = (
        sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5)
    if unicode_windows and not for_subprocess:
        return s

    if unicode_windows:
        # For subprocess calls, encode with locale encoding
        # Refer to http://stackoverflow.com/a/9951851/35070
        encoding = preferredencoding()
    else:
        encoding = sys.getfilesystemencoding()
    if encoding is None:
        encoding = 'utf-8'
    return s.encode(encoding, 'ignore')
def encodeArgument(s):
    """ Encode a command line argument the way encodeFilename does for subprocess calls """
    if isinstance(s, compat_str):
        text = s
    else:
        # Legacy code that uses byte strings
        # Uncomment the following line after fixing all post processors
        # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
        text = s.decode('ascii')
    return encodeFilename(text, True)
def decodeOption(optval):
    """ Return optval as a compat_str, decoding bytes with the preferred encoding; None passes through """
    if optval is None:
        return None
    decoded = optval
    if isinstance(decoded, bytes):
        decoded = decoded.decode(preferredencoding())

    assert isinstance(decoded, compat_str)
    return decoded
def formatSeconds(secs):
    """ Format a number of seconds as H:MM:SS, M:SS or plain seconds """
    minutes, seconds = divmod(secs, 60)
    if secs > 3600:
        hours, minutes = divmod(minutes, 60)
        return '%d:%02d:%02d' % (hours, minutes, seconds)
    if secs > 60:
        return '%d:%02d' % (minutes, seconds)
    return '%d' % secs
def make_HTTPS_handler(params, **kwargs):
    """ Build the HTTPS handler used for all requests.

    params is the options dictionary; its 'nocheckcertificate' entry
    disables certificate verification.  Additional keyword arguments are
    passed through to YoutubeDLHTTPSHandler.
    """
    opts_no_check_certificate = params.get('nocheckcertificate', False)
    if hasattr(ssl, 'create_default_context'):  # Python >= 3.4 or 2.7.9
        # We are the client authenticating servers, hence SERVER_AUTH;
        # CLIENT_AUTH would create a server-side context that does not
        # verify the peer certificate at all.
        context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH)
        if opts_no_check_certificate:
            # check_hostname has to be disabled before verify_mode can be
            # set to CERT_NONE
            context.check_hostname = False
            context.verify_mode = ssl.CERT_NONE
        try:
            return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
        except TypeError:
            # Python 2.7.8
            # (create_default_context present but HTTPSHandler has no context=)
            pass

    if sys.version_info < (3, 2):
        import httplib

        class HTTPSConnectionV3(httplib.HTTPSConnection):
            def __init__(self, *args, **kwargs):
                httplib.HTTPSConnection.__init__(self, *args, **kwargs)

            def connect(self):
                sock = socket.create_connection((self.host, self.port), self.timeout)
                if getattr(self, '_tunnel_host', False):
                    self.sock = sock
                    self._tunnel()
                # Try TLSv1 first and fall back to the generic handshake
                try:
                    self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_TLSv1)
                except ssl.SSLError:
                    self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_SSLv23)

        return YoutubeDLHTTPSHandler(params, https_conn_class=HTTPSConnectionV3, **kwargs)
    else:  # Python < 3.4
        context = ssl.SSLContext(ssl.PROTOCOL_SSLv23)
        context.verify_mode = (ssl.CERT_NONE
                               if opts_no_check_certificate
                               else ssl.CERT_REQUIRED)
        context.set_default_verify_paths()
        return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
class ExtractorError(Exception):
    """Error during info extraction."""

    def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
        """

        # Network problems and unavailable videos are never youtube-dl bugs
        if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
            expected = True

        if video_id is not None:
            msg = video_id + ': ' + msg
        if cause:
            msg += ' (caused by %r)' % cause
        if not expected:
            update_cmd = (
                'type youtube-dl -U to update'
                if ytdl_is_updateable()
                else 'see https://yt-dl.org/update on how to update')
            msg += '; please report this issue on https://yt-dl.org/bug .'
            msg += ' Make sure you are using the latest version; %s.' % update_cmd
            msg += ' Be sure to call youtube-dl with the --verbose flag and include its complete output.'
        super(ExtractorError, self).__init__(msg)

        self.traceback = tb
        self.exc_info = sys.exc_info()  # preserve original exception
        self.cause = cause
        self.video_id = video_id

    def format_traceback(self):
        """ Return the stored traceback as a string, or None if there is none """
        if self.traceback is None:
            return None
        return ''.join(traceback.format_tb(self.traceback))
class UnsupportedError(ExtractorError):
    """ Expected extraction error raised for a URL that is not supported """

    def __init__(self, url):
        message = 'Unsupported URL: %s' % url
        super(UnsupportedError, self).__init__(message, expected=True)
        self.url = url
class RegexNotFoundError(ExtractorError):
    """Raised when a regular expression did not match"""
class DownloadError(Exception):
    """Download Error exception.

    FileDownloader objects may throw this exception when they are not
    configured to continue on errors. Instances carry the error message.
    """

    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        Exception.__init__(self, msg)
        self.exc_info = exc_info
class SameFileError(Exception):
    """Same File exception.

    Thrown by FileDownloader objects when they detect that multiple files
    would have to be downloaded to the same file on disk.
    """
class PostProcessingError(Exception):
    """Post Processing exception.

    A PostProcessor's .run() method may raise this exception to signal
    that the postprocessing task failed.
    """

    def __init__(self, msg):
        """ msg is the error description; it is kept in the msg attribute """
        self.msg = msg
class MaxDownloadsReached(Exception):
    """ Raised once the --max-downloads limit has been reached. """
class UnavailableVideoError(Exception):
    """Unavailable Format exception.

    Thrown when a video is requested in a format that is not available
    for that video.
    """
class ContentTooShortError(Exception):
    """Content Too Short exception.

    FileDownloader objects may raise this exception when a downloaded file
    is smaller than what the server announced first, which indicates that
    the connection was probably interrupted.
    """
    # Both in bytes
    downloaded = None
    expected = None

    def __init__(self, downloaded, expected):
        self.downloaded, self.expected = downloaded, expected
550 def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
551 hc = http_class(*args, **kwargs)
552 source_address = ydl_handler._params.get('source_address')
553 if source_address is not None:
554 sa = (source_address, 0)
555 if hasattr(hc, 'source_address'): # Python 2.7+
556 hc.source_address = sa
558 def _hc_connect(self, *args, **kwargs):
559 sock = compat_socket_create_connection(
560 (self.host, self.port), self.timeout, sa)
562 self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file)
565 hc.connect = functools.partial(_hc_connect, hc)
570 class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
571 """Handler for HTTP requests and responses.
573 This class, when installed with an OpenerDirector, automatically adds
574 the standard headers to every HTTP request and handles gzipped and
575 deflated responses from web servers. If compression is to be avoided in
576 a particular request, the original request in the program code only has
577 to include the HTTP header "Youtubedl-No-Compression", which will be
578 removed before making the real request.
580 Part of this code was copied from:
582 http://techknack.net/python-urllib2-handlers/
584 Andrew Rowls, the author of that code, agreed to release it to the
588 def __init__(self, params, *args, **kwargs):
589 compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
590 self._params = params
592 def http_open(self, req):
593 return self.do_open(functools.partial(
594 _create_http_connection, self, compat_http_client.HTTPConnection, False),
600 return zlib.decompress(data, -zlib.MAX_WBITS)
602 return zlib.decompress(data)
605 def addinfourl_wrapper(stream, headers, url, code):
606 if hasattr(compat_urllib_request.addinfourl, 'getcode'):
607 return compat_urllib_request.addinfourl(stream, headers, url, code)
608 ret = compat_urllib_request.addinfourl(stream, headers, url)
612 def http_request(self, req):
613 for h, v in std_headers.items():
614 if h not in req.headers:
616 if 'Youtubedl-no-compression' in req.headers:
617 if 'Accept-encoding' in req.headers:
618 del req.headers['Accept-encoding']
619 del req.headers['Youtubedl-no-compression']
620 if 'Youtubedl-user-agent' in req.headers:
621 if 'User-agent' in req.headers:
622 del req.headers['User-agent']
623 req.headers['User-agent'] = req.headers['Youtubedl-user-agent']
624 del req.headers['Youtubedl-user-agent']
626 if sys.version_info < (2, 7) and '#' in req.get_full_url():
627 # Python 2.6 is brain-dead when it comes to fragments
628 req._Request__original = req._Request__original.partition('#')[0]
629 req._Request__r_type = req._Request__r_type.partition('#')[0]
633 def http_response(self, req, resp):
636 if resp.headers.get('Content-encoding', '') == 'gzip':
637 content = resp.read()
638 gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
640 uncompressed = io.BytesIO(gz.read())
641 except IOError as original_ioerror:
642 # There may be junk at the end of the file
643 # See http://stackoverflow.com/q/4928560/35070 for details
644 for i in range(1, 1024):
646 gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
647 uncompressed = io.BytesIO(gz.read())
652 raise original_ioerror
653 resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
654 resp.msg = old_resp.msg
656 if resp.headers.get('Content-encoding', '') == 'deflate':
657 gz = io.BytesIO(self.deflate(resp.read()))
658 resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
659 resp.msg = old_resp.msg
662 https_request = http_request
663 https_response = http_response
class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
    """ HTTPS handler that opens its connections through _create_http_connection """

    def __init__(self, params, https_conn_class=None, *args, **kwargs):
        compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
        if not https_conn_class:
            https_conn_class = compat_http_client.HTTPSConnection
        self._https_conn_class = https_conn_class
        self._params = params

    def https_open(self, req):
        connection_factory = functools.partial(
            _create_http_connection, self, self._https_conn_class, True)
        return self.do_open(connection_factory, req)
def parse_iso8601(date_str, delimiter='T'):
    """ Return a UNIX timestamp from the given date """

    if date_str is None:
        return None

    # A trailing 'Z' or '+hh:mm' style offset (and any fractional seconds
    # in front of it) is cut off before parsing
    offset = datetime.timedelta()
    tz_match = re.search(
        r'(\.[0-9]+)?(?:Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
        date_str)
    if tz_match:
        date_str = date_str[:-len(tz_match.group(0))]
        if tz_match.group('sign'):
            direction = 1 if tz_match.group('sign') == '+' else -1
            offset = datetime.timedelta(
                hours=direction * int(tz_match.group('hours')),
                minutes=direction * int(tz_match.group('minutes')))

    date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
    local_time = datetime.datetime.strptime(date_str, date_format)
    return calendar.timegm((local_time - offset).timetuple())
703 def unified_strdate(date_str, day_first=True):
704 """Return a string with the date in the format YYYYMMDD"""
710 date_str = date_str.replace(',', ' ')
711 # %z (UTC offset) is only supported in python>=3.2
712 date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str)
713 # Remove AM/PM + timezone
714 date_str = re.sub(r'(?i)\s*(?:AM|PM)\s+[A-Z]+', '', date_str)
716 format_expressions = [
721 '%b %dst %Y %I:%M%p',
722 '%b %dnd %Y %I:%M%p',
723 '%b %dth %Y %I:%M%p',
728 '%Y-%m-%d %H:%M:%S.%f',
731 '%Y-%m-%dT%H:%M:%SZ',
732 '%Y-%m-%dT%H:%M:%S.%fZ',
733 '%Y-%m-%dT%H:%M:%S.%f0Z',
735 '%Y-%m-%dT%H:%M:%S.%f',
739 format_expressions.extend([
746 format_expressions.extend([
752 for expression in format_expressions:
754 upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
757 if upload_date is None:
758 timetuple = email.utils.parsedate_tz(date_str)
760 upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
def determine_ext(url, default_ext='unknown_video'):
    """ Guess the file extension from url, falling back to default_ext """
    if url is None:
        return default_ext
    without_query = url.partition('?')[0]
    candidate = without_query.rpartition('.')[2]
    if not re.match(r'^[A-Za-z0-9]+$', candidate):
        return default_ext
    return candidate
def subtitles_filename(filename, sub_lang, sub_format):
    """ Replace the extension of filename with '<sub_lang>.<sub_format>' """
    stem = filename.rsplit('.', 1)[0]
    return '.'.join((stem, sub_lang, sub_format))
def date_from_str(date_str):
    """
    Return a datetime.date object from a string in the format YYYYMMDD,
    'now', 'today', 'yesterday' or
    (now|today)[+-][0-9](day|week|month|year)(s)?

    Raises ValueError (from strptime) for any other input."""
    today = datetime.date.today()
    if date_str in ('now', 'today'):
        return today
    if date_str == 'yesterday':
        return today - datetime.timedelta(days=1)
    match = re.match(r'(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
    if match is not None:
        sign = match.group('sign')
        time = int(match.group('time'))
        if sign == '-':
            time = -time
        unit = match.group('unit')
        # timedelta knows neither months nor years, so approximate them
        # with 30 and 365 days respectively
        if unit == 'month':
            unit = 'day'
            time *= 30
        elif unit == 'year':
            unit = 'day'
            time *= 365
        # timedelta keyword arguments are plural ('days', 'weeks')
        unit += 's'
        delta = datetime.timedelta(**{unit: time})
        return today + delta
    return datetime.datetime.strptime(date_str, "%Y%m%d").date()
def hyphenate_date(date_str):
    """
    Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format.
    Strings in any other format are returned unchanged."""
    match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
    if match is None:
        return date_str
    return '-'.join(match.groups())
class DateRange(object):
    """Represents a time interval between two dates"""

    def __init__(self, start=None, end=None):
        """start and end must be strings in the format accepted by date"""
        # A missing bound leaves that side of the range open
        self.start = (
            datetime.datetime.min.date() if start is None
            else date_from_str(start))
        self.end = (
            datetime.datetime.max.date() if end is None
            else date_from_str(end))
        if self.start > self.end:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)

    @classmethod
    def day(cls, day):
        """Returns a range that only contains the given day"""
        return cls(day, day)

    def __contains__(self, date):
        """Check if the date is in the range"""
        if isinstance(date, datetime.date):
            candidate = date
        else:
            candidate = date_from_str(date)
        return self.start <= candidate <= self.end

    def __str__(self):
        return '%s - %s' % (self.start.isoformat(), self.end.isoformat())
849 """ Returns the platform name as a compat_str """
850 res = platform.platform()
851 if isinstance(res, bytes):
852 res = res.decode(preferredencoding())
854 assert isinstance(res, compat_str)
858 def _windows_write_string(s, out):
859 """ Returns True if the string was written using special methods,
860 False if it has yet to be written out."""
861 # Adapted from http://stackoverflow.com/a/3259271/35070
864 import ctypes.wintypes
872 fileno = out.fileno()
873 except AttributeError:
874 # If the output stream doesn't have a fileno, it's virtual
876 if fileno not in WIN_OUTPUT_IDS:
879 GetStdHandle = ctypes.WINFUNCTYPE(
880 ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
881 (b"GetStdHandle", ctypes.windll.kernel32))
882 h = GetStdHandle(WIN_OUTPUT_IDS[fileno])
884 WriteConsoleW = ctypes.WINFUNCTYPE(
885 ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
886 ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
887 ctypes.wintypes.LPVOID)((b"WriteConsoleW", ctypes.windll.kernel32))
888 written = ctypes.wintypes.DWORD(0)
890 GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)((b"GetFileType", ctypes.windll.kernel32))
891 FILE_TYPE_CHAR = 0x0002
892 FILE_TYPE_REMOTE = 0x8000
893 GetConsoleMode = ctypes.WINFUNCTYPE(
894 ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
895 ctypes.POINTER(ctypes.wintypes.DWORD))(
896 (b"GetConsoleMode", ctypes.windll.kernel32))
897 INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value
899 def not_a_console(handle):
900 if handle == INVALID_HANDLE_VALUE or handle is None:
902 return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR
903 or GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)
908 def next_nonbmp_pos(s):
910 return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
911 except StopIteration:
915 count = min(next_nonbmp_pos(s), 1024)
918 h, s, count if count else 2, ctypes.byref(written), None)
920 raise OSError('Failed to write string')
921 if not count: # We just wrote a non-BMP character
922 assert written.value == 2
925 assert written.value > 0
926 s = s[written.value:]
def write_string(s, out=None, encoding=None):
    """ Write the unicode string s to out (sys.stderr by default) and flush it """
    if out is None:
        out = sys.stderr
    assert type(s) == compat_str

    if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
        if _windows_write_string(s, out):
            return

    binary_stream = 'b' in getattr(out, 'mode', '')
    # Python 2 lies about mode of sys.stderr
    if binary_stream or sys.version_info[0] < 3:
        out.write(s.encode(encoding or preferredencoding(), 'ignore'))
    elif hasattr(out, 'buffer'):
        enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
        out.buffer.write(s.encode(enc, 'ignore'))
    else:
        out.write(s)
    out.flush()
def bytes_to_intlist(bs):
    """ Return the byte values of bs as a list of ints """
    if not bs:
        return []
    first = bs[0]
    if isinstance(first, int):  # Python 3
        return list(bs)
    return [ord(c) for c in bs]
def intlist_to_bytes(xs):
    """ Pack a list of byte values into a bytes object """
    if not xs:
        return b''
    fmt = '%dB' % len(xs)
    return struct_pack(fmt, *xs)
967 # Cross-platform file locking
968 if sys.platform == 'win32':
969 import ctypes.wintypes
972 class OVERLAPPED(ctypes.Structure):
974 ('Internal', ctypes.wintypes.LPVOID),
975 ('InternalHigh', ctypes.wintypes.LPVOID),
976 ('Offset', ctypes.wintypes.DWORD),
977 ('OffsetHigh', ctypes.wintypes.DWORD),
978 ('hEvent', ctypes.wintypes.HANDLE),
981 kernel32 = ctypes.windll.kernel32
982 LockFileEx = kernel32.LockFileEx
983 LockFileEx.argtypes = [
984 ctypes.wintypes.HANDLE, # hFile
985 ctypes.wintypes.DWORD, # dwFlags
986 ctypes.wintypes.DWORD, # dwReserved
987 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
988 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
989 ctypes.POINTER(OVERLAPPED) # Overlapped
991 LockFileEx.restype = ctypes.wintypes.BOOL
992 UnlockFileEx = kernel32.UnlockFileEx
993 UnlockFileEx.argtypes = [
994 ctypes.wintypes.HANDLE, # hFile
995 ctypes.wintypes.DWORD, # dwReserved
996 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
997 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
998 ctypes.POINTER(OVERLAPPED) # Overlapped
1000 UnlockFileEx.restype = ctypes.wintypes.BOOL
1001 whole_low = 0xffffffff
1002 whole_high = 0x7fffffff
1004 def _lock_file(f, exclusive):
1005 overlapped = OVERLAPPED()
1006 overlapped.Offset = 0
1007 overlapped.OffsetHigh = 0
1008 overlapped.hEvent = 0
1009 f._lock_file_overlapped_p = ctypes.pointer(overlapped)
1010 handle = msvcrt.get_osfhandle(f.fileno())
1011 if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
1012 whole_low, whole_high, f._lock_file_overlapped_p):
1013 raise OSError('Locking file failed: %r' % ctypes.FormatError())
1015 def _unlock_file(f):
1016 assert f._lock_file_overlapped_p
1017 handle = msvcrt.get_osfhandle(f.fileno())
1018 if not UnlockFileEx(handle, 0,
1019 whole_low, whole_high, f._lock_file_overlapped_p):
1020 raise OSError('Unlocking file failed: %r' % ctypes.FormatError())
1025 def _lock_file(f, exclusive):
1026 fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)
1028 def _unlock_file(f):
1029 fcntl.flock(f, fcntl.LOCK_UN)
class locked_file(object):
    """ Context manager around a file that holds a lock on it while open """

    def __init__(self, filename, mode, encoding=None):
        assert mode in ['r', 'a', 'w']
        self.f = io.open(filename, mode, encoding=encoding)
        self.mode = mode

    def __enter__(self):
        # Readers share the lock, writers need it exclusively
        wants_exclusive = self.mode != 'r'
        try:
            _lock_file(self.f, wants_exclusive)
        except IOError:
            self.f.close()
            raise
        return self

    def __exit__(self, etype, value, traceback):
        try:
            _unlock_file(self.f)
        finally:
            self.f.close()

    def __iter__(self):
        return iter(self.f)

    def write(self, *args):
        return self.f.write(*args)

    def read(self, *args):
        return self.f.read(*args)
def get_filesystem_encoding():
    """ Return the filesystem encoding, defaulting to 'utf-8' when it is unknown """
    reported = sys.getfilesystemencoding()
    if reported is None:
        return 'utf-8'
    return reported
def shell_quote(args):
    """ Join args into a single shell-quoted command line string """
    encoding = get_filesystem_encoding()

    def _as_text(arg):
        # We may get a filename encoded with 'encodeFilename'
        if isinstance(arg, bytes):
            return arg.decode(encoding)
        return arg

    return ' '.join(pipes.quote(_as_text(a)) for a in args)
def takewhile_inclusive(pred, seq):
    """ Like itertools.takewhile, but include the latest evaluated element
        (the first element so that Not pred(e)) """
    for item in seq:
        yield item
        if pred(item):
            continue
        return
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use. """

    # The data travels JSON-encoded in the fragment of the URL
    payload = {'__youtubedl_smuggle': json.dumps(data)}
    return url + '#' + compat_urllib_parse.urlencode(payload)
def unsmuggle_url(smug_url, default=None):
    """ Split a URL built by smuggle_url into (url, data).

    URLs without smuggled data are returned as (smug_url, default).
    """
    if '#__youtubedl_smuggle' not in smug_url:
        return smug_url, default
    url, _, fragment = smug_url.rpartition('#')
    encoded = compat_parse_qs(fragment)['__youtubedl_smuggle'][0]
    return url, json.loads(encoded)
def format_bytes(bytes):
    """ Format a byte count with a binary unit suffix, or 'N/A' for None """
    if bytes is None:
        return 'N/A'
    if type(bytes) is str:
        bytes = float(bytes)
    # log() is undefined for zero, which simply belongs to the 'B' unit
    exponent = 0 if bytes == 0.0 else int(math.log(bytes, 1024.0))
    units = ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB']
    suffix = units[exponent]
    return '%.2f%s' % (float(bytes) / float(1024 ** exponent), suffix)
1119 def parse_filesize(s):
1123 # The lower-case forms are of course incorrect and unofficial,
1124 # but we support those too
1162 units_re = '|'.join(re.escape(u) for u in _UNIT_TABLE)
1164 r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)' % units_re, s)
1168 num_str = m.group('num').replace(',', '.')
1169 mult = _UNIT_TABLE[m.group('unit')]
1170 return int(float(num_str) * mult)
1173 def get_term_width():
1174 columns = compat_getenv('COLUMNS', None)
1179 sp = subprocess.Popen(
1181 stdout=subprocess.PIPE, stderr=subprocess.PIPE)
1182 out, err = sp.communicate()
1183 return int(out.split()[1])
def month_by_name(name):
    """ Return the number of a month by (locale-independently) English name """

    ENGLISH_NAMES = [
        'January', 'February', 'March', 'April', 'May', 'June',
        'July', 'August', 'September', 'October', 'November', 'December']
    if name not in ENGLISH_NAMES:
        return None
    return ENGLISH_NAMES.index(name) + 1
def fix_xml_ampersands(xml_str):
    """Replace every '&' that does not start an XML entity by '&amp;'"""
    bare_ampersand = r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)'
    return re.sub(bare_ampersand, '&amp;', xml_str)
def setproctitle(title):
    """ Set the process name through prctl(); silently does nothing when that is unavailable """
    assert isinstance(title, compat_str)
    try:
        libc = ctypes.cdll.LoadLibrary("libc.so.6")
    except OSError:
        return
    title_bytes = title.encode('utf-8')
    # Initialising the buffer from the bytes makes ctypes allocate one
    # extra byte, so prctl() always receives a NUL-terminated string
    # (a buffer of exactly len(title_bytes) would not be terminated).
    buf = ctypes.create_string_buffer(title_bytes)
    try:
        # 15 is PR_SET_NAME
        libc.prctl(15, buf, 0, 0, 0)
    except AttributeError:
        return  # Strange libc, just skip this
def remove_start(s, start):
    """ Return s without the prefix start, or s unchanged if it does not begin with it """
    return s[len(start):] if s.startswith(start) else s
def remove_end(s, end):
    """ Return s without the suffix end, or s unchanged if it does not end with it """
    # An empty suffix must be special-cased: s[:-0] would be the empty string
    if end and s.endswith(end):
        return s[:-len(end)]
    return s
def url_basename(url):
    """ Return the last component of the path of url """
    trimmed_path = compat_urlparse.urlparse(url).path.strip('/')
    return trimmed_path.rpartition('/')[2]
class HEADRequest(compat_urllib_request.Request):
    """ Request that is sent with the HEAD method """

    def get_method(self):
        return 'HEAD'
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    """ Convert v to int (times invscale, floor-divided by scale).

    If get_attr is given, the attribute of that name is read from v first.
    None and the empty string yield default.
    """
    if get_attr and v is not None:
        v = getattr(v, get_attr, None)
    if v is None or v == '':
        return default
    return int(v) * invscale // scale
def str_or_none(v, default=None):
    """ Convert v to compat_str, or return default when v is None """
    if v is None:
        return default
    return compat_str(v)
def str_to_int(int_str):
    """ A more relaxed version of int_or_none """
    if int_str is None:
        return None
    # Drop thousands separators and plus signs before converting
    digits = re.sub(r'[,\.\+]', '', int_str)
    return int(digits)
def float_or_none(v, scale=1, invscale=1, default=None):
    """ Convert v to float (times invscale, divided by scale), or return default when v is None """
    if v is None:
        return default
    return float(v) * invscale / scale
1271 def parse_duration(s):
1272 if not isinstance(s, basestring if sys.version_info < (3, 0) else compat_str):
1280 (?P<only_mins>[0-9.]+)\s*(?:mins?|minutes?)\s*|
1281 (?P<only_hours>[0-9.]+)\s*(?:hours?)|
1284 (?:(?P<hours>[0-9]+)\s*(?:[:h]|hours?)\s*)?
1285 (?P<mins>[0-9]+)\s*(?:[:m]|mins?|minutes?)\s*
1287 (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*(?:s|secs?|seconds?)?
1292 if m.group('only_mins'):
1293 return float_or_none(m.group('only_mins'), invscale=60)
1294 if m.group('only_hours'):
1295 return float_or_none(m.group('only_hours'), invscale=60 * 60)
1297 res += int(m.group('secs'))
1299 res += int(m.group('mins')) * 60
1300 if m.group('hours'):
1301 res += int(m.group('hours')) * 60 * 60
1303 res += float(m.group('ms'))
def prepend_extension(filename, ext):
    """ Insert ext in front of the real extension of filename """
    stem, suffix = os.path.splitext(filename)
    return '{0}.{1}{2}'.format(stem, ext, suffix)
def check_executable(exe, args=[]):
    """ Checks if the given binary is installed somewhere in PATH, and returns its name.
    args can be a list of arguments for a short output (like -version) """
    command = [exe] + args
    try:
        process = subprocess.Popen(
            command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        process.communicate()
    except OSError:
        return False
    return exe
def get_exe_version(exe, args=['--version'],
                    version_re=None, unrecognized='present'):
    """ Returns the version of the specified executable,
    or False if the executable is not present """
    try:
        process = subprocess.Popen(
            [exe] + args,
            stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
        output = process.communicate()[0]
    except OSError:
        return False
    if isinstance(output, bytes):  # Python 2.x
        output = output.decode('ascii', 'ignore')
    return detect_exe_version(output, version_re, unrecognized)
def detect_exe_version(output, version_re=None, unrecognized='present'):
    """ Extract the version from a program's output; unrecognized is returned if none is found """
    assert isinstance(output, compat_str)
    pattern = r'version\s+([-0-9._a-zA-Z]+)' if version_re is None else version_re
    found = re.search(pattern, output)
    if not found:
        return unrecognized
    return found.group(1)
class PagedList(object):
    """ Base class of paged lists; subclasses provide getslice() """

    def __len__(self):
        # This is only useful for tests
        everything = self.getslice()
        return len(everything)
class OnDemandPagedList(PagedList):
    """ Paged list that requests pages one by one until a slice is satisfied """

    def __init__(self, pagefunc, pagesize):
        # pagefunc(pagenum) returns an iterable with the entries of that page
        self._pagefunc = pagefunc
        # Number of entries on a full page
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        """ Return the entries with indices in [start, end) as a list """
        res = []
        # Begin with the page that contains start
        for pagenum in itertools.count(start // self._pagesize):
            # Indices of the first entry of this page and of the next one
            firstid = pagenum * self._pagesize
            nextfirstid = pagenum * self._pagesize + self._pagesize
            if start >= nextfirstid:
                continue

            page_results = list(self._pagefunc(pagenum))

            # Offset of start inside this page (0 if start lies before it)
            startv = (
                start % self._pagesize
                if firstid <= start < nextfirstid
                else 0)

            # Offset of end inside this page (None if end lies beyond it)
            endv = (
                ((end - 1) % self._pagesize) + 1
                if (end is not None and firstid <= end <= nextfirstid)
                else None)

            if startv != 0 or endv is not None:
                page_results = page_results[startv:endv]
            res.extend(page_results)

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            if len(page_results) + startv < self._pagesize:
                break

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == nextfirstid:
                break
        return res
class InAdvancePagedList(PagedList):
    """ Paged list for which the number of pages is known beforehand. """

    def __init__(self, pagefunc, pagecount, pagesize):
        # pagefunc(pagenum) returns an iterable with the entries of the
        # 0-based page pagenum; pagecount is the total number of pages and
        # pagesize the length of a full page.
        self._pagefunc = pagefunc
        self._pagecount = pagecount
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        """ Return the entries with index in [start, end) as a list.
        end=None means "up to the last entry". """
        if end is not None and end <= start:
            # Empty range; a negative only_more below would otherwise cut
            # the page from the wrong side and return entries.
            return []

        res = []
        start_page = start // self._pagesize
        # Never ask for a page beyond the known page count, even if end
        # points past the last entry
        end_page = (
            self._pagecount if end is None
            else min(self._pagecount, end // self._pagesize + 1))
        skip_elems = start - start_page * self._pagesize
        only_more = None if end is None else end - start
        for pagenum in range(start_page, end_page):
            page = list(self._pagefunc(pagenum))
            if skip_elems:
                # Only the first fetched page has leading entries to drop
                page = page[skip_elems:]
                skip_elems = None
            if only_more is not None:
                if len(page) < only_more:
                    only_more -= len(page)
                else:
                    # This page completes the requested range
                    page = page[:only_more]
                    res.extend(page)
                    break
            res.extend(page)
        return res
def uppercase_escape(s):
    """ Replace every \\UXXXXXXXX escape sequence (backslash, capital U and
    eight hex digits) in s with the character it denotes. The rest of the
    string is left untouched. """
    unicode_escape = codecs.getdecoder('unicode_escape')
    return re.sub(
        r'\\U[0-9a-fA-F]{8}',
        # The decoder returns a (text, consumed length) pair
        lambda m: unicode_escape(m.group(0))[0],
        s)
def escape_rfc3986(s):
    """ Percent-encode s (RFC 3986 style), leaving the reserved URL
    characters and existing '%' escapes untouched. """
    # On Python 2 a unicode object is turned into UTF-8 bytes before it is
    # quoted. The name "unicode" is only evaluated on Python 2 thanks to
    # the short-circuiting "and".
    is_py2_text = sys.version_info < (3, 0) and isinstance(s, unicode)
    if is_py2_text:
        s = s.encode('utf-8')
    return compat_urllib_parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")
def escape_url(url):
    """Escape URL as suggested by RFC 3986

    The path, params, query and fragment components are escaped with
    escape_rfc3986(); the scheme and the network location are kept as
    they are. Returns the reassembled URL. """
    url_parsed = compat_urllib_parse_urlparse(url)
    return url_parsed._replace(
        path=escape_rfc3986(url_parsed.path),
        params=escape_rfc3986(url_parsed.params),
        query=escape_rfc3986(url_parsed.query),
        fragment=escape_rfc3986(url_parsed.fragment)
    ).geturl()
# struct_pack / struct_unpack: versions of struct.pack / struct.unpack that
# accept a text format string on every supported Python version.
try:
    # Probe whether struct accepts a text (non-bytes) format string
    struct.pack('!I', 0)
except TypeError:
    # In Python 2.6 (and some 2.7 versions), struct requires a bytes argument
    def struct_pack(spec, *args):
        if isinstance(spec, compat_str):
            spec = spec.encode('ascii')
        return struct.pack(spec, *args)

    def struct_unpack(spec, *args):
        if isinstance(spec, compat_str):
            spec = spec.encode('ascii')
        return struct.unpack(spec, *args)
else:
    # Text format strings work natively, no wrapper needed
    struct_pack = struct.pack
    struct_unpack = struct.unpack
def read_batch_urls(batch_fd):
    """ Read URLs from the open batch file batch_fd, one per line.

    Lines may be bytes (decoded as UTF-8) or text. A leading byte order
    mark and surrounding whitespace are removed; empty lines and lines
    starting with '#', ';' or ']' are skipped. batch_fd is closed before
    the list of URLs is returned. """
    def fixup(url):
        if isinstance(url, bytes):
            url = url.decode('utf-8', 'replace')
        # A UTF-8 BOM shows up as '\ufeff' once the data has been decoded as
        # UTF-8, and as '\xef\xbb\xbf' when the raw bytes were decoded with a
        # single-byte encoding such as Latin-1. Strip either form.
        for bom in ('\ufeff', '\xef\xbb\xbf'):
            if url.startswith(bom):
                url = url[len(bom):]
                break
        url = url.strip()
        if url.startswith(('#', ';', ']')):
            return False
        return url

    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]
def urlencode_postdata(*args, **kargs):
    """ URL-encode the given data (same arguments as urlencode) and return
    the result as ASCII-encoded bytes. """
    encoded = compat_urllib_parse.urlencode(*args, **kargs)
    return encoded.encode('ascii')
# etree_iter(node): iterate over the elements of an ElementTree subtree.
try:
    etree_iter = xml.etree.ElementTree.Element.iter
except AttributeError:  # Python <=2.6
    # NOTE: unlike Element.iter, this fallback returns the descendants of n
    # only, not n itself.
    etree_iter = lambda n: n.findall('.//*')
def parse_xml(s):
    """ Parse the XML document in the text string s and return its root
    element. DOCTYPE declarations are ignored. """
    class TreeBuilder(xml.etree.ElementTree.TreeBuilder):
        def doctype(self, name, pubid, system):
            pass  # Ignore doctypes

    parser = xml.etree.ElementTree.XMLParser(target=TreeBuilder())
    # The parser argument is only passed on Python 2.7 and newer
    kwargs = {'parser': parser} if sys.version_info >= (2, 7) else {}
    tree = xml.etree.ElementTree.XML(s.encode('utf-8'), **kwargs)
    # Fix up XML parser in Python 2.x
    if sys.version_info < (3, 0):
        # Make sure every element text is a text string, not a byte string
        for n in etree_iter(tree):
            if n.text is not None:
                if not isinstance(n.text, compat_str):
                    n.text = n.text.decode('utf-8')
    return tree
def parse_age_limit(s):
    """ Convert an age rating string to an integer age limit.

    A one or two digit number with an optional trailing '+' (e.g. '18+')
    is returned as an int; anything else is looked up in US_RATINGS.
    Returns None for None and for unknown ratings. """
    if s is None:
        return None
    m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
    return int(m.group('age')) if m else US_RATINGS.get(s, None)
def strip_jsonp(code):
    """ Remove the JSONP wrapper "callback( ... );" from code and return the
    payload between the parentheses. A trailing '//' comment after the call
    is dropped as well. Text that does not match is returned unchanged. """
    return re.sub(
        r'(?s)^[a-zA-Z0-9_]+\s*\(\s*(.*)\);?\s*?(?://[^\n]*)*$', r'\1', code)
def js_to_json(code):
    """ Convert a JavaScript object literal to JSON.

    Single-quoted strings and bare identifiers are turned into
    double-quoted strings (true, false and null are kept), and a comma
    directly in front of a closing ']' is removed. """
    def fix_kv(m):
        v = m.group(0)
        if v in ('true', 'false', 'null'):
            return v
        if v.startswith('"'):
            # Already a double-quoted string
            return v
        if v.startswith("'"):
            v = v[1:-1]
            # Re-escape the body for double quotes: keep \\, unescape \'
            # and escape bare "
            v = re.sub(r"\\\\|\\'|\"", lambda esc: {
                '\\\\': '\\\\',
                "\\'": "'",
                '"': '\\"',
            }[esc.group(0)], v)
        return '"%s"' % v

    # The string alternatives consume exactly one character or one escape
    # per repetition. This accepts the same literals as a "run of characters
    # followed by an optional escape" group, but cannot backtrack
    # exponentially when a string literal is not terminated.
    res = re.sub(r'''(?x)
        "(?:[^"\\]|\\\\|\\")*"|
        '(?:[^'\\]|\\\\|\\')*'|
        [a-zA-Z_][a-zA-Z_0-9]*
        ''', fix_kv, code)
    res = re.sub(r',(\s*\])', lambda m: m.group(1), res)
    return res
def qualities(quality_ids):
    """ Get a numeric quality value out of a list of possible values

    Returns a function mapping a quality id to its index in quality_ids,
    or to -1 if the id is not in the list. """
    def q(qid):
        try:
            return quality_ids.index(qid)
        except ValueError:
            return -1
    return q
# %-style template with the named fields title, id and ext
# (presumably the default output filename template - confirm against users)
DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
def limit_length(s, length):
    """ Add ellipses to overly long strings

    Returns s unchanged if it has at most length characters, otherwise a
    shortened version ending in '...' that is exactly length characters
    long. None is passed through. """
    if s is None:
        return None
    ELLIPSES = '...'
    if len(s) <= length:
        return s
    if length < len(ELLIPSES):
        # Not even the ellipsis fits; a negative slice bound would produce
        # a result longer than the limit, so cut the ellipsis itself
        return ELLIPSES[:max(length, 0)]
    return s[:length - len(ELLIPSES)] + ELLIPSES
def version_tuple(v):
    """ Split a version string at '.' and '-' and return the parts as a
    tuple of ints, e.g. '2015.01.23' -> (2015, 1, 23). """
    parts = re.split(r'[-.]', v)
    return tuple(map(int, parts))
def is_outdated_version(version, limit, assume_new=True):
    """ Return True if version is older than limit.

    When version is empty, or one of the versions cannot be parsed by
    version_tuple(), the answer is "not outdated" if assume_new is true
    and "outdated" otherwise. """
    if not version:
        return not assume_new
    try:
        return version_tuple(version) < version_tuple(limit)
    except ValueError:
        # A component of one of the versions is not a number
        return not assume_new
def ytdl_is_updateable():
    """ Tell whether youtube-dl can be updated with -U: true when this
    module was loaded by a zipimporter or when sys has a 'frozen'
    attribute. """
    from zipimport import zipimporter

    loaded_from_zip = isinstance(globals().get('__loader__'), zipimporter)
    return loaded_from_zip or hasattr(sys, 'frozen')
def args_to_str(args):
    """ Get a short string representation for a subprocess command: every
    argument is quoted with shlex_quote and the results are joined with
    single spaces. """
    return ' '.join(map(shlex_quote, args))
def urlhandle_detect_ext(url_handle):
    """ Guess a file extension from the Content-Type header of url_handle.

    Returns the subtype of the media type, e.g. 'mp4' for
    'video/mp4; charset=binary'. """
    try:
        url_handle.headers
        getheader = lambda h: url_handle.headers[h]
    except AttributeError:  # Python < 3
        getheader = url_handle.info().getheader

    # Drop parameters such as "; charset=utf-8" so that they do not end up
    # in the extension
    media_type = getheader('Content-Type').split(';')[0].strip()
    return media_type.split("/")[1]
def age_restricted(content_limit, age_limit):
    """ Returns True iff the content should be blocked

    content_limit is the age rating of the content and age_limit the age
    limit set by the user; either may be None. """

    if age_limit is None:  # No limit set
        return False
    if content_limit is None:
        return False  # Content available for everyone
    return age_limit < content_limit
1623 return age_limit < content_limit