2 # -*- coding: utf-8 -*-
4 from __future__ import unicode_literals
31 import xml.etree.ElementTree
40 compat_socket_create_connection,
44 compat_urllib_parse_urlparse,
45 compat_urllib_request,
51 # This is not clearly defined otherwise
52 compiled_regex_type = type(re.compile(''))
55 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0 (Chrome)',
56 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
57 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
58 'Accept-Encoding': 'gzip, deflate',
59 'Accept-Language': 'en-us,en;q=0.5',
def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks.
    """
    try:
        pref = locale.getpreferredencoding()
        # Probe that the reported codec actually exists and can encode;
        # some broken locales report a bogus name.
        'TEST'.encode(pref)
    except Exception:
        # NOTE(review): original used a bare except; narrowed to Exception
        # so KeyboardInterrupt/SystemExit are not swallowed.
        pref = 'UTF-8'

    return pref
78 def write_json_file(obj, fn):
79 """ Encode obj as JSON and write it to fn, atomically if possible """
81 fn = encodeFilename(fn)
82 if sys.version_info < (3, 0) and sys.platform != 'win32':
83 encoding = get_filesystem_encoding()
84 # os.path.basename returns a bytes object, but NamedTemporaryFile
85 # will fail if the filename contains non ascii characters unless we
86 # use a unicode object
87 path_basename = lambda f: os.path.basename(fn).decode(encoding)
88 # the same for os.path.dirname
89 path_dirname = lambda f: os.path.dirname(fn).decode(encoding)
91 path_basename = os.path.basename
92 path_dirname = os.path.dirname
96 'prefix': path_basename(fn) + '.',
97 'dir': path_dirname(fn),
101 # In Python 2.x, json.dump expects a bytestream.
102 # In Python 3.x, it writes to a character stream
103 if sys.version_info < (3, 0):
111 tf = tempfile.NamedTemporaryFile(**args)
116 if sys.platform == 'win32':
117 # Need to remove existing file on Windows, else os.rename raises
118 # WindowsError or FileExistsError.
123 os.rename(tf.name, fn)
132 if sys.version_info >= (2, 7):
def find_xpath_attr(node, xpath, key, val):
    """ Find the xpath xpath[@key=val] """
    # The key and value are interpolated directly into the XPath
    # expression below, so restrict them to known-safe characters.
    assert re.match(r'^[a-zA-Z-]+$', key)
    assert re.match(r'^[a-zA-Z0-9@\s:._-]*$', val)
    expr = "%s[@%s='%s']" % (xpath, key, val)
    return node.find(expr)
140 def find_xpath_attr(node, xpath, key, val):
141 # Here comes the crazy part: In 2.6, if the xpath is a unicode,
142 # .//node does not match if a node is a direct child of . !
143 if isinstance(xpath, unicode):
144 xpath = xpath.encode('ascii')
146 for f in node.findall(xpath):
147 if f.attrib.get(key) == val:
151 # On python2.6 the xml.etree.ElementTree.Element methods don't support
152 # the namespace parameter
155 def xpath_with_ns(path, ns_map):
156 components = [c.split(':') for c in path.split('/')]
160 replaced.append(c[0])
163 replaced.append('{%s}%s' % (ns_map[ns], tag))
164 return '/'.join(replaced)
def xpath_text(node, xpath, name=None, fatal=False):
    """Return the text of the first element matching xpath under node.

    Returns None when the element is missing or empty, unless fatal is
    True, in which case an ExtractorError is raised (using name, or the
    xpath itself, in the message).
    """
    if sys.version_info < (2, 7):  # Crazy 2.6
        xpath = xpath.encode('ascii')

    n = node.find(xpath)
    if n is None or n.text is None:
        if fatal:
            name = xpath if name is None else name
            raise ExtractorError('Could not find XML element %s' % name)
        else:
            return None
    return n.text
def get_element_by_id(id, html):
    """Return the content of the tag with the specified ID in the passed HTML document"""
    # Thin convenience wrapper around the generic attribute search.
    return get_element_by_attribute('id', id, html)
186 def get_element_by_attribute(attribute, value, html):
187 """Return the content of the tag with the specified attribute in the passed HTML document"""
189 m = re.search(r'''(?xs)
191 (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
193 (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
197 ''' % (re.escape(attribute), re.escape(value)), html)
201 res = m.group('content')
203 if res.startswith('"') or res.startswith("'"):
206 return unescapeHTML(res)
def clean_html(html):
    """Clean an HTML snippet into a readable string"""

    if html is None:  # Convenience for sanitizing descriptions etc.
        return html

    # Newline vs <br />
    html = html.replace('\n', ' ')
    html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
    html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
    # Strip html tags
    html = re.sub('<.*?>', '', html)
    # Replace html entities
    html = unescapeHTML(html)
    return html.strip()
226 def sanitize_open(filename, open_mode):
227 """Try to open the given filename, and slightly tweak it if this fails.
229 Attempts to open the given filename. If this fails, it tries to change
230 the filename slightly, step by step, until it's either able to open it
231 or it fails and raises a final exception, like the standard open()
234 It returns the tuple (stream, definitive_file_name).
238 if sys.platform == 'win32':
240 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
241 return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
242 stream = open(encodeFilename(filename), open_mode)
243 return (stream, filename)
244 except (IOError, OSError) as err:
245 if err.errno in (errno.EACCES,):
248 # In case of error, try to remove win32 forbidden chars
249 alt_filename = os.path.join(
250 re.sub('[/<>:"\\|\\\\?\\*]', '#', path_part)
251 for path_part in os.path.split(filename)
253 if alt_filename == filename:
256 # An exception here should be caught in the caller
257 stream = open(encodeFilename(filename), open_mode)
258 return (stream, alt_filename)
def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp"""
    timestamp = None
    timetuple = email.utils.parsedate_tz(timestr)
    if timetuple is not None:
        timestamp = email.utils.mktime_tz(timetuple)
    # None is returned (not raised) for unparseable input, matching callers
    # that treat a missing timestamp as "unknown".
    return timestamp
270 def sanitize_filename(s, restricted=False, is_id=False):
271 """Sanitizes a string so it could be used as part of a filename.
272 If restricted is set, use a stricter subset of allowed characters.
273 Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
275 def replace_insane(char):
276 if char == '?' or ord(char) < 32 or ord(char) == 127:
279 return '' if restricted else '\''
281 return '_-' if restricted else ' -'
282 elif char in '\\/|*<>':
284 if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
286 if restricted and ord(char) > 127:
291 s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)
292 result = ''.join(map(replace_insane, s))
294 while '__' in result:
295 result = result.replace('__', '_')
296 result = result.strip('_')
297 # Common case of "Foreign band name - English song title"
298 if restricted and result.startswith('-_'):
305 def orderedSet(iterable):
306 """ Remove all duplicates from the input iterable """
314 def _htmlentity_transform(entity):
315 """Transforms an HTML entity to a character."""
316 # Known non-numeric HTML entity
317 if entity in compat_html_entities.name2codepoint:
318 return compat_chr(compat_html_entities.name2codepoint[entity])
320 mobj = re.match(r'#(x?[0-9]+)', entity)
322 numstr = mobj.group(1)
323 if numstr.startswith('x'):
325 numstr = '0%s' % numstr
328 return compat_chr(int(numstr, base))
330 # Unknown entity in name, return its literal representation
331 return ('&%s;' % entity)
337 assert type(s) == compat_str
340 r'&([^;]+);', lambda m: _htmlentity_transform(m.group(1)), s)
343 def encodeFilename(s, for_subprocess=False):
345 @param s The name of the file
348 assert type(s) == compat_str
350 # Python 3 has a Unicode API
351 if sys.version_info >= (3, 0):
354 if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
355 # Pass '' directly to use Unicode APIs on Windows 2000 and up
356 # (Detecting Windows NT 4 is tricky because 'major >= 4' would
357 # match Windows 9x series as well. Besides, NT 4 is obsolete.)
358 if not for_subprocess:
361 # For subprocess calls, encode with locale encoding
362 # Refer to http://stackoverflow.com/a/9951851/35070
363 encoding = preferredencoding()
365 encoding = sys.getfilesystemencoding()
368 return s.encode(encoding, 'ignore')
371 def encodeArgument(s):
372 if not isinstance(s, compat_str):
373 # Legacy code that uses byte strings
374 # Uncomment the following line after fixing all post processors
375 # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
376 s = s.decode('ascii')
377 return encodeFilename(s, True)
380 def decodeOption(optval):
383 if isinstance(optval, bytes):
384 optval = optval.decode(preferredencoding())
386 assert isinstance(optval, compat_str)
def formatSeconds(secs):
    """Format a duration in seconds as H:MM:SS, M:SS, or plain seconds."""
    if secs > 3600:
        return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
    elif secs > 60:
        return '%d:%02d' % (secs // 60, secs % 60)
    else:
        return '%d' % secs
399 def make_HTTPS_handler(params, **kwargs):
400 opts_no_check_certificate = params.get('nocheckcertificate', False)
401 if hasattr(ssl, 'create_default_context'): # Python >= 3.4 or 2.7.9
402 context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH)
403 if opts_no_check_certificate:
404 context.check_hostname = False
405 context.verify_mode = ssl.CERT_NONE
407 return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
410 # (create_default_context present but HTTPSHandler has no context=)
413 if sys.version_info < (3, 2):
414 return YoutubeDLHTTPSHandler(params, **kwargs)
416 context = ssl.SSLContext(ssl.PROTOCOL_TLSv1)
417 context.verify_mode = (ssl.CERT_NONE
418 if opts_no_check_certificate
419 else ssl.CERT_REQUIRED)
420 context.set_default_verify_paths()
421 return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
424 class ExtractorError(Exception):
425 """Error during info extraction."""
427 def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
428 """ tb, if given, is the original traceback (so that it can be printed out).
429 If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
432 if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
434 if video_id is not None:
435 msg = video_id + ': ' + msg
437 msg += ' (caused by %r)' % cause
439 if ytdl_is_updateable():
440 update_cmd = 'type youtube-dl -U to update'
442 update_cmd = 'see https://yt-dl.org/update on how to update'
443 msg += '; please report this issue on https://yt-dl.org/bug .'
444 msg += ' Make sure you are using the latest version; %s.' % update_cmd
445 msg += ' Be sure to call youtube-dl with the --verbose flag and include its complete output.'
446 super(ExtractorError, self).__init__(msg)
449 self.exc_info = sys.exc_info() # preserve original exception
451 self.video_id = video_id
453 def format_traceback(self):
454 if self.traceback is None:
456 return ''.join(traceback.format_tb(self.traceback))
class UnsupportedError(ExtractorError):
    """Raised when no extractor can handle the given URL."""

    def __init__(self, url):
        super(UnsupportedError, self).__init__(
            'Unsupported URL: %s' % url, expected=True)
466 class RegexNotFoundError(ExtractorError):
467 """Error when a regex didn't match"""
class DownloadError(Exception):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """

    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        super(DownloadError, self).__init__(msg)
        self.exc_info = exc_info
class SameFileError(Exception):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
class PostProcessingError(Exception):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """

    def __init__(self, msg):
        # Chain to Exception.__init__ so str(e) shows the message too;
        # callers keep reading .msg as before.
        super(PostProcessingError, self).__init__(msg)
        self.msg = msg
class MaxDownloadsReached(Exception):
    """ --max-downloads limit has been reached. """
    pass
class UnavailableVideoError(Exception):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """
class ContentTooShortError(Exception):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """

    def __init__(self, downloaded, expected):
        # Both values are byte counts.
        self.downloaded = downloaded
        self.expected = expected
535 def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
536 hc = http_class(*args, **kwargs)
537 source_address = ydl_handler._params.get('source_address')
538 if source_address is not None:
539 sa = (source_address, 0)
540 if hasattr(hc, 'source_address'): # Python 2.7+
541 hc.source_address = sa
543 def _hc_connect(self, *args, **kwargs):
544 sock = compat_socket_create_connection(
545 (self.host, self.port), self.timeout, sa)
547 self.sock = ssl.wrap_socket(
548 sock, self.key_file, self.cert_file,
549 ssl_version=ssl.PROTOCOL_TLSv1)
552 hc.connect = functools.partial(_hc_connect, hc)
557 class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
558 """Handler for HTTP requests and responses.
560 This class, when installed with an OpenerDirector, automatically adds
561 the standard headers to every HTTP request and handles gzipped and
562 deflated responses from web servers. If compression is to be avoided in
563 a particular request, the original request in the program code only has
564 to include the HTTP header "Youtubedl-No-Compression", which will be
565 removed before making the real request.
567 Part of this code was copied from:
569 http://techknack.net/python-urllib2-handlers/
571 Andrew Rowls, the author of that code, agreed to release it to the
575 def __init__(self, params, *args, **kwargs):
576 compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
577 self._params = params
579 def http_open(self, req):
580 return self.do_open(functools.partial(
581 _create_http_connection, self, compat_http_client.HTTPConnection, False),
587 return zlib.decompress(data, -zlib.MAX_WBITS)
589 return zlib.decompress(data)
592 def addinfourl_wrapper(stream, headers, url, code):
593 if hasattr(compat_urllib_request.addinfourl, 'getcode'):
594 return compat_urllib_request.addinfourl(stream, headers, url, code)
595 ret = compat_urllib_request.addinfourl(stream, headers, url)
599 def http_request(self, req):
600 for h, v in std_headers.items():
601 # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
602 # The dict keys are capitalized because of this bug by urllib
603 if h.capitalize() not in req.headers:
605 if 'Youtubedl-no-compression' in req.headers:
606 if 'Accept-encoding' in req.headers:
607 del req.headers['Accept-encoding']
608 del req.headers['Youtubedl-no-compression']
610 if sys.version_info < (2, 7) and '#' in req.get_full_url():
611 # Python 2.6 is brain-dead when it comes to fragments
612 req._Request__original = req._Request__original.partition('#')[0]
613 req._Request__r_type = req._Request__r_type.partition('#')[0]
617 def http_response(self, req, resp):
620 if resp.headers.get('Content-encoding', '') == 'gzip':
621 content = resp.read()
622 gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
624 uncompressed = io.BytesIO(gz.read())
625 except IOError as original_ioerror:
626 # There may be junk add the end of the file
627 # See http://stackoverflow.com/q/4928560/35070 for details
628 for i in range(1, 1024):
630 gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
631 uncompressed = io.BytesIO(gz.read())
636 raise original_ioerror
637 resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
638 resp.msg = old_resp.msg
640 if resp.headers.get('Content-encoding', '') == 'deflate':
641 gz = io.BytesIO(self.deflate(resp.read()))
642 resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
643 resp.msg = old_resp.msg
646 https_request = http_request
647 https_response = http_response
650 class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
651 def __init__(self, params, https_conn_class=None, *args, **kwargs):
652 compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
653 self._https_conn_class = https_conn_class or compat_http_client.HTTPSConnection
654 self._params = params
656 def https_open(self, req):
658 if hasattr(self, '_context'): # python > 2.6
659 kwargs['context'] = self._context
660 if hasattr(self, '_check_hostname'): # python 3.x
661 kwargs['check_hostname'] = self._check_hostname
662 return self.do_open(functools.partial(
663 _create_http_connection, self, self._https_conn_class, True),
667 def parse_iso8601(date_str, delimiter='T'):
668 """ Return a UNIX timestamp from the given date """
674 r'(\.[0-9]+)?(?:Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
677 timezone = datetime.timedelta()
679 date_str = date_str[:-len(m.group(0))]
680 if not m.group('sign'):
681 timezone = datetime.timedelta()
683 sign = 1 if m.group('sign') == '+' else -1
684 timezone = datetime.timedelta(
685 hours=sign * int(m.group('hours')),
686 minutes=sign * int(m.group('minutes')))
687 date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
688 dt = datetime.datetime.strptime(date_str, date_format) - timezone
689 return calendar.timegm(dt.timetuple())
692 def unified_strdate(date_str, day_first=True):
693 """Return a string with the date in the format YYYYMMDD"""
699 date_str = date_str.replace(',', ' ')
700 # %z (UTC offset) is only supported in python>=3.2
701 date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str)
702 # Remove AM/PM + timezone
703 date_str = re.sub(r'(?i)\s*(?:AM|PM)\s+[A-Z]+', '', date_str)
705 format_expressions = [
710 '%b %dst %Y %I:%M%p',
711 '%b %dnd %Y %I:%M%p',
712 '%b %dth %Y %I:%M%p',
718 '%Y-%m-%d %H:%M:%S.%f',
721 '%Y-%m-%dT%H:%M:%SZ',
722 '%Y-%m-%dT%H:%M:%S.%fZ',
723 '%Y-%m-%dT%H:%M:%S.%f0Z',
725 '%Y-%m-%dT%H:%M:%S.%f',
729 format_expressions.extend([
736 format_expressions.extend([
742 for expression in format_expressions:
744 upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
747 if upload_date is None:
748 timetuple = email.utils.parsedate_tz(date_str)
750 upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
def determine_ext(url, default_ext='unknown_video'):
    """Guess a file extension from the path part of url.

    Returns default_ext when url is None or the candidate extension
    contains anything other than ASCII letters and digits.
    """
    if url is None:
        return default_ext
    # Drop the query string, then take what follows the last dot.
    guess = url.partition('?')[0].rpartition('.')[2]
    if re.match(r'^[A-Za-z0-9]+$', guess):
        return guess
    else:
        return default_ext
def subtitles_filename(filename, sub_lang, sub_format):
    """Build the subtitle filename: <base>.<lang>.<format>."""
    base = filename.rsplit('.', 1)[0]
    return '.'.join((base, sub_lang, sub_format))
768 def date_from_str(date_str):
770 Return a datetime object from a string in the format YYYYMMDD or
771 (now|today)[+-][0-9](day|week|month|year)(s)?"""
772 today = datetime.date.today()
773 if date_str in ('now', 'today'):
775 if date_str == 'yesterday':
776 return today - datetime.timedelta(days=1)
777 match = re.match('(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
778 if match is not None:
779 sign = match.group('sign')
780 time = int(match.group('time'))
783 unit = match.group('unit')
784 # A bad aproximation?
792 delta = datetime.timedelta(**{unit: time})
794 return datetime.datetime.strptime(date_str, "%Y%m%d").date()
def hyphenate_date(date_str):
    """
    Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
    match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
    if match is not None:
        return '-'.join(match.groups())
    else:
        # Pass anything that is not 8 digits through unchanged.
        return date_str
807 class DateRange(object):
808 """Represents a time interval between two dates"""
810 def __init__(self, start=None, end=None):
811 """start and end must be strings in the format accepted by date"""
812 if start is not None:
813 self.start = date_from_str(start)
815 self.start = datetime.datetime.min.date()
817 self.end = date_from_str(end)
819 self.end = datetime.datetime.max.date()
820 if self.start > self.end:
821 raise ValueError('Date range: "%s" , the start date must be before the end date' % self)
825 """Returns a range that only contains the given day"""
828 def __contains__(self, date):
829 """Check if the date is in the range"""
830 if not isinstance(date, datetime.date):
831 date = date_from_str(date)
832 return self.start <= date <= self.end
835 return '%s - %s' % (self.start.isoformat(), self.end.isoformat())
839 """ Returns the platform name as a compat_str """
840 res = platform.platform()
841 if isinstance(res, bytes):
842 res = res.decode(preferredencoding())
844 assert isinstance(res, compat_str)
848 def _windows_write_string(s, out):
849 """ Returns True if the string was written using special methods,
850 False if it has yet to be written out."""
851 # Adapted from http://stackoverflow.com/a/3259271/35070
854 import ctypes.wintypes
862 fileno = out.fileno()
863 except AttributeError:
864 # If the output stream doesn't have a fileno, it's virtual
866 except io.UnsupportedOperation:
867 # Some strange Windows pseudo files?
869 if fileno not in WIN_OUTPUT_IDS:
872 GetStdHandle = ctypes.WINFUNCTYPE(
873 ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
874 (b"GetStdHandle", ctypes.windll.kernel32))
875 h = GetStdHandle(WIN_OUTPUT_IDS[fileno])
877 WriteConsoleW = ctypes.WINFUNCTYPE(
878 ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
879 ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
880 ctypes.wintypes.LPVOID)((b"WriteConsoleW", ctypes.windll.kernel32))
881 written = ctypes.wintypes.DWORD(0)
883 GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)((b"GetFileType", ctypes.windll.kernel32))
884 FILE_TYPE_CHAR = 0x0002
885 FILE_TYPE_REMOTE = 0x8000
886 GetConsoleMode = ctypes.WINFUNCTYPE(
887 ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
888 ctypes.POINTER(ctypes.wintypes.DWORD))(
889 (b"GetConsoleMode", ctypes.windll.kernel32))
890 INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value
892 def not_a_console(handle):
893 if handle == INVALID_HANDLE_VALUE or handle is None:
895 return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR
896 or GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)
901 def next_nonbmp_pos(s):
903 return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
904 except StopIteration:
908 count = min(next_nonbmp_pos(s), 1024)
911 h, s, count if count else 2, ctypes.byref(written), None)
913 raise OSError('Failed to write string')
914 if not count: # We just wrote a non-BMP character
915 assert written.value == 2
918 assert written.value > 0
919 s = s[written.value:]
923 def write_string(s, out=None, encoding=None):
926 assert type(s) == compat_str
928 if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
929 if _windows_write_string(s, out):
932 if ('b' in getattr(out, 'mode', '') or
933 sys.version_info[0] < 3): # Python 2 lies about mode of sys.stderr
934 byt = s.encode(encoding or preferredencoding(), 'ignore')
936 elif hasattr(out, 'buffer'):
937 enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
938 byt = s.encode(enc, 'ignore')
939 out.buffer.write(byt)
945 def bytes_to_intlist(bs):
948 if isinstance(bs[0], int): # Python 3
951 return [ord(c) for c in bs]
954 def intlist_to_bytes(xs):
957 return struct_pack('%dB' % len(xs), *xs)
960 # Cross-platform file locking
961 if sys.platform == 'win32':
962 import ctypes.wintypes
965 class OVERLAPPED(ctypes.Structure):
967 ('Internal', ctypes.wintypes.LPVOID),
968 ('InternalHigh', ctypes.wintypes.LPVOID),
969 ('Offset', ctypes.wintypes.DWORD),
970 ('OffsetHigh', ctypes.wintypes.DWORD),
971 ('hEvent', ctypes.wintypes.HANDLE),
974 kernel32 = ctypes.windll.kernel32
975 LockFileEx = kernel32.LockFileEx
976 LockFileEx.argtypes = [
977 ctypes.wintypes.HANDLE, # hFile
978 ctypes.wintypes.DWORD, # dwFlags
979 ctypes.wintypes.DWORD, # dwReserved
980 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
981 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
982 ctypes.POINTER(OVERLAPPED) # Overlapped
984 LockFileEx.restype = ctypes.wintypes.BOOL
985 UnlockFileEx = kernel32.UnlockFileEx
986 UnlockFileEx.argtypes = [
987 ctypes.wintypes.HANDLE, # hFile
988 ctypes.wintypes.DWORD, # dwReserved
989 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
990 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
991 ctypes.POINTER(OVERLAPPED) # Overlapped
993 UnlockFileEx.restype = ctypes.wintypes.BOOL
994 whole_low = 0xffffffff
995 whole_high = 0x7fffffff
997 def _lock_file(f, exclusive):
998 overlapped = OVERLAPPED()
999 overlapped.Offset = 0
1000 overlapped.OffsetHigh = 0
1001 overlapped.hEvent = 0
1002 f._lock_file_overlapped_p = ctypes.pointer(overlapped)
1003 handle = msvcrt.get_osfhandle(f.fileno())
1004 if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
1005 whole_low, whole_high, f._lock_file_overlapped_p):
1006 raise OSError('Locking file failed: %r' % ctypes.FormatError())
1008 def _unlock_file(f):
1009 assert f._lock_file_overlapped_p
1010 handle = msvcrt.get_osfhandle(f.fileno())
1011 if not UnlockFileEx(handle, 0,
1012 whole_low, whole_high, f._lock_file_overlapped_p):
1013 raise OSError('Unlocking file failed: %r' % ctypes.FormatError())
1018 def _lock_file(f, exclusive):
1019 fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)
1021 def _unlock_file(f):
1022 fcntl.flock(f, fcntl.LOCK_UN)
1025 class locked_file(object):
1026 def __init__(self, filename, mode, encoding=None):
1027 assert mode in ['r', 'a', 'w']
1028 self.f = io.open(filename, mode, encoding=encoding)
1031 def __enter__(self):
1032 exclusive = self.mode != 'r'
1034 _lock_file(self.f, exclusive)
1040 def __exit__(self, etype, value, traceback):
1042 _unlock_file(self.f)
1049 def write(self, *args):
1050 return self.f.write(*args)
1052 def read(self, *args):
1053 return self.f.read(*args)
def get_filesystem_encoding():
    """Return sys.getfilesystemencoding(), falling back to 'utf-8' when it is None."""
    enc = sys.getfilesystemencoding()
    if enc is None:
        enc = 'utf-8'
    return enc
def shell_quote(args):
    """Quote a list of arguments for display as a single shell command line."""
    quoted_args = []
    encoding = get_filesystem_encoding()
    for a in args:
        if isinstance(a, bytes):
            # We may get a filename encoded with 'encodeFilename'
            a = a.decode(encoding)
        quoted_args.append(pipes.quote(a))
    return ' '.join(quoted_args)
1072 def takewhile_inclusive(pred, seq):
1073 """ Like itertools.takewhile, but include the latest evaluated element
1074 (the first element so that Not pred(e)) """
1081 def smuggle_url(url, data):
1082 """ Pass additional data in a URL for internal use. """
1084 sdata = compat_urllib_parse.urlencode(
1085 {'__youtubedl_smuggle': json.dumps(data)})
1086 return url + '#' + sdata
1089 def unsmuggle_url(smug_url, default=None):
1090 if '#__youtubedl_smuggle' not in smug_url:
1091 return smug_url, default
1092 url, _, sdata = smug_url.rpartition('#')
1093 jsond = compat_parse_qs(sdata)['__youtubedl_smuggle'][0]
1094 data = json.loads(jsond)
def format_bytes(bytes):
    """Format a byte count as a human-readable string, e.g. '1.00MiB'.

    Accepts ints, floats or numeric strings; returns 'N/A' for None.
    """
    if bytes is None:
        return 'N/A'
    if type(bytes) is str:
        bytes = float(bytes)
    if bytes == 0.0:
        # math.log(0) would raise, and 0 bytes is simply exponent 0.
        exponent = 0
    else:
        exponent = int(math.log(bytes, 1024.0))
    suffix = ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB'][exponent]
    converted = float(bytes) / float(1024 ** exponent)
    return '%.2f%s' % (converted, suffix)
1112 def parse_filesize(s):
1116 # The lower-case forms are of course incorrect and inofficial,
1117 # but we support those too
1155 units_re = '|'.join(re.escape(u) for u in _UNIT_TABLE)
1157 r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)' % units_re, s)
1161 num_str = m.group('num').replace(',', '.')
1162 mult = _UNIT_TABLE[m.group('unit')]
1163 return int(float(num_str) * mult)
1166 def get_term_width():
1167 columns = compat_getenv('COLUMNS', None)
1172 sp = subprocess.Popen(
1174 stdout=subprocess.PIPE, stderr=subprocess.PIPE)
1175 out, err = sp.communicate()
1176 return int(out.split()[1])
def month_by_name(name):
    """ Return the number of a month by (locale-independently) English name """

    ENGLISH_NAMES = [
        'January', 'February', 'March', 'April', 'May', 'June',
        'July', 'August', 'September', 'October', 'November', 'December']
    try:
        return ENGLISH_NAMES.index(name) + 1
    except ValueError:
        # Unknown month names yield None rather than raising.
        return None
1194 def fix_xml_ampersands(xml_str):
1195 """Replace all the '&' by '&' in XML"""
1197 r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
1202 def setproctitle(title):
1203 assert isinstance(title, compat_str)
1205 libc = ctypes.cdll.LoadLibrary("libc.so.6")
1208 title_bytes = title.encode('utf-8')
1209 buf = ctypes.create_string_buffer(len(title_bytes))
1210 buf.value = title_bytes
1212 libc.prctl(15, buf, 0, 0, 0)
1213 except AttributeError:
1214 return # Strange libc, just skip this
def remove_start(s, start):
    """Return s with the prefix start removed, or s unchanged if absent."""
    if s.startswith(start):
        return s[len(start):]
    return s
def remove_end(s, end):
    """Return s with the suffix end removed, or s unchanged if absent."""
    # The 'end and' guard matters: s[:-0] would wrongly give '' for end == ''.
    if end and s.endswith(end):
        return s[:-len(end)]
    return s
def url_basename(url):
    """Return the final path segment of url, ignoring query and fragment."""
    parsed_path = compat_urlparse.urlparse(url).path
    segments = parsed_path.strip('/').split('/')
    return segments[-1]
1234 class HEADRequest(compat_urllib_request.Request):
1235 def get_method(self):
1239 def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
1242 v = getattr(v, get_attr, None)
1245 return default if v is None else (int(v) * invscale // scale)
def str_or_none(v, default=None):
    """Coerce v to compat_str, returning default when v is None."""
    if v is None:
        return default
    return compat_str(v)
def str_to_int(int_str):
    """ A more relaxed version of int_or_none """
    if int_str is None:
        return None
    # Strip digit separators and an explicit plus sign before parsing.
    int_str = re.sub(r'[,\.\+]', '', int_str)
    return int(int_str)
def float_or_none(v, scale=1, invscale=1, default=None):
    """Coerce v to float (scaled by invscale/scale), or default when v is None."""
    if v is None:
        return default
    return float(v) * invscale / scale
1264 def parse_duration(s):
1265 if not isinstance(s, basestring if sys.version_info < (3, 0) else compat_str):
1273 (?P<only_mins>[0-9.]+)\s*(?:mins?|minutes?)\s*|
1274 (?P<only_hours>[0-9.]+)\s*(?:hours?)|
1277 (?:(?P<hours>[0-9]+)\s*(?:[:h]|hours?)\s*)?
1278 (?P<mins>[0-9]+)\s*(?:[:m]|mins?|minutes?)\s*
1280 (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*(?:s|secs?|seconds?)?
1285 if m.group('only_mins'):
1286 return float_or_none(m.group('only_mins'), invscale=60)
1287 if m.group('only_hours'):
1288 return float_or_none(m.group('only_hours'), invscale=60 * 60)
1290 res += int(m.group('secs'))
1292 res += int(m.group('mins')) * 60
1293 if m.group('hours'):
1294 res += int(m.group('hours')) * 60 * 60
1296 res += float(m.group('ms'))
def prepend_extension(filename, ext):
    """Insert ext in front of the real extension: ('a.mp4', 'temp') -> 'a.temp.mp4'."""
    root, real_ext = os.path.splitext(filename)
    return '%s.%s%s' % (root, ext, real_ext)
def check_executable(exe, args=[]):
    """ Checks if the given binary is installed somewhere in PATH, and returns its name.
    args can be a list of arguments for a short output (like -version) """
    # NOTE: args is never mutated here, so the shared default list is safe.
    try:
        subprocess.Popen([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
    except OSError:
        # Binary not found (or not executable) in PATH.
        return False
    return exe
1315 def get_exe_version(exe, args=['--version'],
1316 version_re=None, unrecognized='present'):
1317 """ Returns the version of the specified executable,
1318 or False if the executable is not present """
1320 out, _ = subprocess.Popen(
1322 stdout=subprocess.PIPE, stderr=subprocess.STDOUT).communicate()
1325 if isinstance(out, bytes): # Python 2.x
1326 out = out.decode('ascii', 'ignore')
1327 return detect_exe_version(out, version_re, unrecognized)
1330 def detect_exe_version(output, version_re=None, unrecognized='present'):
1331 assert isinstance(output, compat_str)
1332 if version_re is None:
1333 version_re = r'version\s+([-0-9._a-zA-Z]+)'
1334 m = re.search(version_re, output)
1341 class PagedList(object):
1343 # This is only useful for tests
1344 return len(self.getslice())
class OnDemandPagedList(PagedList):
    """PagedList that fetches pages one by one, on demand, via pagefunc."""

    def __init__(self, pagefunc, pagesize):
        # pagefunc(pagenum) -> iterable of items for that (0-based) page
        self._pagefunc = pagefunc
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        res = []
        for pagenum in itertools.count(start // self._pagesize):
            firstid = pagenum * self._pagesize
            nextfirstid = pagenum * self._pagesize + self._pagesize
            if start >= nextfirstid:
                continue

            page_results = list(self._pagefunc(pagenum))

            # Offset of the first wanted item within this page
            startv = (
                start % self._pagesize
                if firstid <= start < nextfirstid
                else 0)

            # One past the last wanted item within this page (None = all)
            endv = (
                ((end - 1) % self._pagesize) + 1
                if (end is not None and firstid <= end <= nextfirstid)
                else None)

            if startv != 0 or endv is not None:
                page_results = page_results[startv:endv]
            res.extend(page_results)

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            if len(page_results) + startv < self._pagesize:
                break

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == nextfirstid:
                break
        return res
class InAdvancePagedList(PagedList):
    """PagedList whose total page count is known in advance."""

    def __init__(self, pagefunc, pagecount, pagesize):
        # pagefunc(pagenum) -> iterable of items for that (0-based) page
        self._pagefunc = pagefunc
        self._pagecount = pagecount
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        res = []
        start_page = start // self._pagesize
        end_page = (
            self._pagecount if end is None else (end // self._pagesize + 1))
        # Items to drop from the first fetched page
        skip_elems = start - start_page * self._pagesize
        # Total number of items still wanted (None = all remaining)
        only_more = None if end is None else end - start
        for pagenum in range(start_page, end_page):
            page = list(self._pagefunc(pagenum))
            if skip_elems:
                page = page[skip_elems:]
                skip_elems = None
            if only_more is not None:
                if len(page) < only_more:
                    only_more -= len(page)
                else:
                    page = page[:only_more]
                    res.extend(page)
                    break
            res.extend(page)
        return res
def uppercase_escape(s):
    """Decode uppercase \\UXXXXXXXX escapes (as emitted by some sites) in s."""
    unicode_escape = codecs.getdecoder('unicode_escape')
    return re.sub(
        r'\\U[0-9a-fA-F]{8}',
        lambda m: unicode_escape(m.group(0))[0],
        s)
def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986"""
    # On Python 2, quote() needs a byte string, so pre-encode unicode input.
    if sys.version_info < (3, 0):
        if isinstance(s, unicode):
            s = s.encode('utf-8')
    # The safe-set keeps all RFC 3986 reserved/sub-delim characters intact.
    return compat_urllib_parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")
def escape_url(url):
    """Escape URL as suggested by RFC 3986"""
    url_parsed = compat_urllib_parse_urlparse(url)
    # Escape each component separately so the URL structure (/, ?, #) survives.
    return url_parsed._replace(
        path=escape_rfc3986(url_parsed.path),
        params=escape_rfc3986(url_parsed.params),
        query=escape_rfc3986(url_parsed.query),
        fragment=escape_rfc3986(url_parsed.fragment)
    ).geturl()
try:
    # Probe: Python 2.6 (and some 2.7 builds) reject unicode format strings.
    struct.pack('!I', 0)
except TypeError:
    # In Python 2.6 (and some 2.7 versions), struct requires a bytes argument
    def struct_pack(spec, *args):
        if isinstance(spec, compat_str):
            spec = spec.encode('ascii')
        return struct.pack(spec, *args)

    def struct_unpack(spec, *args):
        if isinstance(spec, compat_str):
            spec = spec.encode('ascii')
        return struct.unpack(spec, *args)
else:
    # Native struct accepts text format strings; use it directly.
    struct_pack = struct.pack
    struct_unpack = struct.unpack
def read_batch_urls(batch_fd):
    """Read URLs from an open batch-file handle, one per line.

    Strips a UTF-8 BOM and whitespace, and drops empty lines and lines
    starting with '#', ';' or ']' (comments). Closes batch_fd when done.
    """
    def fixup(url):
        if not isinstance(url, compat_str):
            url = url.decode('utf-8', 'replace')
        # A UTF-8 BOM read through a latin-1-ish codec shows up as these chars.
        BOM_UTF8 = '\xef\xbb\xbf'
        if url.startswith(BOM_UTF8):
            url = url[len(BOM_UTF8):]
        url = url.strip()
        if url.startswith(('#', ';', ']')):
            return False
        return url

    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]
def urlencode_postdata(*args, **kargs):
    """URL-encode POST data and return it as ASCII bytes, as urllib expects."""
    encoded = compat_urllib_parse.urlencode(*args, **kargs)
    return encoded.encode('ascii')
try:
    etree_iter = xml.etree.ElementTree.Element.iter
except AttributeError:  # Python <=2.6
    # Fallback: findall('.//*') walks all descendants (but not the root itself).
    etree_iter = lambda n: n.findall('.//*')
def parse_xml(s):
    """Parse the XML document in the text string s, ignoring any doctype.

    Returns the root Element; on Python 2, text nodes are normalized to
    unicode strings.
    """
    class TreeBuilder(xml.etree.ElementTree.TreeBuilder):
        def doctype(self, name, pubid, system):
            pass  # Ignore doctypes

    parser = xml.etree.ElementTree.XMLParser(target=TreeBuilder())
    # Python 2.6's ElementTree.XML does not accept a parser keyword.
    kwargs = {'parser': parser} if sys.version_info >= (2, 7) else {}
    tree = xml.etree.ElementTree.XML(s.encode('utf-8'), **kwargs)
    # Fix up XML parser in Python 2.x
    if sys.version_info < (3, 0):
        for n in etree_iter(tree):
            if n.text is not None:
                if not isinstance(n.text, compat_str):
                    n.text = n.text.decode('utf-8')
    return tree
def parse_age_limit(s):
    """Parse an age limit like '18' or '18+' into an int.

    Returns None for None input; unknown strings are looked up in the
    US_RATINGS table (e.g. 'PG-13'), yielding None when absent there too.
    """
    if s is None:
        return None
    m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
    return int(m.group('age')) if m else US_RATINGS.get(s, None)
def strip_jsonp(code):
    """Strip a JSONP wrapper (callback name, parentheses, trailing ';' and
    line comments) from code, leaving the bare JSON payload."""
    return re.sub(
        r'(?s)^[a-zA-Z0-9_]+\s*\(\s*(.*)\);?\s*?(?://[^\n]*)*$', r'\1', code)
def js_to_json(code):
    """Convert a JavaScript object literal into (reasonably) valid JSON text.

    Single-quoted strings become double-quoted, bare identifiers are quoted,
    and trailing commas before ']' are dropped. true/false/null and existing
    double-quoted strings pass through unchanged.
    """
    def fix_kv(m):
        v = m.group(0)
        if v in ('true', 'false', 'null'):
            return v
        if v.startswith('"'):
            return v
        if v.startswith("'"):
            v = v[1:-1]
            # Unescape \' and keep \\, while escaping bare " for JSON.
            v = re.sub(r"\\\\|\\'|\"", lambda m: {
                '\\\\': '\\\\',
                "\\'": "'",
                '"': '\\"',
            }[m.group(0)], v)
        return '"%s"' % v

    res = re.sub(r'''(?x)
        "(?:[^"\\]*(?:\\\\|\\")?)*"|
        '(?:[^'\\]*(?:\\\\|\\')?)*'|
        [a-zA-Z_][a-zA-Z_0-9]*
        ''', fix_kv, code)
    # Drop trailing commas before closing brackets
    res = re.sub(r',(\s*\])', lambda m: m.group(1), res)
    return res
def qualities(quality_ids):
    """ Get a numeric quality value out of a list of possible values """
    def q(qid):
        try:
            # Position in the list is the quality rank (later = better).
            return quality_ids.index(qid)
        except ValueError:
            # Unknown quality ids rank below everything known.
            return -1
    return q
# Default output filename template: "<title>-<id>.<ext>"
DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
def limit_length(s, length):
    """ Add ellipses to overly long strings """
    if s is None:
        return None
    ELLIPSES = '...'
    if len(s) > length:
        # Truncate so the result, including the ellipses, fits in `length`.
        return s[:length - len(ELLIPSES)] + ELLIPSES
    return s
def version_tuple(v):
    """Split a dotted/dashed version string into a tuple of ints for comparison."""
    parts = re.split(r'[-.]', v)
    return tuple(map(int, parts))
def is_outdated_version(version, limit, assume_new=True):
    """Return True if `version` is older than `limit`.

    When `version` is empty or unparseable, return the pessimistic/optimistic
    default controlled by assume_new (True = treat unknown as up to date).
    """
    if not version:
        return not assume_new
    try:
        return version_tuple(version) < version_tuple(limit)
    except ValueError:
        # Non-numeric component: cannot compare, fall back to the default.
        return not assume_new
def ytdl_is_updateable():
    """ Returns if youtube-dl can be updated with -U """
    from zipimport import zipimporter

    # -U works when running from the zip bundle or a frozen (py2exe) build.
    running_from_zip = isinstance(globals().get('__loader__'), zipimporter)
    return running_from_zip or hasattr(sys, 'frozen')
def args_to_str(args):
    """Return a short shell-style string representation of a subprocess command."""
    quoted = [shlex_quote(arg) for arg in args]
    return ' '.join(quoted)
def urlhandle_detect_ext(url_handle):
    """Guess a file extension from an open URL handle's response headers.

    Prefers the filename in Content-Disposition, falling back to the
    Content-Type subtype.
    """
    try:
        url_handle.headers
        getheader = lambda h: url_handle.headers[h]
    except AttributeError:  # Python < 3
        getheader = url_handle.info().getheader

    cd = getheader('Content-Disposition')
    if cd:
        m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
        if m:
            e = determine_ext(m.group('filename'), default_ext=None)
            if e:
                return e

    # Fallback: "video/mp4" -> "mp4"
    return getheader('Content-Type').split("/")[1]
def age_restricted(content_limit, age_limit):
    """ Returns True iff the content should be blocked """
    if age_limit is None:  # No limit set
        return False
    if content_limit is None:
        return False  # Content available for everyone
    return age_limit < content_limit
def is_html(first_bytes):
    """ Detect whether a file contains HTML by examining its first bytes. """
    # Known byte-order marks, longest prefixes first so utf-32 wins over utf-16.
    BOMS = [
        (b'\xef\xbb\xbf', 'utf-8'),
        (b'\x00\x00\xfe\xff', 'utf-32-be'),
        (b'\xff\xfe\x00\x00', 'utf-32-le'),
        (b'\xff\xfe', 'utf-16-le'),
        (b'\xfe\xff', 'utf-16-be'),
    ]
    for bom, enc in BOMS:
        if first_bytes.startswith(bom):
            s = first_bytes[len(bom):].decode(enc, 'replace')
            break
    else:
        s = first_bytes.decode('utf-8', 'replace')

    # HTML starts with '<' after optional whitespace; returns a match or None.
    return re.match(r'^\s*<', s)
def determine_protocol(info_dict):
    """Best-effort guess of the download protocol for an info dict entry.

    Uses the explicit 'protocol' field when present, then the URL scheme
    prefix (rtmp/mms/rtsp), then the extension (m3u8/f4m), and finally the
    parsed URL scheme.
    """
    protocol = info_dict.get('protocol')
    if protocol is not None:
        return protocol

    url = info_dict['url']
    if url.startswith('rtmp'):
        return 'rtmp'
    elif url.startswith('mms'):
        return 'mms'
    elif url.startswith('rtsp'):
        return 'rtsp'

    ext = determine_ext(url)
    if ext == 'm3u8':
        return 'm3u8'
    elif ext == 'f4m':
        return 'f4m'

    return compat_urllib_parse_urlparse(url).scheme
def render_table(header_row, data):
    """ Render a list of rows, each as a list of values """
    rows = [header_row] + data
    # Widest cell per column determines that column's field width.
    widths = [max(len(compat_str(cell)) for cell in column) for column in zip(*rows)]
    # Left-align every column padded to width+1; the last column is unpadded.
    fmt = ' '.join('%-' + compat_str(width + 1) + 's' for width in widths[:-1]) + '%s'
    return '\n'.join(fmt % tuple(row) for row in rows)