2 # -*- coding: utf-8 -*-
4 from __future__ import unicode_literals
31 import xml.etree.ElementTree
40 compat_socket_create_connection,
44 compat_urllib_parse_urlparse,
45 compat_urllib_request,
51 # This is not clearly defined otherwise
52 compiled_regex_type = type(re.compile(''))
55 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0 (Chrome)',
56 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
57 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
58 'Accept-Encoding': 'gzip, deflate',
59 'Accept-Language': 'en-us,en;q=0.5',
63 def preferredencoding():
64 """Get preferred encoding.
66 Returns the best encoding scheme for the system, based on
67 locale.getpreferredencoding() and some further tweaks.
70 pref = locale.getpreferredencoding()
78 def write_json_file(obj, fn):
79 """ Encode obj as JSON and write it to fn, atomically if possible """
81 fn = encodeFilename(fn)
82 if sys.version_info < (3, 0) and sys.platform != 'win32':
83 encoding = get_filesystem_encoding()
84 # os.path.basename returns a bytes object, but NamedTemporaryFile
85 # will fail if the filename contains non ascii characters unless we
86 # use a unicode object
87 path_basename = lambda f: os.path.basename(fn).decode(encoding)
88 # the same for os.path.dirname
89 path_dirname = lambda f: os.path.dirname(fn).decode(encoding)
91 path_basename = os.path.basename
92 path_dirname = os.path.dirname
96 'prefix': path_basename(fn) + '.',
97 'dir': path_dirname(fn),
101 # In Python 2.x, json.dump expects a bytestream.
102 # In Python 3.x, it writes to a character stream
103 if sys.version_info < (3, 0):
111 tf = tempfile.NamedTemporaryFile(**args)
116 if sys.platform == 'win32':
117 # Need to remove existing file on Windows, else os.rename raises
118 # WindowsError or FileExistsError.
123 os.rename(tf.name, fn)
132 if sys.version_info >= (2, 7):
133 def find_xpath_attr(node, xpath, key, val):
134 """ Find the xpath xpath[@key=val] """
135 assert re.match(r'^[a-zA-Z-]+$', key)
136 assert re.match(r'^[a-zA-Z0-9@\s:._-]*$', val)
137 expr = xpath + "[@%s='%s']" % (key, val)
138 return node.find(expr)
140 def find_xpath_attr(node, xpath, key, val):
141 # Here comes the crazy part: In 2.6, if the xpath is a unicode,
142 # .//node does not match if a node is a direct child of . !
143 if isinstance(xpath, unicode):
144 xpath = xpath.encode('ascii')
146 for f in node.findall(xpath):
147 if f.attrib.get(key) == val:
151 # On python2.6 the xml.etree.ElementTree.Element methods don't support
152 # the namespace parameter
155 def xpath_with_ns(path, ns_map):
156 components = [c.split(':') for c in path.split('/')]
160 replaced.append(c[0])
163 replaced.append('{%s}%s' % (ns_map[ns], tag))
164 return '/'.join(replaced)
167 def xpath_text(node, xpath, name=None, fatal=False):
168 if sys.version_info < (2, 7): # Crazy 2.6
169 xpath = xpath.encode('ascii')
172 if n is None or n.text is None:
174 name = xpath if name is None else name
175 raise ExtractorError('Could not find XML element %s' % name)
def get_element_by_id(id, html):
    """Return the content of the tag in *html* whose ``id`` attribute is *id*.

    Convenience wrapper: the lookup is delegated to
    get_element_by_attribute with the attribute name fixed to "id", and
    that helper's result is returned unchanged.
    """
    element_content = get_element_by_attribute("id", id, html)
    return element_content
186 def get_element_by_attribute(attribute, value, html):
187 """Return the content of the tag with the specified attribute in the passed HTML document"""
189 m = re.search(r'''(?xs)
191 (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
193 (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
197 ''' % (re.escape(attribute), re.escape(value)), html)
201 res = m.group('content')
203 if res.startswith('"') or res.startswith("'"):
206 return unescapeHTML(res)
209 def clean_html(html):
210 """Clean an HTML snippet into a readable string"""
212 if html is None: # Convenience for sanitizing descriptions etc.
216 html = html.replace('\n', ' ')
217 html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
218 html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
220 html = re.sub('<.*?>', '', html)
221 # Replace html entities
222 html = unescapeHTML(html)
226 def sanitize_open(filename, open_mode):
227 """Try to open the given filename, and slightly tweak it if this fails.
229 Attempts to open the given filename. If this fails, it tries to change
230 the filename slightly, step by step, until it's either able to open it
231 or it fails and raises a final exception, like the standard open()
234 It returns the tuple (stream, definitive_file_name).
238 if sys.platform == 'win32':
240 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
241 return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
242 stream = open(encodeFilename(filename), open_mode)
243 return (stream, filename)
244 except (IOError, OSError) as err:
245 if err.errno in (errno.EACCES,):
248 # In case of error, try to remove win32 forbidden chars
249 alt_filename = os.path.join(
250 re.sub('[/<>:"\\|\\\\?\\*]', '#', path_part)
251 for path_part in os.path.split(filename)
253 if alt_filename == filename:
256 # An exception here should be caught in the caller
257 stream = open(encodeFilename(filename), open_mode)
258 return (stream, alt_filename)
261 def timeconvert(timestr):
262 """Convert RFC 2822 defined time string into system timestamp"""
264 timetuple = email.utils.parsedate_tz(timestr)
265 if timetuple is not None:
266 timestamp = email.utils.mktime_tz(timetuple)
270 def sanitize_filename(s, restricted=False, is_id=False):
271 """Sanitizes a string so it could be used as part of a filename.
272 If restricted is set, use a stricter subset of allowed characters.
273 Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
275 def replace_insane(char):
276 if char == '?' or ord(char) < 32 or ord(char) == 127:
279 return '' if restricted else '\''
281 return '_-' if restricted else ' -'
282 elif char in '\\/|*<>':
284 if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
286 if restricted and ord(char) > 127:
291 s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)
292 result = ''.join(map(replace_insane, s))
294 while '__' in result:
295 result = result.replace('__', '_')
296 result = result.strip('_')
297 # Common case of "Foreign band name - English song title"
298 if restricted and result.startswith('-_'):
305 def orderedSet(iterable):
306 """ Remove all duplicates from the input iterable """
314 def _htmlentity_transform(entity):
315 """Transforms an HTML entity to a character."""
316 # Known non-numeric HTML entity
317 if entity in compat_html_entities.name2codepoint:
318 return compat_chr(compat_html_entities.name2codepoint[entity])
320 mobj = re.match(r'#(x?[0-9]+)', entity)
322 numstr = mobj.group(1)
323 if numstr.startswith('x'):
325 numstr = '0%s' % numstr
328 return compat_chr(int(numstr, base))
330 # Unknown entity in name, return its literal representation
331 return ('&%s;' % entity)
337 assert type(s) == compat_str
340 r'&([^;]+);', lambda m: _htmlentity_transform(m.group(1)), s)
343 def encodeFilename(s, for_subprocess=False):
345 @param s The name of the file
348 assert type(s) == compat_str
350 # Python 3 has a Unicode API
351 if sys.version_info >= (3, 0):
354 if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
355 # Pass '' directly to use Unicode APIs on Windows 2000 and up
356 # (Detecting Windows NT 4 is tricky because 'major >= 4' would
357 # match Windows 9x series as well. Besides, NT 4 is obsolete.)
358 if not for_subprocess:
361 # For subprocess calls, encode with locale encoding
362 # Refer to http://stackoverflow.com/a/9951851/35070
363 encoding = preferredencoding()
365 encoding = sys.getfilesystemencoding()
368 return s.encode(encoding, 'ignore')
def encodeArgument(s):
    """Encode a single command-line argument for use in a subprocess call.

    Text (compat_str) is passed straight to encodeFilename with
    for_subprocess=True. Byte strings are decoded as ASCII first.
    """
    # Legacy code still hands in byte strings here. Once all post
    # processors are fixed, this tolerance can be replaced by:
    # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
    text = s if isinstance(s, compat_str) else s.decode('ascii')
    return encodeFilename(text, True)
380 def decodeOption(optval):
383 if isinstance(optval, bytes):
384 optval = optval.decode(preferredencoding())
386 assert isinstance(optval, compat_str)
390 def formatSeconds(secs):
392 return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
394 return '%d:%02d' % (secs // 60, secs % 60)
399 def make_HTTPS_handler(params, **kwargs):
400 opts_no_check_certificate = params.get('nocheckcertificate', False)
401 if hasattr(ssl, 'create_default_context'): # Python >= 3.4 or 2.7.9
402 context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH)
403 if opts_no_check_certificate:
404 context.check_hostname = False
405 context.verify_mode = ssl.CERT_NONE
407 return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
410 # (create_default_context present but HTTPSHandler has no context=)
413 if sys.version_info < (3, 2):
414 return YoutubeDLHTTPSHandler(params, **kwargs)
416 context = ssl.SSLContext(ssl.PROTOCOL_TLSv1)
417 context.verify_mode = (ssl.CERT_NONE
418 if opts_no_check_certificate
419 else ssl.CERT_REQUIRED)
420 context.set_default_verify_paths()
421 return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
424 class ExtractorError(Exception):
425 """Error during info extraction."""
427 def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
428 """ tb, if given, is the original traceback (so that it can be printed out).
429 If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
432 if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
434 if video_id is not None:
435 msg = video_id + ': ' + msg
437 msg += ' (caused by %r)' % cause
439 if ytdl_is_updateable():
440 update_cmd = 'type youtube-dl -U to update'
442 update_cmd = 'see https://yt-dl.org/update on how to update'
443 msg += '; please report this issue on https://yt-dl.org/bug .'
444 msg += ' Make sure you are using the latest version; %s.' % update_cmd
445 msg += ' Be sure to call youtube-dl with the --verbose flag and include its complete output.'
446 super(ExtractorError, self).__init__(msg)
449 self.exc_info = sys.exc_info() # preserve original exception
451 self.video_id = video_id
453 def format_traceback(self):
454 if self.traceback is None:
456 return ''.join(traceback.format_tb(self.traceback))
459 class UnsupportedError(ExtractorError):
460 def __init__(self, url):
461 super(UnsupportedError, self).__init__(
462 'Unsupported URL: %s' % url, expected=True)
466 class RegexNotFoundError(ExtractorError):
467 """Error when a regex didn't match"""
471 class DownloadError(Exception):
472 """Download Error exception.
474 This exception may be thrown by FileDownloader objects if they are not
475 configured to continue on errors. They will contain the appropriate
479 def __init__(self, msg, exc_info=None):
480 """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
481 super(DownloadError, self).__init__(msg)
482 self.exc_info = exc_info
485 class SameFileError(Exception):
486 """Same File exception.
488 This exception will be thrown by FileDownloader objects if they detect
489 multiple files would have to be downloaded to the same file on disk.
494 class PostProcessingError(Exception):
495 """Post Processing exception.
497 This exception may be raised by PostProcessor's .run() method to
498 indicate an error in the postprocessing task.
501 def __init__(self, msg):
505 class MaxDownloadsReached(Exception):
506 """ --max-downloads limit has been reached. """
510 class UnavailableVideoError(Exception):
511 """Unavailable Format exception.
513 This exception will be thrown when a video is requested
514 in a format that is not available for that video.
519 class ContentTooShortError(Exception):
520 """Content Too Short exception.
522 This exception may be raised by FileDownloader objects when a file they
523 download is too small for what the server announced first, indicating
524 the connection was probably interrupted.
530 def __init__(self, downloaded, expected):
531 self.downloaded = downloaded
532 self.expected = expected
535 def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
536 hc = http_class(*args, **kwargs)
537 source_address = ydl_handler._params.get('source_address')
538 if source_address is not None:
539 sa = (source_address, 0)
540 if hasattr(hc, 'source_address'): # Python 2.7+
541 hc.source_address = sa
543 def _hc_connect(self, *args, **kwargs):
544 sock = compat_socket_create_connection(
545 (self.host, self.port), self.timeout, sa)
547 self.sock = ssl.wrap_socket(
548 sock, self.key_file, self.cert_file,
549 ssl_version=ssl.PROTOCOL_TLSv1)
552 hc.connect = functools.partial(_hc_connect, hc)
557 class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
558 """Handler for HTTP requests and responses.
560 This class, when installed with an OpenerDirector, automatically adds
561 the standard headers to every HTTP request and handles gzipped and
562 deflated responses from web servers. If compression is to be avoided in
563 a particular request, the original request in the program code only has
564 to include the HTTP header "Youtubedl-No-Compression", which will be
565 removed before making the real request.
567 Part of this code was copied from:
569 http://techknack.net/python-urllib2-handlers/
571 Andrew Rowls, the author of that code, agreed to release it to the
575 def __init__(self, params, *args, **kwargs):
576 compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
577 self._params = params
579 def http_open(self, req):
580 return self.do_open(functools.partial(
581 _create_http_connection, self, compat_http_client.HTTPConnection, False),
587 return zlib.decompress(data, -zlib.MAX_WBITS)
589 return zlib.decompress(data)
592 def addinfourl_wrapper(stream, headers, url, code):
593 if hasattr(compat_urllib_request.addinfourl, 'getcode'):
594 return compat_urllib_request.addinfourl(stream, headers, url, code)
595 ret = compat_urllib_request.addinfourl(stream, headers, url)
599 def http_request(self, req):
600 for h, v in std_headers.items():
601 # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
602 # The dict keys are capitalized because of this bug by urllib
603 if h.capitalize() not in req.headers:
605 if 'Youtubedl-no-compression' in req.headers:
606 if 'Accept-encoding' in req.headers:
607 del req.headers['Accept-encoding']
608 del req.headers['Youtubedl-no-compression']
610 if sys.version_info < (2, 7) and '#' in req.get_full_url():
611 # Python 2.6 is brain-dead when it comes to fragments
612 req._Request__original = req._Request__original.partition('#')[0]
613 req._Request__r_type = req._Request__r_type.partition('#')[0]
617 def http_response(self, req, resp):
620 if resp.headers.get('Content-encoding', '') == 'gzip':
621 content = resp.read()
622 gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
624 uncompressed = io.BytesIO(gz.read())
625 except IOError as original_ioerror:
626 # There may be junk at the end of the file
627 # See http://stackoverflow.com/q/4928560/35070 for details
628 for i in range(1, 1024):
630 gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
631 uncompressed = io.BytesIO(gz.read())
636 raise original_ioerror
637 resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
638 resp.msg = old_resp.msg
640 if resp.headers.get('Content-encoding', '') == 'deflate':
641 gz = io.BytesIO(self.deflate(resp.read()))
642 resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
643 resp.msg = old_resp.msg
646 https_request = http_request
647 https_response = http_response
650 class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
651 def __init__(self, params, https_conn_class=None, *args, **kwargs):
652 compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
653 self._https_conn_class = https_conn_class or compat_http_client.HTTPSConnection
654 self._params = params
656 def https_open(self, req):
657 return self.do_open(functools.partial(
658 _create_http_connection, self, self._https_conn_class, True),
662 def parse_iso8601(date_str, delimiter='T'):
663 """ Return a UNIX timestamp from the given date """
669 r'(\.[0-9]+)?(?:Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
672 timezone = datetime.timedelta()
674 date_str = date_str[:-len(m.group(0))]
675 if not m.group('sign'):
676 timezone = datetime.timedelta()
678 sign = 1 if m.group('sign') == '+' else -1
679 timezone = datetime.timedelta(
680 hours=sign * int(m.group('hours')),
681 minutes=sign * int(m.group('minutes')))
682 date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
683 dt = datetime.datetime.strptime(date_str, date_format) - timezone
684 return calendar.timegm(dt.timetuple())
687 def unified_strdate(date_str, day_first=True):
688 """Return a string with the date in the format YYYYMMDD"""
694 date_str = date_str.replace(',', ' ')
695 # %z (UTC offset) is only supported in python>=3.2
696 date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str)
697 # Remove AM/PM + timezone
698 date_str = re.sub(r'(?i)\s*(?:AM|PM)\s+[A-Z]+', '', date_str)
700 format_expressions = [
705 '%b %dst %Y %I:%M%p',
706 '%b %dnd %Y %I:%M%p',
707 '%b %dth %Y %I:%M%p',
713 '%Y-%m-%d %H:%M:%S.%f',
716 '%Y-%m-%dT%H:%M:%SZ',
717 '%Y-%m-%dT%H:%M:%S.%fZ',
718 '%Y-%m-%dT%H:%M:%S.%f0Z',
720 '%Y-%m-%dT%H:%M:%S.%f',
724 format_expressions.extend([
731 format_expressions.extend([
737 for expression in format_expressions:
739 upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
742 if upload_date is None:
743 timetuple = email.utils.parsedate_tz(date_str)
745 upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
749 def determine_ext(url, default_ext='unknown_video'):
752 guess = url.partition('?')[0].rpartition('.')[2]
753 if re.match(r'^[A-Za-z0-9]+$', guess):
def subtitles_filename(filename, sub_lang, sub_format):
    """Build the subtitle file name that belongs to a media file.

    The extension of *filename* (if any) is replaced with
    ``.<sub_lang>.<sub_format>``, e.g. ``video.mp4`` -> ``video.en.srt``.

    Args:
        filename: Path of the media file.
        sub_lang: Language code inserted before the subtitle extension.
        sub_format: Subtitle format, used as the new extension.

    Returns:
        The path with its extension swapped for the language and format.
    """
    # os.path.splitext only looks at the last path component, so a dot in a
    # directory name ('some.dir/video') or a leading dot ('.hidden') is not
    # mistaken for the extension separator, unlike splitting the whole path
    # on its last dot.
    stem = os.path.splitext(filename)[0]
    return stem + '.' + sub_lang + '.' + sub_format
763 def date_from_str(date_str):
765 Return a datetime object from a string in the format YYYYMMDD or
766 (now|today)[+-][0-9](day|week|month|year)(s)?"""
767 today = datetime.date.today()
768 if date_str in ('now', 'today'):
770 if date_str == 'yesterday':
771 return today - datetime.timedelta(days=1)
772 match = re.match('(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
773 if match is not None:
774 sign = match.group('sign')
775 time = int(match.group('time'))
778 unit = match.group('unit')
779 # A bad approximation?
787 delta = datetime.timedelta(**{unit: time})
789 return datetime.datetime.strptime(date_str, "%Y%m%d").date()
792 def hyphenate_date(date_str):
794 Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
795 match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
796 if match is not None:
797 return '-'.join(match.groups())
802 class DateRange(object):
803 """Represents a time interval between two dates"""
805 def __init__(self, start=None, end=None):
806 """start and end must be strings in the format accepted by date"""
807 if start is not None:
808 self.start = date_from_str(start)
810 self.start = datetime.datetime.min.date()
812 self.end = date_from_str(end)
814 self.end = datetime.datetime.max.date()
815 if self.start > self.end:
816 raise ValueError('Date range: "%s" , the start date must be before the end date' % self)
820 """Returns a range that only contains the given day"""
823 def __contains__(self, date):
824 """Check if the date is in the range"""
825 if not isinstance(date, datetime.date):
826 date = date_from_str(date)
827 return self.start <= date <= self.end
830 return '%s - %s' % (self.start.isoformat(), self.end.isoformat())
834 """ Returns the platform name as a compat_str """
835 res = platform.platform()
836 if isinstance(res, bytes):
837 res = res.decode(preferredencoding())
839 assert isinstance(res, compat_str)
843 def _windows_write_string(s, out):
844 """ Returns True if the string was written using special methods,
845 False if it has yet to be written out."""
846 # Adapted from http://stackoverflow.com/a/3259271/35070
849 import ctypes.wintypes
857 fileno = out.fileno()
858 except AttributeError:
859 # If the output stream doesn't have a fileno, it's virtual
861 except io.UnsupportedOperation:
862 # Some strange Windows pseudo files?
864 if fileno not in WIN_OUTPUT_IDS:
867 GetStdHandle = ctypes.WINFUNCTYPE(
868 ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
869 (b"GetStdHandle", ctypes.windll.kernel32))
870 h = GetStdHandle(WIN_OUTPUT_IDS[fileno])
872 WriteConsoleW = ctypes.WINFUNCTYPE(
873 ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
874 ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
875 ctypes.wintypes.LPVOID)((b"WriteConsoleW", ctypes.windll.kernel32))
876 written = ctypes.wintypes.DWORD(0)
878 GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)((b"GetFileType", ctypes.windll.kernel32))
879 FILE_TYPE_CHAR = 0x0002
880 FILE_TYPE_REMOTE = 0x8000
881 GetConsoleMode = ctypes.WINFUNCTYPE(
882 ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
883 ctypes.POINTER(ctypes.wintypes.DWORD))(
884 (b"GetConsoleMode", ctypes.windll.kernel32))
885 INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value
887 def not_a_console(handle):
888 if handle == INVALID_HANDLE_VALUE or handle is None:
890 return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR
891 or GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)
896 def next_nonbmp_pos(s):
898 return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
899 except StopIteration:
903 count = min(next_nonbmp_pos(s), 1024)
906 h, s, count if count else 2, ctypes.byref(written), None)
908 raise OSError('Failed to write string')
909 if not count: # We just wrote a non-BMP character
910 assert written.value == 2
913 assert written.value > 0
914 s = s[written.value:]
918 def write_string(s, out=None, encoding=None):
921 assert type(s) == compat_str
923 if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
924 if _windows_write_string(s, out):
927 if ('b' in getattr(out, 'mode', '') or
928 sys.version_info[0] < 3): # Python 2 lies about mode of sys.stderr
929 byt = s.encode(encoding or preferredencoding(), 'ignore')
931 elif hasattr(out, 'buffer'):
932 enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
933 byt = s.encode(enc, 'ignore')
934 out.buffer.write(byt)
940 def bytes_to_intlist(bs):
943 if isinstance(bs[0], int): # Python 3
946 return [ord(c) for c in bs]
949 def intlist_to_bytes(xs):
952 return struct_pack('%dB' % len(xs), *xs)
955 # Cross-platform file locking
956 if sys.platform == 'win32':
957 import ctypes.wintypes
960 class OVERLAPPED(ctypes.Structure):
962 ('Internal', ctypes.wintypes.LPVOID),
963 ('InternalHigh', ctypes.wintypes.LPVOID),
964 ('Offset', ctypes.wintypes.DWORD),
965 ('OffsetHigh', ctypes.wintypes.DWORD),
966 ('hEvent', ctypes.wintypes.HANDLE),
969 kernel32 = ctypes.windll.kernel32
970 LockFileEx = kernel32.LockFileEx
971 LockFileEx.argtypes = [
972 ctypes.wintypes.HANDLE, # hFile
973 ctypes.wintypes.DWORD, # dwFlags
974 ctypes.wintypes.DWORD, # dwReserved
975 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
976 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
977 ctypes.POINTER(OVERLAPPED) # Overlapped
979 LockFileEx.restype = ctypes.wintypes.BOOL
980 UnlockFileEx = kernel32.UnlockFileEx
981 UnlockFileEx.argtypes = [
982 ctypes.wintypes.HANDLE, # hFile
983 ctypes.wintypes.DWORD, # dwReserved
984 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
985 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
986 ctypes.POINTER(OVERLAPPED) # Overlapped
988 UnlockFileEx.restype = ctypes.wintypes.BOOL
989 whole_low = 0xffffffff
990 whole_high = 0x7fffffff
992 def _lock_file(f, exclusive):
993 overlapped = OVERLAPPED()
994 overlapped.Offset = 0
995 overlapped.OffsetHigh = 0
996 overlapped.hEvent = 0
997 f._lock_file_overlapped_p = ctypes.pointer(overlapped)
998 handle = msvcrt.get_osfhandle(f.fileno())
999 if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
1000 whole_low, whole_high, f._lock_file_overlapped_p):
1001 raise OSError('Locking file failed: %r' % ctypes.FormatError())
1003 def _unlock_file(f):
1004 assert f._lock_file_overlapped_p
1005 handle = msvcrt.get_osfhandle(f.fileno())
1006 if not UnlockFileEx(handle, 0,
1007 whole_low, whole_high, f._lock_file_overlapped_p):
1008 raise OSError('Unlocking file failed: %r' % ctypes.FormatError())
1013 def _lock_file(f, exclusive):
1014 fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)
1016 def _unlock_file(f):
1017 fcntl.flock(f, fcntl.LOCK_UN)
1020 class locked_file(object):
1021 def __init__(self, filename, mode, encoding=None):
1022 assert mode in ['r', 'a', 'w']
1023 self.f = io.open(filename, mode, encoding=encoding)
1026 def __enter__(self):
1027 exclusive = self.mode != 'r'
1029 _lock_file(self.f, exclusive)
1035 def __exit__(self, etype, value, traceback):
1037 _unlock_file(self.f)
1044 def write(self, *args):
1045 return self.f.write(*args)
1047 def read(self, *args):
1048 return self.f.read(*args)
def get_filesystem_encoding():
    """Return the filesystem encoding name, falling back to 'utf-8'.

    The fallback is used only when sys.getfilesystemencoding() reports
    None.
    """
    fs_encoding = sys.getfilesystemencoding()
    if fs_encoding is None:
        return 'utf-8'
    return fs_encoding
1056 def shell_quote(args):
1058 encoding = get_filesystem_encoding()
1060 if isinstance(a, bytes):
1061 # We may get a filename encoded with 'encodeFilename'
1062 a = a.decode(encoding)
1063 quoted_args.append(pipes.quote(a))
1064 return ' '.join(quoted_args)
1067 def takewhile_inclusive(pred, seq):
1068 """ Like itertools.takewhile, but include the latest evaluated element
1069 (the first element so that Not pred(e)) """
def smuggle_url(url, data):
    """Attach *data* to *url* for internal use.

    *data* is serialized as JSON, URL-encoded under the
    '__youtubedl_smuggle' key, and appended to *url* after a '#'.
    """
    payload = {'__youtubedl_smuggle': json.dumps(data)}
    return url + '#' + compat_urllib_parse.urlencode(payload)
1084 def unsmuggle_url(smug_url, default=None):
1085 if '#__youtubedl_smuggle' not in smug_url:
1086 return smug_url, default
1087 url, _, sdata = smug_url.rpartition('#')
1088 jsond = compat_parse_qs(sdata)['__youtubedl_smuggle'][0]
1089 data = json.loads(jsond)
1093 def format_bytes(bytes):
1096 if type(bytes) is str:
1097 bytes = float(bytes)
1101 exponent = int(math.log(bytes, 1024.0))
1102 suffix = ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB'][exponent]
1103 converted = float(bytes) / float(1024 ** exponent)
1104 return '%.2f%s' % (converted, suffix)
1107 def parse_filesize(s):
1111 # The lower-case forms are of course incorrect and unofficial,
1112 # but we support those too
1150 units_re = '|'.join(re.escape(u) for u in _UNIT_TABLE)
1152 r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)' % units_re, s)
1156 num_str = m.group('num').replace(',', '.')
1157 mult = _UNIT_TABLE[m.group('unit')]
1158 return int(float(num_str) * mult)
1161 def get_term_width():
1162 columns = compat_getenv('COLUMNS', None)
1167 sp = subprocess.Popen(
1169 stdout=subprocess.PIPE, stderr=subprocess.PIPE)
1170 out, err = sp.communicate()
1171 return int(out.split()[1])
1177 def month_by_name(name):
1178 """ Return the number of a month by (locale-independently) English name """
1181 'January', 'February', 'March', 'April', 'May', 'June',
1182 'July', 'August', 'September', 'October', 'November', 'December']
1184 return ENGLISH_NAMES.index(name) + 1
1189 def fix_xml_ampersands(xml_str):
1190 """Replace all the '&' by '&amp;' in XML"""
1192 r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
1197 def setproctitle(title):
1198 assert isinstance(title, compat_str)
1200 libc = ctypes.cdll.LoadLibrary("libc.so.6")
1203 title_bytes = title.encode('utf-8')
1204 buf = ctypes.create_string_buffer(len(title_bytes))
1205 buf.value = title_bytes
1207 libc.prctl(15, buf, 0, 0, 0)
1208 except AttributeError:
1209 return # Strange libc, just skip this
1212 def remove_start(s, start):
1213 if s.startswith(start):
1214 return s[len(start):]
1218 def remove_end(s, end):
1220 return s[:-len(end)]
def url_basename(url):
    """Return the last segment of the path component of *url*.

    Only the path reported by compat_urlparse.urlparse is considered.
    Leading and trailing slashes are stripped before the path is split on
    '/', so an empty path yields ''.
    """
    url_path = compat_urlparse.urlparse(url).path
    segments = url_path.strip('/').split('/')
    return segments[-1]
1229 class HEADRequest(compat_urllib_request.Request):
1230 def get_method(self):
1234 def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
1237 v = getattr(v, get_attr, None)
1240 return default if v is None else (int(v) * invscale // scale)
def str_or_none(v, default=None):
    """Return *v* converted with compat_str, or *default* if *v* is None."""
    if v is None:
        return default
    return compat_str(v)
1247 def str_to_int(int_str):
1248 """ A more relaxed version of int_or_none """
1251 int_str = re.sub(r'[,\.\+]', '', int_str)
def float_or_none(v, scale=1, invscale=1, default=None):
    """Convert *v* to a float scaled by ``invscale / scale``.

    Args:
        v: Value to convert; anything float() accepts, or None.
        scale: Divisor applied to the converted value.
        invscale: Multiplier applied to the converted value.
        default: Returned when *v* is None or cannot be converted.

    Returns:
        ``float(v) * invscale / scale``, or *default* when *v* is None or
        is not convertible to a float.
    """
    if v is None:
        return default
    try:
        number = float(v)
    except (ValueError, TypeError):
        # Malformed input such as '1.2.3' (which a loosely validating
        # caller can let through) counts as "no value" instead of crashing.
        return default
    return number * invscale / scale
1259 def parse_duration(s):
1260 if not isinstance(s, basestring if sys.version_info < (3, 0) else compat_str):
1268 (?P<only_mins>[0-9.]+)\s*(?:mins?|minutes?)\s*|
1269 (?P<only_hours>[0-9.]+)\s*(?:hours?)|
1272 (?:(?P<hours>[0-9]+)\s*(?:[:h]|hours?)\s*)?
1273 (?P<mins>[0-9]+)\s*(?:[:m]|mins?|minutes?)\s*
1275 (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*(?:s|secs?|seconds?)?
1280 if m.group('only_mins'):
1281 return float_or_none(m.group('only_mins'), invscale=60)
1282 if m.group('only_hours'):
1283 return float_or_none(m.group('only_hours'), invscale=60 * 60)
1285 res += int(m.group('secs'))
1287 res += int(m.group('mins')) * 60
1288 if m.group('hours'):
1289 res += int(m.group('hours')) * 60 * 60
1291 res += float(m.group('ms'))
def prepend_extension(filename, ext):
    """Insert *ext* in front of the existing extension of *filename*.

    For example ``abc.ext`` with ``temp`` becomes ``abc.temp.ext``; a name
    without an extension simply gets ``.<ext>`` appended.
    """
    stem, current_ext = os.path.splitext(filename)
    return '{0}.{1}{2}'.format(stem, ext, current_ext)
1300 def check_executable(exe, args=[]):
1301 """ Checks if the given binary is installed somewhere in PATH, and returns its name.
1302 args can be a list of arguments for a short output (like -version) """
1304 subprocess.Popen([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
1310 def get_exe_version(exe, args=['--version'],
1311 version_re=None, unrecognized='present'):
1312 """ Returns the version of the specified executable,
1313 or False if the executable is not present """
1315 out, _ = subprocess.Popen(
1317 stdout=subprocess.PIPE, stderr=subprocess.STDOUT).communicate()
1320 if isinstance(out, bytes): # Python 2.x
1321 out = out.decode('ascii', 'ignore')
1322 return detect_exe_version(out, version_re, unrecognized)
1325 def detect_exe_version(output, version_re=None, unrecognized='present'):
1326 assert isinstance(output, compat_str)
1327 if version_re is None:
1328 version_re = r'version\s+([-0-9._a-zA-Z]+)'
1329 m = re.search(version_re, output)
1336 class PagedList(object):
1338 # This is only useful for tests
1339 return len(self.getslice())
1342 class OnDemandPagedList(PagedList):
1343 def __init__(self, pagefunc, pagesize):
1344 self._pagefunc = pagefunc
1345 self._pagesize = pagesize
1347 def getslice(self, start=0, end=None):
1349 for pagenum in itertools.count(start // self._pagesize):
1350 firstid = pagenum * self._pagesize
1351 nextfirstid = pagenum * self._pagesize + self._pagesize
1352 if start >= nextfirstid:
1355 page_results = list(self._pagefunc(pagenum))
1358 start % self._pagesize
1359 if firstid <= start < nextfirstid
1363 ((end - 1) % self._pagesize) + 1
1364 if (end is not None and firstid <= end <= nextfirstid)
1367 if startv != 0 or endv is not None:
1368 page_results = page_results[startv:endv]
1369 res.extend(page_results)
1371 # A little optimization - if current page is not "full", ie. does
1372 # not contain page_size videos then we can assume that this page
1373 # is the last one - there are no more ids on further pages -
1374 # i.e. no need to query again.
1375 if len(page_results) + startv < self._pagesize:
1378 # If we got the whole page, but the next page is not interesting,
1379 # break out early as well
1380 if end == nextfirstid:
1385 class InAdvancePagedList(PagedList):
1386 def __init__(self, pagefunc, pagecount, pagesize):
1387 self._pagefunc = pagefunc
1388 self._pagecount = pagecount
1389 self._pagesize = pagesize
1391 def getslice(self, start=0, end=None):
1393 start_page = start // self._pagesize
1395 self._pagecount if end is None else (end // self._pagesize + 1))
1396 skip_elems = start - start_page * self._pagesize
1397 only_more = None if end is None else end - start
1398 for pagenum in range(start_page, end_page):
1399 page = list(self._pagefunc(pagenum))
1401 page = page[skip_elems:]
1403 if only_more is not None:
1404 if len(page) < only_more:
1405 only_more -= len(page)
1407 page = page[:only_more]
1414 def uppercase_escape(s):
1415 unicode_escape = codecs.getdecoder('unicode_escape')
1417 r'\\U[0-9a-fA-F]{8}',
1418 lambda m: unicode_escape(m.group(0))[0],
def escape_rfc3986(s):
    """Percent-escape *s* as suggested by RFC 3986.

    On Python 2, a unicode string is encoded to UTF-8 before quoting.
    Characters listed in the safe set below are left untouched.
    """
    is_py2_unicode = sys.version_info < (3, 0) and isinstance(s, unicode)
    if is_py2_unicode:
        s = s.encode('utf-8')
    # '%' is part of the safe set, so existing percent-escapes are not
    # escaped a second time.
    safe_chars = b"%/;:@&=+$,!~*'()?#[]"
    return compat_urllib_parse.quote(s, safe_chars)
1429 def escape_url(url):
1430 """Escape URL as suggested by RFC 3986"""
1431 url_parsed = compat_urllib_parse_urlparse(url)
1432 return url_parsed._replace(
1433 path=escape_rfc3986(url_parsed.path),
1434 params=escape_rfc3986(url_parsed.params),
1435 query=escape_rfc3986(url_parsed.query),
1436 fragment=escape_rfc3986(url_parsed.fragment)
1440 struct.pack('!I', 0)
1442 # In Python 2.6 (and some 2.7 versions), struct requires a bytes argument
1443 def struct_pack(spec, *args):
1444 if isinstance(spec, compat_str):
1445 spec = spec.encode('ascii')
1446 return struct.pack(spec, *args)
1448 def struct_unpack(spec, *args):
1449 if isinstance(spec, compat_str):
1450 spec = spec.encode('ascii')
1451 return struct.unpack(spec, *args)
1453 struct_pack = struct.pack
1454 struct_unpack = struct.unpack
1457 def read_batch_urls(batch_fd):
1459 if not isinstance(url, compat_str):
1460 url = url.decode('utf-8', 'replace')
1461 BOM_UTF8 = '\xef\xbb\xbf'
1462 if url.startswith(BOM_UTF8):
1463 url = url[len(BOM_UTF8):]
1465 if url.startswith(('#', ';', ']')):
1469 with contextlib.closing(batch_fd) as fd:
1470 return [url for url in map(fixup, fd) if url]
def urlencode_postdata(*args, **kargs):
    """URL-encode form data and return the result as ASCII bytes.

    All positional and keyword arguments are forwarded unchanged to
    compat_urllib_parse.urlencode.
    """
    encoded = compat_urllib_parse.urlencode(*args, **kargs)
    return encoded.encode('ascii')
1478 etree_iter = xml.etree.ElementTree.Element.iter
1479 except AttributeError: # Python <=2.6
1480 etree_iter = lambda n: n.findall('.//*')
1484 class TreeBuilder(xml.etree.ElementTree.TreeBuilder):
1485 def doctype(self, name, pubid, system):
1486 pass # Ignore doctypes
1488 parser = xml.etree.ElementTree.XMLParser(target=TreeBuilder())
1489 kwargs = {'parser': parser} if sys.version_info >= (2, 7) else {}
1490 tree = xml.etree.ElementTree.XML(s.encode('utf-8'), **kwargs)
1491 # Fix up XML parser in Python 2.x
1492 if sys.version_info < (3, 0):
1493 for n in etree_iter(tree):
1494 if n.text is not None:
1495 if not isinstance(n.text, compat_str):
1496 n.text = n.text.decode('utf-8')
1509 def parse_age_limit(s):
1512 m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
1513 return int(m.group('age')) if m else US_RATINGS.get(s, None)
1516 def strip_jsonp(code):
1518 r'(?s)^[a-zA-Z0-9_]+\s*\(\s*(.*)\);?\s*?(?://[^\n]*)*$', r'\1', code)
1521 def js_to_json(code):
1524 if v in ('true', 'false', 'null'):
1526 if v.startswith('"'):
1528 if v.startswith("'"):
1530 v = re.sub(r"\\\\|\\'|\"", lambda m: {
1537 res = re.sub(r'''(?x)
1538 "(?:[^"\\]*(?:\\\\|\\")?)*"|
1539 '(?:[^'\\]*(?:\\\\|\\')?)*'|
1540 [a-zA-Z_][a-zA-Z_0-9]*
1542 res = re.sub(r',(\s*\])', lambda m: m.group(1), res)
1546 def qualities(quality_ids):
1547 """ Get a numeric quality value out of a list of possible values """
1550 return quality_ids.index(qid)
1556 DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
1559 def limit_length(s, length):
1560 """ Add ellipses to overly long strings """
1565 return s[:length - len(ELLIPSES)] + ELLIPSES
def version_tuple(v):
    """Convert a version string into a tuple of ints.

    The string is split on '.' and '-'; every resulting piece is passed to
    int(), so a non-numeric piece raises ValueError.
    """
    pieces = re.split(r'[-.]', v)
    return tuple(map(int, pieces))
1573 def is_outdated_version(version, limit, assume_new=True):
1575 return not assume_new
1577 return version_tuple(version) < version_tuple(limit)
1579 return not assume_new
def ytdl_is_updateable():
    """ Tell whether youtube-dl can be updated with -U.

    True when this module's ``__loader__`` is a zipimporter, or when ``sys``
    has a ``frozen`` attribute.
    """
    from zipimport import zipimporter

    module_loader = globals().get('__loader__')
    if isinstance(module_loader, zipimporter):
        return True
    return hasattr(sys, 'frozen')
def args_to_str(args):
    """Return a short one-line string representation of a subprocess command.

    Each argument is quoted with shlex_quote and the results are joined with
    single spaces.
    """
    quoted_args = [shlex_quote(arg) for arg in args]
    return ' '.join(quoted_args)
1594 def urlhandle_detect_ext(url_handle):
1597 getheader = lambda h: url_handle.headers[h]
1598 except AttributeError: # Python < 3
1599 getheader = url_handle.info().getheader
1601 cd = getheader('Content-Disposition')
1603 m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
1605 e = determine_ext(m.group('filename'), default_ext=None)
1609 return getheader('Content-Type').split("/")[1]
1612 def age_restricted(content_limit, age_limit):
1613 """ Returns True iff the content should be blocked """
1615 if age_limit is None: # No limit set
1617 if content_limit is None:
1618 return False # Content available for everyone
1619 return age_limit < content_limit
1622 def is_html(first_bytes):
1623 """ Detect whether a file contains HTML by examining its first bytes. """
1626 (b'\xef\xbb\xbf', 'utf-8'),
1627 (b'\x00\x00\xfe\xff', 'utf-32-be'),
1628 (b'\xff\xfe\x00\x00', 'utf-32-le'),
1629 (b'\xff\xfe', 'utf-16-le'),
1630 (b'\xfe\xff', 'utf-16-be'),
1632 for bom, enc in BOMS:
1633 if first_bytes.startswith(bom):
1634 s = first_bytes[len(bom):].decode(enc, 'replace')
1637 s = first_bytes.decode('utf-8', 'replace')
1639 return re.match(r'^\s*<', s)
1642 def determine_protocol(info_dict):
1643 protocol = info_dict.get('protocol')
1644 if protocol is not None:
1647 url = info_dict['url']
1648 if url.startswith('rtmp'):
1650 elif url.startswith('mms'):
1652 elif url.startswith('rtsp'):
1655 ext = determine_ext(url)
1661 return compat_urllib_parse_urlparse(url).scheme
def render_table(header_row, data):
    """ Render a header row followed by data rows as aligned text.

    Every column except the last is left-justified to the width of its
    widest cell plus one. The padded columns are joined by a single space
    and the last column follows the final padded column directly, unpadded.
    Rows are separated by newlines.
    """
    rows = [header_row] + data
    # Transpose the rows so the widest rendered cell of each column can be
    # measured.
    widths = [
        max(len(compat_str(cell)) for cell in column)
        for column in zip(*rows)]
    padded_specs = ['%-' + compat_str(width + 1) + 's' for width in widths[:-1]]
    row_format = ' '.join(padded_specs) + '%s'
    rendered = [row_format % tuple(row) for row in rows]
    return '\n'.join(rendered)