2 # -*- coding: utf-8 -*-
4 from __future__ import unicode_literals
32 import xml.etree.ElementTree
42 compat_socket_create_connection,
46 compat_urllib_parse_urlparse,
47 compat_urllib_request,
53 # This is not clearly defined otherwise
54 compiled_regex_type = type(re.compile(''))
57 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0 (Chrome)',
58 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
59 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
60 'Accept-Encoding': 'gzip, deflate',
61 'Accept-Language': 'en-us,en;q=0.5',
def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks.
    """
    pref = locale.getpreferredencoding()
def write_json_file(obj, fn):
    """ Encode obj as JSON and write it to fn, atomically if possible """

    fn = encodeFilename(fn)
    if sys.version_info < (3, 0) and sys.platform != 'win32':
        encoding = get_filesystem_encoding()
        # os.path.basename returns a bytes object, but NamedTemporaryFile
        # will fail if the filename contains non ascii characters unless we
        # use a unicode object
        path_basename = lambda f: os.path.basename(fn).decode(encoding)
        # the same for os.path.dirname
        path_dirname = lambda f: os.path.dirname(fn).decode(encoding)
        path_basename = os.path.basename
        path_dirname = os.path.dirname
        # Temp file is created next to the target so os.rename stays on
        # the same filesystem (atomic replace).
        'prefix': path_basename(fn) + '.',
        'dir': path_dirname(fn),

    # In Python 2.x, json.dump expects a bytestream.
    # In Python 3.x, it writes to a character stream
    if sys.version_info < (3, 0):

    tf = tempfile.NamedTemporaryFile(**args)

        if sys.platform == 'win32':
            # Need to remove existing file on Windows, else os.rename raises
            # WindowsError or FileExistsError.
        os.rename(tf.name, fn)
if sys.version_info >= (2, 7):
    def find_xpath_attr(node, xpath, key, val):
        """Locate the first element matching xpath[@key=val], or None."""
        # key/val are interpolated straight into the XPath expression, so
        # restrict them to characters that cannot break out of the predicate.
        assert re.match(r'^[a-zA-Z-]+$', key)
        assert re.match(r'^[a-zA-Z0-9@\s:._-]*$', val)
        return node.find("%s[@%s='%s']" % (xpath, key, val))
    # Fallback for Python 2.6, whose ElementTree has no attribute predicates.
    def find_xpath_attr(node, xpath, key, val):
        # Here comes the crazy part: In 2.6, if the xpath is a unicode,
        # .//node does not match if a node is a direct child of . !
        if isinstance(xpath, compat_str):
            xpath = xpath.encode('ascii')

        # Linear scan over all matches, comparing the attribute by hand.
        for f in node.findall(xpath):
            if f.attrib.get(key) == val:

# On python2.6 the xml.etree.ElementTree.Element methods don't support
# the namespace parameter


def xpath_with_ns(path, ns_map):
    # Expand 'ns:tag' path steps into ElementTree's '{uri}tag' form,
    # looking the prefix up in ns_map.
    components = [c.split(':') for c in path.split('/')]
            replaced.append(c[0])
            replaced.append('{%s}%s' % (ns_map[ns], tag))
    return '/'.join(replaced)
def xpath_text(node, xpath, name=None, fatal=False):
    """Return the text of the element at xpath, or None.

    When fatal is true, a missing element raises ExtractorError instead.
    """
    if sys.version_info < (2, 7):  # Crazy 2.6
        xpath = xpath.encode('ascii')

    if n is None or n.text is None:
        name = xpath if name is None else name
        raise ExtractorError('Could not find XML element %s' % name)
def get_element_by_id(id, html):
    """Return the content of the tag with the specified ID in the passed HTML document"""
    # 'id' mirrors the HTML attribute name; it intentionally shadows the builtin.
    return get_element_by_attribute("id", id, html)


def get_element_by_attribute(attribute, value, html):
    """Return the content of the tag with the specified attribute in the passed HTML document"""

    # Lax regex-based HTML scraping: tolerate other attributes before and
    # after the one we are looking for.
    m = re.search(r'''(?xs)
        (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
        (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
        ''' % (re.escape(attribute), re.escape(value)), html)

    res = m.group('content')

    # Strip surrounding quotes the lax pattern may have captured.
    if res.startswith('"') or res.startswith("'"):

    return unescapeHTML(res)
def clean_html(html):
    """Clean an HTML snippet into a readable string"""

    if html is None:  # Convenience for sanitizing descriptions etc.

    # Newlines in the markup are not significant; <br> and </p><p> are.
    html = html.replace('\n', ' ')
    html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
    html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
    # Strip any remaining tags
    html = re.sub('<.*?>', '', html)
    # Replace html entities
    html = unescapeHTML(html)
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    It returns the tuple (stream, definitive_file_name).
    """
        # '-' means stdout; on Windows, switch it to binary mode first.
        if sys.platform == 'win32':
            msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
        return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
        stream = open(encodeFilename(filename), open_mode)
        return (stream, filename)
    except (IOError, OSError) as err:
        # Permission errors are not fixable by renaming - re-raise.
        if err.errno in (errno.EACCES,):

        # In case of error, try to remove win32 forbidden chars
        alt_filename = os.path.join(
            re.sub('[/<>:"\\|\\\\?\\*]', '#', path_part)
            for path_part in os.path.split(filename)

        if alt_filename == filename:

            # An exception here should be caught in the caller
            # NOTE(review): this reopens the original `filename`, yet the
            # function returns `alt_filename` - verify this should not be
            # open(encodeFilename(alt_filename), open_mode).
            stream = open(encodeFilename(filename), open_mode)
            return (stream, alt_filename)
def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp"""
    timetuple = email.utils.parsedate_tz(timestr)
    if timetuple is not None:
        timestamp = email.utils.mktime_tz(timetuple)
def sanitize_filename(s, restricted=False, is_id=False):
    """Sanitizes a string so it could be used as part of a filename.
    If restricted is set, use a stricter subset of allowed characters.
    Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
    """
    def replace_insane(char):
        # '?', control characters and DEL are always dropped.
        if char == '?' or ord(char) < 32 or ord(char) == 127:
            return '' if restricted else '\''
            return '_-' if restricted else ' -'
        elif char in '\\/|*<>':
        if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
        if restricted and ord(char) > 127:

    # Keep timestamps readable: 12:34:56 -> 12_34_56 before per-char pass.
    s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)
    result = ''.join(map(replace_insane, s))
        # Collapse runs of underscores introduced by the replacements.
        while '__' in result:
            result = result.replace('__', '_')
        result = result.strip('_')
        # Common case of "Foreign band name - English song title"
        if restricted and result.startswith('-_'):
def orderedSet(iterable):
    """ Remove all duplicates from the input iterable """


def _htmlentity_transform(entity):
    """Transforms an HTML entity to a character."""
    # Known non-numeric HTML entity
    if entity in compat_html_entities.name2codepoint:
        return compat_chr(compat_html_entities.name2codepoint[entity])

    # Numeric character references: &#NNN; or &#xHHH;
    mobj = re.match(r'#(x?[0-9]+)', entity)
        numstr = mobj.group(1)
        if numstr.startswith('x'):
            # '0x...' so int() can pick the base from the prefix
            numstr = '0%s' % numstr
        return compat_chr(int(numstr, base))

    # Unknown entity in name, return its literal representation
    return ('&%s;' % entity)
339 assert type(s) == compat_str
342 r'&([^;]+);', lambda m: _htmlentity_transform(m.group(1)), s)
def encodeFilename(s, for_subprocess=False):
    """
    @param s The name of the file
    """

    assert type(s) == compat_str

    # Python 3 has a Unicode API
    if sys.version_info >= (3, 0):

    if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
        # Pass '' directly to use Unicode APIs on Windows 2000 and up
        # (Detecting Windows NT 4 is tricky because 'major >= 4' would
        # match Windows 9x series as well. Besides, NT 4 is obsolete.)
        if not for_subprocess:
            # For subprocess calls, encode with locale encoding
            # Refer to http://stackoverflow.com/a/9951851/35070
            encoding = preferredencoding()
        encoding = sys.getfilesystemencoding()
    # 'ignore' drops characters the target encoding cannot represent.
    return s.encode(encoding, 'ignore')
def encodeArgument(s):
    """Encode a subprocess argument the same way file names are encoded."""
    if isinstance(s, compat_str):
        return encodeFilename(s, True)
    # Legacy code that uses byte strings
    # Uncomment the following line after fixing all post processors
    # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
    return encodeFilename(s.decode('ascii'), True)
def decodeOption(optval):
    """Decode a command-line option value to a unicode string."""
    if isinstance(optval, bytes):
        optval = optval.decode(preferredencoding())

    assert isinstance(optval, compat_str)


def formatSeconds(secs):
    """Format a duration in seconds as clock-style text."""
        # Long form with hours...
        return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
        # ...or short minutes:seconds form.
        return '%d:%02d' % (secs // 60, secs % 60)
def make_HTTPS_handler(params, **kwargs):
    """Build a YoutubeDLHTTPSHandler, honouring the 'nocheckcertificate' param."""
    opts_no_check_certificate = params.get('nocheckcertificate', False)
    if hasattr(ssl, 'create_default_context'):  # Python >= 3.4 or 2.7.9
        context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH)
        if opts_no_check_certificate:
            # Disable all certificate verification.
            context.check_hostname = False
            context.verify_mode = ssl.CERT_NONE
            return YoutubeDLHTTPSHandler(params, context=context, **kwargs)

            # (create_default_context present but HTTPSHandler has no context=)

    if sys.version_info < (3, 2):
        return YoutubeDLHTTPSHandler(params, **kwargs)
        context = ssl.SSLContext(ssl.PROTOCOL_TLSv1)
        context.verify_mode = (ssl.CERT_NONE
                               if opts_no_check_certificate
                               else ssl.CERT_REQUIRED)
        context.set_default_verify_paths()
        return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
class ExtractorError(Exception):
    """Error during info extraction."""

    def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
        """

        # Network-level failures are treated as expected (not youtube-dl bugs).
        if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):

        if video_id is not None:
            msg = video_id + ': ' + msg
            msg += ' (caused by %r)' % cause
            # Unexpected errors get a bug-report footer appended.
            if ytdl_is_updateable():
                update_cmd = 'type youtube-dl -U to update'
                update_cmd = 'see https://yt-dl.org/update on how to update'
            msg += '; please report this issue on https://yt-dl.org/bug .'
            msg += ' Make sure you are using the latest version; %s.' % update_cmd
            msg += ' Be sure to call youtube-dl with the --verbose flag and include its complete output.'
        super(ExtractorError, self).__init__(msg)

        self.exc_info = sys.exc_info()  # preserve original exception
        self.video_id = video_id

    def format_traceback(self):
        """Return the stored traceback rendered as text."""
        if self.traceback is None:
        return ''.join(traceback.format_tb(self.traceback))


class UnsupportedError(ExtractorError):
    def __init__(self, url):
        super(UnsupportedError, self).__init__(
            'Unsupported URL: %s' % url, expected=True)


class RegexNotFoundError(ExtractorError):
    """Error when a regex didn't match"""


class DownloadError(Exception):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """

    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        super(DownloadError, self).__init__(msg)
        self.exc_info = exc_info


class SameFileError(Exception):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """


class PostProcessingError(Exception):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """

    def __init__(self, msg):


class MaxDownloadsReached(Exception):
    """ --max-downloads limit has been reached. """


class UnavailableVideoError(Exception):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """


class ContentTooShortError(Exception):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """

    def __init__(self, downloaded, expected):
        # Both sizes are byte counts.
        self.downloaded = downloaded
        self.expected = expected
def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
    """Instantiate http_class, binding it to the configured source address."""
    hc = http_class(*args, **kwargs)
    source_address = ydl_handler._params.get('source_address')
    if source_address is not None:
        sa = (source_address, 0)
        if hasattr(hc, 'source_address'):  # Python 2.7+
            hc.source_address = sa
            # Older Pythons: replace connect() with one that binds manually.
            def _hc_connect(self, *args, **kwargs):
                sock = compat_socket_create_connection(
                    (self.host, self.port), self.timeout, sa)
                    self.sock = ssl.wrap_socket(
                        sock, self.key_file, self.cert_file,
                        ssl_version=ssl.PROTOCOL_TLSv1)
            hc.connect = functools.partial(_hc_connect, hc)
class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-No-Compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    def __init__(self, params, *args, **kwargs):
        compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
        self._params = params

    def http_open(self, req):
        return self.do_open(functools.partial(
            _create_http_connection, self, compat_http_client.HTTPConnection, False),

            # Try raw-deflate first, then zlib-wrapped deflate.
            return zlib.decompress(data, -zlib.MAX_WBITS)
            return zlib.decompress(data)

    def addinfourl_wrapper(stream, headers, url, code):
        # Compatibility shim for urllibs whose addinfourl takes no code arg.
        if hasattr(compat_urllib_request.addinfourl, 'getcode'):
            return compat_urllib_request.addinfourl(stream, headers, url, code)
        ret = compat_urllib_request.addinfourl(stream, headers, url)

    def http_request(self, req):
        for h, v in std_headers.items():
            # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
            # The dict keys are capitalized because of this bug by urllib
            if h.capitalize() not in req.headers:
        if 'Youtubedl-no-compression' in req.headers:
            if 'Accept-encoding' in req.headers:
                del req.headers['Accept-encoding']
            del req.headers['Youtubedl-no-compression']
        if sys.version_info < (2, 7) and '#' in req.get_full_url():
            # Python 2.6 is brain-dead when it comes to fragments
            req._Request__original = req._Request__original.partition('#')[0]
            req._Request__r_type = req._Request__r_type.partition('#')[0]

    def http_response(self, req, resp):
        # gzip-encoded bodies
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
                uncompressed = io.BytesIO(gz.read())
            except IOError as original_ioerror:
                # There may be junk add the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                for i in range(1, 1024):
                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
                        uncompressed = io.BytesIO(gz.read())
                    raise original_ioerror
            resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        # deflate-encoded bodies
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = io.BytesIO(self.deflate(resp.read()))
            resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg

    https_request = http_request
    https_response = http_response
class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
    """HTTPS handler that threads the params/context through to connections."""

    def __init__(self, params, https_conn_class=None, *args, **kwargs):
        compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
        self._https_conn_class = https_conn_class or compat_http_client.HTTPSConnection
        self._params = params

    def https_open(self, req):
        # Forward context/check_hostname only where the base class has them.
        if hasattr(self, '_context'):  # python > 2.6
            kwargs['context'] = self._context
        if hasattr(self, '_check_hostname'):  # python 3.x
            kwargs['check_hostname'] = self._check_hostname
        return self.do_open(functools.partial(
            _create_http_connection, self, self._https_conn_class, True),
def parse_iso8601(date_str, delimiter='T', timezone=None):
    """ Return a UNIX timestamp from the given date """

        # Detect a trailing 'Z' or '+HH:MM'/'-HHMM' offset in the string.
        r'(\.[0-9]+)?(?:Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
            timezone = datetime.timedelta()
            date_str = date_str[:-len(m.group(0))]
            if not m.group('sign'):
                timezone = datetime.timedelta()
                sign = 1 if m.group('sign') == '+' else -1
                timezone = datetime.timedelta(
                    hours=sign * int(m.group('hours')),
                    minutes=sign * int(m.group('minutes')))
    date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
    # Subtract the offset to get UTC, then convert via timegm.
    dt = datetime.datetime.strptime(date_str, date_format) - timezone
    return calendar.timegm(dt.timetuple())
def unified_strdate(date_str, day_first=True):
    """Return a string with the date in the format YYYYMMDD"""

    # Normalize separators before trying the format table.
    date_str = date_str.replace(',', ' ')
    # %z (UTC offset) is only supported in python>=3.2
    date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str)
    # Remove AM/PM + timezone
    date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)

    format_expressions = [
        '%b %dst %Y %I:%M%p',
        '%b %dnd %Y %I:%M%p',
        '%b %dth %Y %I:%M%p',
        '%Y-%m-%d %H:%M:%S.%f',
        '%Y-%m-%dT%H:%M:%SZ',
        '%Y-%m-%dT%H:%M:%S.%fZ',
        '%Y-%m-%dT%H:%M:%S.%f0Z',
        '%Y-%m-%dT%H:%M:%S.%f',
    # day_first decides between day-first and month-first numeric formats.
    format_expressions.extend([
    format_expressions.extend([
    # First match in the table wins.
    for expression in format_expressions:
            upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
    if upload_date is None:
        # Last resort: RFC 2822 parsing.
        timetuple = email.utils.parsedate_tz(date_str)
            upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
def determine_ext(url, default_ext='unknown_video'):
    """Guess the file extension from a URL, ignoring the query string."""
    guess = url.partition('?')[0].rpartition('.')[2]
    # Only accept purely alphanumeric extensions.
    if re.match(r'^[A-Za-z0-9]+$', guess):
def subtitles_filename(filename, sub_lang, sub_format):
    """Derive the subtitle file name: <base>.<language>.<format>."""
    base = filename.rsplit('.', 1)[0]
    return '.'.join((base, sub_lang, sub_format))
def date_from_str(date_str):
    """
    Return a datetime object from a string in the format YYYYMMDD or
    (now|today)[+-][0-9](day|week|month|year)(s)?"""
    today = datetime.date.today()
    if date_str in ('now', 'today'):
    if date_str == 'yesterday':
        return today - datetime.timedelta(days=1)
    # Relative form: now/today plus or minus an amount of time.
    match = re.match('(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
    if match is not None:
        sign = match.group('sign')
        time = int(match.group('time'))
        unit = match.group('unit')
        # A bad aproximation?
        delta = datetime.timedelta(**{unit: time})
    # Absolute form: plain YYYYMMDD.
    return datetime.datetime.strptime(date_str, "%Y%m%d").date()


def hyphenate_date(date_str):
    """
    Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
    match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
    if match is not None:
        return '-'.join(match.groups())
class DateRange(object):
    """Represents a time interval between two dates"""

    def __init__(self, start=None, end=None):
        """start and end must be strings in the format accepted by date"""
        if start is not None:
            self.start = date_from_str(start)
            # Open-ended: earliest representable date.
            self.start = datetime.datetime.min.date()
            self.end = date_from_str(end)
            # Open-ended: latest representable date.
            self.end = datetime.datetime.max.date()
        if self.start > self.end:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)

        """Returns a range that only contains the given day"""

    def __contains__(self, date):
        """Check if the date is in the range"""
        if not isinstance(date, datetime.date):
            date = date_from_str(date)
        return self.start <= date <= self.end

        return '%s - %s' % (self.start.isoformat(), self.end.isoformat())
    """ Returns the platform name as a compat_str """
    res = platform.platform()
    # platform.platform() may return bytes on some Python 2 setups.
    if isinstance(res, bytes):
        res = res.decode(preferredencoding())

    assert isinstance(res, compat_str)
def _windows_write_string(s, out):
    """ Returns True if the string was written using special methods,
    False if it has yet to be written out."""
    # Adapted from http://stackoverflow.com/a/3259271/35070

    import ctypes.wintypes

        fileno = out.fileno()
    except AttributeError:
        # If the output stream doesn't have a fileno, it's virtual
    except io.UnsupportedOperation:
        # Some strange Windows pseudo files?
    if fileno not in WIN_OUTPUT_IDS:

    GetStdHandle = ctypes.WINFUNCTYPE(
        ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
        (b"GetStdHandle", ctypes.windll.kernel32))
    h = GetStdHandle(WIN_OUTPUT_IDS[fileno])

    WriteConsoleW = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
        ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
        ctypes.wintypes.LPVOID)((b"WriteConsoleW", ctypes.windll.kernel32))
    written = ctypes.wintypes.DWORD(0)

    GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)((b"GetFileType", ctypes.windll.kernel32))
    FILE_TYPE_CHAR = 0x0002
    FILE_TYPE_REMOTE = 0x8000
    GetConsoleMode = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
        ctypes.POINTER(ctypes.wintypes.DWORD))(
        (b"GetConsoleMode", ctypes.windll.kernel32))
    INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value

    def not_a_console(handle):
        # A handle is a real console only if it is a character device
        # and GetConsoleMode succeeds on it.
        if handle == INVALID_HANDLE_VALUE or handle is None:
        return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR
                or GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)

    def next_nonbmp_pos(s):
        # Index of the first character outside the Basic Multilingual Plane.
            return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
        except StopIteration:

        # Write at most 1024 chars, stopping before any non-BMP character.
        count = min(next_nonbmp_pos(s), 1024)
            h, s, count if count else 2, ctypes.byref(written), None)
            raise OSError('Failed to write string')
        if not count:  # We just wrote a non-BMP character
            assert written.value == 2
            assert written.value > 0
            s = s[written.value:]
def write_string(s, out=None, encoding=None):
    """Write the unicode string s to out, handling Windows consoles,
    byte streams and Python 2's mis-reported stream modes."""
    assert type(s) == compat_str

    if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
        if _windows_write_string(s, out):

    if ('b' in getattr(out, 'mode', '') or
            sys.version_info[0] < 3):  # Python 2 lies about mode of sys.stderr
        byt = s.encode(encoding or preferredencoding(), 'ignore')
    elif hasattr(out, 'buffer'):
        # Text stream with an underlying binary buffer: encode ourselves.
        enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
        byt = s.encode(enc, 'ignore')
        out.buffer.write(byt)
def bytes_to_intlist(bs):
    """Convert a byte string to a list of integer byte values."""
    if isinstance(bs[0], int):  # Python 3
    # Python 2: bytes indexing yields 1-char strings, so use ord().
    return [ord(c) for c in bs]


def intlist_to_bytes(xs):
    """Convert a list of integer byte values back to a byte string."""
    return struct_pack('%dB' % len(xs), *xs)
# Cross-platform file locking
if sys.platform == 'win32':
    import ctypes.wintypes

    class OVERLAPPED(ctypes.Structure):
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),

    kernel32 = ctypes.windll.kernel32
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    # Low/high dwords of the region length: lock the whole file.
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive):
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        # Keep the OVERLAPPED alive on the file object for the unlock call.
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)
        handle = msvcrt.get_osfhandle(f.fileno())
        # 0x2 == LOCKFILE_EXCLUSIVE_LOCK
        if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
                          whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Locking file failed: %r' % ctypes.FormatError())

    def _unlock_file(f):
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        if not UnlockFileEx(handle, 0,
                            whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())

    # POSIX: advisory locks via fcntl.flock.
    def _lock_file(f, exclusive):
        fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)

    def _unlock_file(f):
        fcntl.flock(f, fcntl.LOCK_UN)
class locked_file(object):
    """File wrapper that holds an advisory lock for the 'with' duration."""

    def __init__(self, filename, mode, encoding=None):
        assert mode in ['r', 'a', 'w']
        self.f = io.open(filename, mode, encoding=encoding)

    def __enter__(self):
        # Writers ('a'/'w') take an exclusive lock; readers a shared one.
        exclusive = self.mode != 'r'
        _lock_file(self.f, exclusive)

    def __exit__(self, etype, value, traceback):
        _unlock_file(self.f)

    def write(self, *args):
        return self.f.write(*args)

    def read(self, *args):
        return self.f.read(*args)
def get_filesystem_encoding():
    """Return sys.getfilesystemencoding(), defaulting to 'utf-8' when unset."""
    enc = sys.getfilesystemencoding()
    return 'utf-8' if enc is None else enc
def shell_quote(args):
    """Render args as a display-friendly, shell-quoted command line."""
    encoding = get_filesystem_encoding()
        if isinstance(a, bytes):
            # We may get a filename encoded with 'encodeFilename'
            a = a.decode(encoding)
        quoted_args.append(pipes.quote(a))
    return ' '.join(quoted_args)


def takewhile_inclusive(pred, seq):
    """ Like itertools.takewhile, but include the latest evaluated element
    (the first element so that Not pred(e)) """
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use. """
    payload = compat_urllib_parse.urlencode(
        {'__youtubedl_smuggle': json.dumps(data)})
    return '%s#%s' % (url, payload)
def unsmuggle_url(smug_url, default=None):
    """Extract data added by smuggle_url; yields (url, data-or-default)."""
    if '#__youtubedl_smuggle' not in smug_url:
        return smug_url, default
    url, _, sdata = smug_url.rpartition('#')
    jsond = compat_parse_qs(sdata)['__youtubedl_smuggle'][0]
    data = json.loads(jsond)


def format_bytes(bytes):
    """Format a byte count as human-readable text, e.g. '1.00MiB'."""
    if type(bytes) is str:
        bytes = float(bytes)
        exponent = int(math.log(bytes, 1024.0))
    suffix = ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB'][exponent]
    converted = float(bytes) / float(1024 ** exponent)
    return '%.2f%s' % (converted, suffix)
def parse_filesize(s):
    """Parse a human-readable file size like '5.6 MiB' into a byte count."""

    # The lower-case forms are of course incorrect and inofficial,
    # but we support those too

    units_re = '|'.join(re.escape(u) for u in _UNIT_TABLE)
        r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)' % units_re, s)

    # Accept ',' as a decimal separator too.
    num_str = m.group('num').replace(',', '.')
    mult = _UNIT_TABLE[m.group('unit')]
    return int(float(num_str) * mult)


def get_term_width():
    """Best-effort terminal width: $COLUMNS, else `stty size` output."""
    columns = compat_getenv('COLUMNS', None)

        sp = subprocess.Popen(
            stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        out, err = sp.communicate()
        return int(out.split()[1])
def month_by_name(name):
    """ Return the number of a month by (locale-independently) English name """
        'January', 'February', 'March', 'April', 'May', 'June',
        'July', 'August', 'September', 'October', 'November', 'December']
        # 1-based month number.
        return ENGLISH_NAMES.index(name) + 1


def fix_xml_ampersands(xml_str):
    """Replace all the '&' by '&amp;' in XML"""
        # Negative lookahead keeps already-escaped entities untouched.
        r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
def setproctitle(title):
    """Set the process title shown by ps/top (glibc systems only, best effort)."""
    assert isinstance(title, compat_str)
        libc = ctypes.cdll.LoadLibrary("libc.so.6")
    title_bytes = title.encode('utf-8')
    # NOTE(review): confirm the buffer is large enough for the trailing NUL
    # that ctypes appends when assigning .value.
    buf = ctypes.create_string_buffer(len(title_bytes))
    buf.value = title_bytes
        # 15 == PR_SET_NAME
        libc.prctl(15, buf, 0, 0, 0)
    except AttributeError:
        return  # Strange libc, just skip this


def remove_start(s, start):
    """Strip the prefix start from s, if present."""
    if s.startswith(start):
        return s[len(start):]


def remove_end(s, end):
    """Strip the suffix end from s, if present."""
        return s[:-len(end)]
def url_basename(url):
    """Return the final component of the URL's path."""
    parsed_path = compat_urlparse.urlparse(url).path
    components = parsed_path.strip('/').split('/')
    return components[-1]
class HEADRequest(compat_urllib_request.Request):
    # A Request whose HTTP method is HEAD instead of GET.
    def get_method(self):


def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    """Coerce v to int scaled by invscale/scale; default when v is None."""
        # Optionally read the value from an attribute of v first.
        v = getattr(v, get_attr, None)
    return default if v is None else (int(v) * invscale // scale)
def str_or_none(v, default=None):
    """Stringify v, or return default when v is None."""
    if v is None:
        return default
    return compat_str(v)
def str_to_int(int_str):
    """ A more relaxed version of int_or_none """
    # Drop thousands separators and stray '+' signs before parsing.
    int_str = re.sub(r'[,\.\+]', '', int_str)
def float_or_none(v, scale=1, invscale=1, default=None):
    """Safely convert v to a float, scaled by invscale/scale.

    Returns default when v is None or cannot be converted to a float,
    mirroring the tolerant behavior implied by the *_or_none naming.
    """
    if v is None:
        return default
    try:
        return float(v) * invscale / scale
    except (ValueError, TypeError):
        # Unparsable input (e.g. 'N/A') falls back to the default instead
        # of propagating an exception to extractors.
        return default
def parse_duration(s):
    """Parse a duration like '1:23:45' or '2h 3min 4.5s' into seconds."""
    if not isinstance(s, compat_basestring):
        (?P<only_mins>[0-9.]+)\s*(?:mins?|minutes?)\s*|
        (?P<only_hours>[0-9.]+)\s*(?:hours?)|
        (?:(?P<days>[0-9]+)\s*(?:[:d]|days?)\s*)?
        (?P<hours>[0-9]+)\s*(?:[:h]|hours?)\s*
        (?P<mins>[0-9]+)\s*(?:[:m]|mins?|minutes?)\s*
        (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*(?:s|secs?|seconds?)?
    # Unit-only forms short-circuit.
    if m.group('only_mins'):
        return float_or_none(m.group('only_mins'), invscale=60)
    if m.group('only_hours'):
        return float_or_none(m.group('only_hours'), invscale=60 * 60)
    # Otherwise accumulate days/hours/minutes/seconds/ms.
    res += int(m.group('secs'))
        res += int(m.group('mins')) * 60
    if m.group('hours'):
        res += int(m.group('hours')) * 60 * 60
        res += int(m.group('days')) * 24 * 60 * 60
        res += float(m.group('ms'))
def prepend_extension(filename, ext):
    """Insert ext before the real extension: 'a.mp4' -> 'a.<ext>.mp4'."""
    base, real_ext = os.path.splitext(filename)
    return '%s.%s%s' % (base, ext, real_ext)
def check_executable(exe, args=[]):
    """ Checks if the given binary is installed somewhere in PATH, and returns its name.
    args can be a list of arguments for a short output (like -version) """
    # NOTE(review): mutable default for args - confirm it is never mutated.
        subprocess.Popen([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()


def get_exe_version(exe, args=['--version'],
                    version_re=None, unrecognized='present'):
    """ Returns the version of the specified executable,
    or False if the executable is not present """
        out, _ = subprocess.Popen(
            stdout=subprocess.PIPE, stderr=subprocess.STDOUT).communicate()
    if isinstance(out, bytes):  # Python 2.x
        out = out.decode('ascii', 'ignore')
    return detect_exe_version(out, version_re, unrecognized)


def detect_exe_version(output, version_re=None, unrecognized='present'):
    """Extract a version string from program output via version_re."""
    assert isinstance(output, compat_str)
    if version_re is None:
        version_re = r'version\s+([-0-9._a-zA-Z]+)'
    m = re.search(version_re, output)
1349 class PagedList(object):
1351 # This is only useful for tests
1352 return len(self.getslice())
class OnDemandPagedList(PagedList):
    """PagedList that fetches pages lazily, one at a time, as getslice() walks them."""

    def __init__(self, pagefunc, pagesize):
        # pagefunc(pagenum) must return an iterable with that page's entries.
        self._pagefunc = pagefunc
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        res = []
        for pagenum in itertools.count(start // self._pagesize):
            firstid = pagenum * self._pagesize
            nextfirstid = pagenum * self._pagesize + self._pagesize
            if start >= nextfirstid:
                continue

            page_results = list(self._pagefunc(pagenum))

            # Offset into this page where the requested slice begins.
            startv = (
                start % self._pagesize
                if firstid <= start < nextfirstid
                else 0)

            # Offset into this page where the requested slice ends (exclusive).
            endv = (
                ((end - 1) % self._pagesize) + 1
                if (end is not None and firstid <= end <= nextfirstid)
                else None)

            if startv != 0 or endv is not None:
                page_results = page_results[startv:endv]
            res.extend(page_results)

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            if len(page_results) + startv < self._pagesize:
                break

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == nextfirstid:
                break
        return res
class InAdvancePagedList(PagedList):
    """PagedList for which the total number of pages is known in advance."""

    def __init__(self, pagefunc, pagecount, pagesize):
        self._pagefunc = pagefunc
        self._pagecount = pagecount
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        # NOTE(review): control flow restored around truncated lines -- verify
        # against upstream history.
        res = []
        start_page = start // self._pagesize
        end_page = (
            self._pagecount if end is None else (end // self._pagesize + 1))
        skip_elems = start - start_page * self._pagesize
        only_more = None if end is None else end - start
        for pagenum in range(start_page, end_page):
            page = list(self._pagefunc(pagenum))
            if skip_elems:
                # Only the first fetched page needs its head trimmed.
                page = page[skip_elems:]
                skip_elems = None
            if only_more is not None:
                if len(page) < only_more:
                    only_more -= len(page)
                else:
                    # This page contains the last wanted entry; trim and stop.
                    page = page[:only_more]
                    res.extend(page)
                    break
            res.extend(page)
        return res
def uppercase_escape(s):
    r"""Decode \UXXXXXXXX escape sequences in `s` into the characters they encode."""
    unicode_escape = codecs.getdecoder('unicode_escape')
    return re.sub(
        r'\\U[0-9a-fA-F]{8}',
        lambda m: unicode_escape(m.group(0))[0],
        s)
def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986"""
    # On Python 2, quote() needs a byte string; encode unicode input first.
    if sys.version_info < (3, 0) and isinstance(s, compat_str):
        s = s.encode('utf-8')
    safe_chars = b"%/;:@&=+$,!~*'()?#[]"
    return compat_urllib_parse.quote(s, safe_chars)
def escape_url(url):
    """Escape URL as suggested by RFC 3986"""
    url_parsed = compat_urllib_parse_urlparse(url)
    # Escape each component separately so the URL structure (/, ?, #) survives,
    # then reassemble the full URL string.
    return url_parsed._replace(
        path=escape_rfc3986(url_parsed.path),
        params=escape_rfc3986(url_parsed.params),
        query=escape_rfc3986(url_parsed.query),
        fragment=escape_rfc3986(url_parsed.fragment)
    ).geturl()
# Detect whether struct.pack accepts a unicode format string; if not
# (Python 2.6 and some 2.7 builds), install shims that encode it first.
try:
    struct.pack('!I', 0)
except TypeError:
    # In Python 2.6 (and some 2.7 versions), struct requires a bytes argument
    def struct_pack(spec, *args):
        if isinstance(spec, compat_str):
            spec = spec.encode('ascii')
        return struct.pack(spec, *args)

    def struct_unpack(spec, *args):
        if isinstance(spec, compat_str):
            spec = spec.encode('ascii')
        return struct.unpack(spec, *args)
else:
    struct_pack = struct.pack
    struct_unpack = struct.unpack
def read_batch_urls(batch_fd):
    """Read URLs from a batch-file object, one per line.

    Skips empty lines and comments (lines starting with '#', ';' or ']'),
    strips a leading UTF-8 BOM, and closes batch_fd when done.
    """
    def fixup(url):
        if not isinstance(url, compat_str):
            url = url.decode('utf-8', 'replace')
        BOM_UTF8 = '\xef\xbb\xbf'
        if url.startswith(BOM_UTF8):
            url = url[len(BOM_UTF8):]
        url = url.strip()
        if url.startswith(('#', ';', ']')):
            # Comment line -> filtered out below.
            return False
        return url

    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]
def urlencode_postdata(*args, **kargs):
    """URL-encode the given arguments and return ASCII bytes suitable for a POST body."""
    encoded = compat_urllib_parse.urlencode(*args, **kargs)
    return encoded.encode('ascii')
# Element.iter was added in ElementTree 1.3 (Python 2.7); fall back to
# findall('.//*') on older versions (note: the fallback omits the root node).
try:
    etree_iter = xml.etree.ElementTree.Element.iter
except AttributeError:  # Python <=2.6
    etree_iter = lambda n: n.findall('.//*')
def parse_xml(s):
    """Parse an XML string into an ElementTree element, ignoring doctypes
    and normalizing text nodes to unicode on Python 2."""
    class TreeBuilder(xml.etree.ElementTree.TreeBuilder):
        def doctype(self, name, pubid, system):
            pass  # Ignore doctypes

    parser = xml.etree.ElementTree.XMLParser(target=TreeBuilder())
    kwargs = {'parser': parser} if sys.version_info >= (2, 7) else {}
    tree = xml.etree.ElementTree.XML(s.encode('utf-8'), **kwargs)
    # Fix up XML parser in Python 2.x
    if sys.version_info < (3, 0):
        for n in etree_iter(tree):
            if n.text is not None:
                if not isinstance(n.text, compat_str):
                    n.text = n.text.decode('utf-8')
    return tree
def parse_age_limit(s):
    """Parse an age limit like '18' or '16+' into an int; fall back to the
    US_RATINGS table for rating names; None in -> None out."""
    if s is None:
        # Guard: re.match would raise TypeError on None.
        return None
    m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
    return int(m.group('age')) if m else US_RATINGS.get(s, None)
def strip_jsonp(code):
    """Strip a JSONP wrapper (callback name, parentheses, trailing ';' and
    line comments) and return the raw JSON payload."""
    return re.sub(
        r'(?s)^[a-zA-Z0-9_]+\s*\(\s*(.*)\);?\s*?(?://[^\n]*)*$', r'\1', code)
def js_to_json(code):
    """Convert a JavaScript-ish object literal into valid JSON text.

    Quotes bare identifiers, converts single-quoted strings to double-quoted,
    and drops trailing commas before ']'.
    NOTE(review): inner escape table restored around truncated lines -- verify
    against upstream history.
    """
    def fix_kv(m):
        v = m.group(0)
        # Literals and double-quoted strings are already valid JSON.
        if v in ('true', 'false', 'null'):
            return v
        if v.startswith('"'):
            return v
        # Single-quoted strings: strip quotes and re-escape for JSON.
        if v.startswith("'"):
            v = v[1:-1]
            v = re.sub(r"\\\\|\\'|\"", lambda m: {
                '\\\\': '\\\\',
                "\\'": "'",
                '"': '\\"',
            }[m.group(0)], v)
        # Bare identifiers (e.g. object keys) get double quotes.
        return '"%s"' % v

    res = re.sub(r'''(?x)
        "(?:[^"\\]*(?:\\\\|\\")?)*"|
        '(?:[^'\\]*(?:\\\\|\\')?)*'|
        [a-zA-Z_][.a-zA-Z_0-9]*
        ''', fix_kv, code)
    # Drop trailing commas before a closing bracket.
    res = re.sub(r',(\s*\])', lambda m: m.group(1), res)
    return res
def qualities(quality_ids):
    """ Get a numeric quality value out of a list of possible values """
    def q(qid):
        try:
            # Position in the list is the quality rank.
            return quality_ids.index(qid)
        except ValueError:
            # Unknown quality id ranks below everything known.
            return -1
    return q
# Default output filename template: "<title>-<id>.<ext>".
DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
def limit_length(s, length):
    """ Add ellipses to overly long strings """
    if s is None:
        return None
    ELLIPSES = '...'
    if len(s) > length:
        # Truncate so that the result, including the ellipses, fits in `length`.
        return s[:length - len(ELLIPSES)] + ELLIPSES
    return s
def version_tuple(v):
    """Split a version string on dots/dashes into a tuple of ints (for comparison)."""
    components = re.split(r'[-.]', v)
    return tuple(map(int, components))
def is_outdated_version(version, limit, assume_new=True):
    """Return True if `version` is strictly older than `limit`.

    For empty/unparseable versions the verdict is `not assume_new`, i.e. by
    default they are treated as new (not outdated).
    """
    if not version:
        return not assume_new
    try:
        return version_tuple(version) < version_tuple(limit)
    except ValueError:
        # Non-numeric component -> cannot compare.
        return not assume_new
def ytdl_is_updateable():
    """ Returns if youtube-dl can be updated with -U """
    from zipimport import zipimporter
    # Updateable when running from a zip bundle or a frozen (py2exe) build.
    loader = globals().get('__loader__')
    if isinstance(loader, zipimporter):
        return True
    return hasattr(sys, 'frozen')
def args_to_str(args):
    """Return a shell-quoted, space-joined string representation of a subprocess command."""
    return ' '.join(map(shlex_quote, args))
def urlhandle_detect_ext(url_handle):
    """Guess a file extension from a URL response handle, preferring the
    Content-Disposition filename and falling back to the Content-Type subtype.

    NOTE(review): try/if scaffolding restored around truncated lines -- verify
    against upstream history.
    """
    try:
        url_handle.headers
        getheader = lambda h: url_handle.headers[h]
    except AttributeError:  # Python < 3
        getheader = url_handle.info().getheader

    cd = getheader('Content-Disposition')
    if cd:
        m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
        if m:
            e = determine_ext(m.group('filename'), default_ext=None)
            if e:
                return e

    return getheader('Content-Type').split("/")[1]
def age_restricted(content_limit, age_limit):
    """ Returns True iff the content should be blocked """
    if age_limit is None:  # No limit set
        return False
    if content_limit is None:
        return False  # Content available for everyone
    return age_limit < content_limit
def is_html(first_bytes):
    """ Detect whether a file contains HTML by examining its first bytes. """
    # Byte-order marks, longest first so UTF-32 is not mistaken for UTF-16.
    BOMS = [
        (b'\xef\xbb\xbf', 'utf-8'),
        (b'\x00\x00\xfe\xff', 'utf-32-be'),
        (b'\xff\xfe\x00\x00', 'utf-32-le'),
        (b'\xff\xfe', 'utf-16-le'),
        (b'\xfe\xff', 'utf-16-be'),
    ]
    for bom, enc in BOMS:
        if first_bytes.startswith(bom):
            s = first_bytes[len(bom):].decode(enc, 'replace')
            break
    else:
        # No BOM -> assume UTF-8.
        s = first_bytes.decode('utf-8', 'replace')

    return re.match(r'^\s*<', s)
def determine_protocol(info_dict):
    """Determine the download protocol for an info dict: an explicit 'protocol'
    wins, then the URL scheme prefix, then the file extension, then the parsed
    URL scheme."""
    protocol = info_dict.get('protocol')
    if protocol is not None:
        return protocol

    url = info_dict['url']
    if url.startswith('rtmp'):
        return 'rtmp'
    elif url.startswith('mms'):
        return 'mms'
    elif url.startswith('rtsp'):
        return 'rtsp'

    ext = determine_ext(url)
    if ext == 'm3u8':
        return 'm3u8'
    elif ext == 'f4m':
        return 'f4m'

    return compat_urllib_parse_urlparse(url).scheme
def render_table(header_row, data):
    """ Render a list of rows, each as a list of values """
    rows = [header_row] + data
    # Width of each column = widest cell in that column.
    widths = [max(len(compat_str(cell)) for cell in column) for column in zip(*rows)]
    # Left-align every column but the last, padded to width + 1.
    fmt = ' '.join('%-' + compat_str(w + 1) + 's' for w in widths[:-1]) + '%s'
    return '\n'.join(fmt % tuple(r) for r in rows)
1685 def _match_one(filter_part, dct):
1686 COMPARISON_OPERATORS = {
1694 operator_rex = re.compile(r'''(?x)\s*
1696 \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
1698 (?P<intval>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)|
1699 (?P<strval>(?![0-9.])[a-z0-9A-Z]*)
1702 ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
1703 m = operator_rex.search(filter_part)
1705 op = COMPARISON_OPERATORS[m.group('op')]
1706 if m.group('strval') is not None:
1707 if m.group('op') not in ('=', '!='):
1709 'Operator %s does not support string values!' % m.group('op'))
1710 comparison_value = m.group('strval')
1713 comparison_value = int(m.group('intval'))
1715 comparison_value = parse_filesize(m.group('intval'))
1716 if comparison_value is None:
1717 comparison_value = parse_filesize(m.group('intval') + 'B')
1718 if comparison_value is None:
1720 'Invalid integer value %r in filter part %r' % (
1721 m.group('intval'), filter_part))
1722 actual_value = dct.get(m.group('key'))
1723 if actual_value is None:
1724 return m.group('none_inclusive')
1725 return op(actual_value, comparison_value)
1728 '': lambda v: v is not None,
1729 '!': lambda v: v is None,
1731 operator_rex = re.compile(r'''(?x)\s*
1732 (?P<op>%s)\s*(?P<key>[a-z_]+)
1734 ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
1735 m = operator_rex.search(filter_part)
1737 op = UNARY_OPERATORS[m.group('op')]
1738 actual_value = dct.get(m.group('key'))
1739 return op(actual_value)
1741 raise ValueError('Invalid filter part %r' % filter_part)
def match_str(filter_str, dct):
    """ Filter a dictionary with a simple string syntax. Returns True (=passes filter) or false """
    # '&'-separated parts are AND-ed together.
    return all(
        _match_one(filter_part, dct) for filter_part in filter_str.split('&'))
def match_filter_func(filter_str):
    """Build a match-filter callable: it returns None when the info dict
    passes `filter_str`, else a human-readable skip message."""
    def _match_func(info_dict):
        if match_str(filter_str, info_dict):
            return None
        else:
            video_title = info_dict.get('title', info_dict.get('id', 'video'))
            return '%s does not pass filter %s, skipping ..' % (video_title, filter_str)
    return _match_func