2 # -*- coding: utf-8 -*-
4 from __future__ import unicode_literals
30 import xml.etree.ElementTree
41 compat_urllib_parse_urlparse,
42 compat_urllib_request,
49 # This is not clearly defined otherwise
50 compiled_regex_type = type(re.compile(''))
53 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0 (Chrome)',
54 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
55 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
56 'Accept-Encoding': 'gzip, deflate',
57 'Accept-Language': 'en-us,en;q=0.5',
61 def preferredencoding():
62 """Get preferred encoding.
64 Returns the best encoding scheme for the system, based on
65 locale.getpreferredencoding() and some further tweaks.
68 pref = locale.getpreferredencoding()
76 def write_json_file(obj, fn):
77 """ Encode obj as JSON and write it to fn, atomically if possible """
79 fn = encodeFilename(fn)
80 if sys.version_info < (3, 0) and sys.platform != 'win32':
81 encoding = get_filesystem_encoding()
82 # os.path.basename returns a bytes object, but NamedTemporaryFile
83 # will fail if the filename contains non ascii characters unless we
84 # use a unicode object
85 path_basename = lambda f: os.path.basename(fn).decode(encoding)
86 # the same for os.path.dirname
87 path_dirname = lambda f: os.path.dirname(fn).decode(encoding)
89 path_basename = os.path.basename
90 path_dirname = os.path.dirname
94 'prefix': path_basename(fn) + '.',
95 'dir': path_dirname(fn),
99 # In Python 2.x, json.dump expects a bytestream.
100 # In Python 3.x, it writes to a character stream
101 if sys.version_info < (3, 0):
109 tf = tempfile.NamedTemporaryFile(**args)
114 if sys.platform == 'win32':
115 # Need to remove existing file on Windows, else os.rename raises
116 # WindowsError or FileExistsError.
121 os.rename(tf.name, fn)
130 if sys.version_info >= (2, 7):
131 def find_xpath_attr(node, xpath, key, val):
132 """ Find the xpath xpath[@key=val] """
133 assert re.match(r'^[a-zA-Z-]+$', key)
134 assert re.match(r'^[a-zA-Z0-9@\s:._-]*$', val)
135 expr = xpath + "[@%s='%s']" % (key, val)
136 return node.find(expr)
138 def find_xpath_attr(node, xpath, key, val):
139 # Here comes the crazy part: In 2.6, if the xpath is a unicode,
140 # .//node does not match if a node is a direct child of . !
141 if isinstance(xpath, unicode):
142 xpath = xpath.encode('ascii')
144 for f in node.findall(xpath):
145 if f.attrib.get(key) == val:
149 # On python2.6 the xml.etree.ElementTree.Element methods don't support
150 # the namespace parameter
153 def xpath_with_ns(path, ns_map):
154 components = [c.split(':') for c in path.split('/')]
158 replaced.append(c[0])
161 replaced.append('{%s}%s' % (ns_map[ns], tag))
162 return '/'.join(replaced)
165 def xpath_text(node, xpath, name=None, fatal=False):
166 if sys.version_info < (2, 7): # Crazy 2.6
167 xpath = xpath.encode('ascii')
170 if n is None or n.text is None:
172 name = xpath if name is None else name
173 raise ExtractorError('Could not find XML element %s' % name)
def get_element_by_id(id, html):
    """Return the content of the tag whose ``id`` attribute equals *id*.

    This is a convenience wrapper that delegates to
    get_element_by_attribute() with the attribute name fixed to "id".
    """
    return get_element_by_attribute(attribute="id", value=id, html=html)
184 def get_element_by_attribute(attribute, value, html):
185 """Return the content of the tag with the specified attribute in the passed HTML document"""
187 m = re.search(r'''(?xs)
189 (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
191 (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
195 ''' % (re.escape(attribute), re.escape(value)), html)
199 res = m.group('content')
201 if res.startswith('"') or res.startswith("'"):
204 return unescapeHTML(res)
207 def clean_html(html):
208 """Clean an HTML snippet into a readable string"""
210 html = html.replace('\n', ' ')
211 html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
212 html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
214 html = re.sub('<.*?>', '', html)
215 # Replace html entities
216 html = unescapeHTML(html)
220 def sanitize_open(filename, open_mode):
221 """Try to open the given filename, and slightly tweak it if this fails.
223 Attempts to open the given filename. If this fails, it tries to change
224 the filename slightly, step by step, until it's either able to open it
225 or it fails and raises a final exception, like the standard open()
228 It returns the tuple (stream, definitive_file_name).
232 if sys.platform == 'win32':
234 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
235 return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
236 stream = open(encodeFilename(filename), open_mode)
237 return (stream, filename)
238 except (IOError, OSError) as err:
239 if err.errno in (errno.EACCES,):
242 # In case of error, try to remove win32 forbidden chars
243 alt_filename = os.path.join(
244 re.sub('[/<>:"\\|\\\\?\\*]', '#', path_part)
245 for path_part in os.path.split(filename)
247 if alt_filename == filename:
250 # An exception here should be caught in the caller
251 stream = open(encodeFilename(filename), open_mode)
252 return (stream, alt_filename)
255 def timeconvert(timestr):
256 """Convert RFC 2822 defined time string into system timestamp"""
258 timetuple = email.utils.parsedate_tz(timestr)
259 if timetuple is not None:
260 timestamp = email.utils.mktime_tz(timetuple)
264 def sanitize_filename(s, restricted=False, is_id=False):
265 """Sanitizes a string so it could be used as part of a filename.
266 If restricted is set, use a stricter subset of allowed characters.
267 Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
269 def replace_insane(char):
270 if char == '?' or ord(char) < 32 or ord(char) == 127:
273 return '' if restricted else '\''
275 return '_-' if restricted else ' -'
276 elif char in '\\/|*<>':
278 if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
280 if restricted and ord(char) > 127:
284 result = ''.join(map(replace_insane, s))
286 while '__' in result:
287 result = result.replace('__', '_')
288 result = result.strip('_')
289 # Common case of "Foreign band name - English song title"
290 if restricted and result.startswith('-_'):
297 def orderedSet(iterable):
298 """ Remove all duplicates from the input iterable """
306 def _htmlentity_transform(entity):
307 """Transforms an HTML entity to a character."""
308 # Known non-numeric HTML entity
309 if entity in compat_html_entities.name2codepoint:
310 return compat_chr(compat_html_entities.name2codepoint[entity])
312 mobj = re.match(r'#(x?[0-9]+)', entity)
314 numstr = mobj.group(1)
315 if numstr.startswith('x'):
317 numstr = '0%s' % numstr
320 return compat_chr(int(numstr, base))
322 # Unknown entity in name, return its literal representation
323 return ('&%s;' % entity)
329 assert type(s) == compat_str
332 r'&([^;]+);', lambda m: _htmlentity_transform(m.group(1)), s)
335 def encodeFilename(s, for_subprocess=False):
337 @param s The name of the file
340 assert type(s) == compat_str
342 # Python 3 has a Unicode API
343 if sys.version_info >= (3, 0):
346 if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
347 # Pass '' directly to use Unicode APIs on Windows 2000 and up
348 # (Detecting Windows NT 4 is tricky because 'major >= 4' would
349 # match Windows 9x series as well. Besides, NT 4 is obsolete.)
350 if not for_subprocess:
353 # For subprocess calls, encode with locale encoding
354 # Refer to http://stackoverflow.com/a/9951851/35070
355 encoding = preferredencoding()
357 encoding = sys.getfilesystemencoding()
360 return s.encode(encoding, 'ignore')
def encodeArgument(s):
    """Encode a command-line argument via encodeFilename(s, True).

    Text strings (compat_str) are passed straight through.  Anything else
    is treated as a legacy byte string and decoded as ASCII first.
    """
    if isinstance(s, compat_str):
        return encodeFilename(s, True)
    # Legacy code that uses byte strings.
    # TODO: once all post processors are fixed, replace this fallback with
    # an assertion that s is a compat_str.
    decoded = s.decode('ascii')
    return encodeFilename(decoded, True)
372 def decodeOption(optval):
375 if isinstance(optval, bytes):
376 optval = optval.decode(preferredencoding())
378 assert isinstance(optval, compat_str)
382 def formatSeconds(secs):
384 return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
386 return '%d:%02d' % (secs // 60, secs % 60)
391 def make_HTTPS_handler(opts_no_check_certificate, **kwargs):
392 if sys.version_info < (3, 2):
395 class HTTPSConnectionV3(httplib.HTTPSConnection):
396 def __init__(self, *args, **kwargs):
397 httplib.HTTPSConnection.__init__(self, *args, **kwargs)
400 sock = socket.create_connection((self.host, self.port), self.timeout)
401 if getattr(self, '_tunnel_host', False):
405 self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_TLSv1)
407 self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_SSLv23)
409 class HTTPSHandlerV3(compat_urllib_request.HTTPSHandler):
410 def https_open(self, req):
411 return self.do_open(HTTPSConnectionV3, req)
412 return HTTPSHandlerV3(**kwargs)
413 elif hasattr(ssl, 'create_default_context'): # Python >= 3.4
414 context = ssl.create_default_context(ssl.Purpose.CLIENT_AUTH)
415 context.options &= ~ssl.OP_NO_SSLv3 # Allow older, not-as-secure SSLv3
416 if opts_no_check_certificate:
417 context.verify_mode = ssl.CERT_NONE
418 return compat_urllib_request.HTTPSHandler(context=context, **kwargs)
420 context = ssl.SSLContext(ssl.PROTOCOL_SSLv23)
421 context.verify_mode = (ssl.CERT_NONE
422 if opts_no_check_certificate
423 else ssl.CERT_REQUIRED)
424 context.set_default_verify_paths()
426 context.load_default_certs()
427 except AttributeError:
429 return compat_urllib_request.HTTPSHandler(context=context, **kwargs)
432 class ExtractorError(Exception):
433 """Error during info extraction."""
435 def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
436 """ tb, if given, is the original traceback (so that it can be printed out).
437 If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
440 if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
442 if video_id is not None:
443 msg = video_id + ': ' + msg
445 msg += ' (caused by %r)' % cause
447 if ytdl_is_updateable():
448 update_cmd = 'type youtube-dl -U to update'
450 update_cmd = 'see https://yt-dl.org/update on how to update'
451 msg += '; please report this issue on https://yt-dl.org/bug .'
452 msg += ' Make sure you are using the latest version; %s.' % update_cmd
453 msg += ' Be sure to call youtube-dl with the --verbose flag and include its complete output.'
454 super(ExtractorError, self).__init__(msg)
457 self.exc_info = sys.exc_info() # preserve original exception
459 self.video_id = video_id
461 def format_traceback(self):
462 if self.traceback is None:
464 return ''.join(traceback.format_tb(self.traceback))
class RegexNotFoundError(ExtractorError):
    """Raised when a regular expression did not match."""
    pass
class DownloadError(Exception):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors.
    """

    def __init__(self, msg, exc_info=None):
        """Store the message and, optionally, the original exception info.

        exc_info, if given, is the original exception that caused the
        trouble (as returned by sys.exc_info()).
        """
        # Keep the originating exception triple (or None) for callers that
        # want to report the underlying failure.
        self.exc_info = exc_info
        super(DownloadError, self).__init__(msg)
486 class SameFileError(Exception):
487 """Same File exception.
489 This exception will be thrown by FileDownloader objects if they detect
490 multiple files would have to be downloaded to the same file on disk.
495 class PostProcessingError(Exception):
496 """Post Processing exception.
498 This exception may be raised by PostProcessor's .run() method to
499 indicate an error in the postprocessing task.
502 def __init__(self, msg):
class MaxDownloadsReached(Exception):
    """Signals that the --max-downloads limit has been reached."""
    pass
511 class UnavailableVideoError(Exception):
512 """Unavailable Format exception.
514 This exception will be thrown when a video is requested
515 in a format that is not available for that video.
520 class ContentTooShortError(Exception):
521 """Content Too Short exception.
523 This exception may be raised by FileDownloader objects when a file they
524 download is too small for what the server announced first, indicating
525 the connection was probably interrupted.
531 def __init__(self, downloaded, expected):
532 self.downloaded = downloaded
533 self.expected = expected
536 class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
537 """Handler for HTTP requests and responses.
539 This class, when installed with an OpenerDirector, automatically adds
540 the standard headers to every HTTP request and handles gzipped and
541 deflated responses from web servers. If compression is to be avoided in
542 a particular request, the original request in the program code only has
543 to include the HTTP header "Youtubedl-No-Compression", which will be
544 removed before making the real request.
546 Part of this code was copied from:
548 http://techknack.net/python-urllib2-handlers/
550 Andrew Rowls, the author of that code, agreed to release it to the
557 return zlib.decompress(data, -zlib.MAX_WBITS)
559 return zlib.decompress(data)
562 def addinfourl_wrapper(stream, headers, url, code):
563 if hasattr(compat_urllib_request.addinfourl, 'getcode'):
564 return compat_urllib_request.addinfourl(stream, headers, url, code)
565 ret = compat_urllib_request.addinfourl(stream, headers, url)
569 def http_request(self, req):
570 for h, v in std_headers.items():
571 if h not in req.headers:
573 if 'Youtubedl-no-compression' in req.headers:
574 if 'Accept-encoding' in req.headers:
575 del req.headers['Accept-encoding']
576 del req.headers['Youtubedl-no-compression']
577 if 'Youtubedl-user-agent' in req.headers:
578 if 'User-agent' in req.headers:
579 del req.headers['User-agent']
580 req.headers['User-agent'] = req.headers['Youtubedl-user-agent']
581 del req.headers['Youtubedl-user-agent']
583 if sys.version_info < (2, 7) and '#' in req.get_full_url():
584 # Python 2.6 is brain-dead when it comes to fragments
585 req._Request__original = req._Request__original.partition('#')[0]
586 req._Request__r_type = req._Request__r_type.partition('#')[0]
590 def http_response(self, req, resp):
593 if resp.headers.get('Content-encoding', '') == 'gzip':
594 content = resp.read()
595 gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
597 uncompressed = io.BytesIO(gz.read())
598 except IOError as original_ioerror:
599 # There may be junk at the end of the file
600 # See http://stackoverflow.com/q/4928560/35070 for details
601 for i in range(1, 1024):
603 gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
604 uncompressed = io.BytesIO(gz.read())
609 raise original_ioerror
610 resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
611 resp.msg = old_resp.msg
613 if resp.headers.get('Content-encoding', '') == 'deflate':
614 gz = io.BytesIO(self.deflate(resp.read()))
615 resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
616 resp.msg = old_resp.msg
619 https_request = http_request
620 https_response = http_response
623 def parse_iso8601(date_str, delimiter='T'):
624 """ Return a UNIX timestamp from the given date """
630 r'(\.[0-9]+)?(?:Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
633 timezone = datetime.timedelta()
635 date_str = date_str[:-len(m.group(0))]
636 if not m.group('sign'):
637 timezone = datetime.timedelta()
639 sign = 1 if m.group('sign') == '+' else -1
640 timezone = datetime.timedelta(
641 hours=sign * int(m.group('hours')),
642 minutes=sign * int(m.group('minutes')))
643 date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
644 dt = datetime.datetime.strptime(date_str, date_format) - timezone
645 return calendar.timegm(dt.timetuple())
648 def unified_strdate(date_str, day_first=True):
649 """Return a string with the date in the format YYYYMMDD"""
655 date_str = date_str.replace(',', ' ')
656 # %z (UTC offset) is only supported in python>=3.2
657 date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str)
658 # Remove AM/PM + timezone
659 date_str = re.sub(r'(?i)\s*(?:AM|PM)\s+[A-Z]+', '', date_str)
661 format_expressions = [
666 '%b %dst %Y %I:%M%p',
667 '%b %dnd %Y %I:%M%p',
668 '%b %dth %Y %I:%M%p',
676 '%Y-%m-%d %H:%M:%S.%f',
679 '%Y-%m-%dT%H:%M:%SZ',
680 '%Y-%m-%dT%H:%M:%S.%fZ',
681 '%Y-%m-%dT%H:%M:%S.%f0Z',
683 '%Y-%m-%dT%H:%M:%S.%f',
687 format_expressions.extend([
691 format_expressions.extend([
694 for expression in format_expressions:
696 upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
699 if upload_date is None:
700 timetuple = email.utils.parsedate_tz(date_str)
702 upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
706 def determine_ext(url, default_ext='unknown_video'):
709 guess = url.partition('?')[0].rpartition('.')[2]
710 if re.match(r'^[A-Za-z0-9]+$', guess):
def subtitles_filename(filename, sub_lang, sub_format):
    """Build the subtitles file name that belongs to a media file name.

    The extension of *filename* (if any) is replaced by
    ``.<sub_lang>.<sub_format>``.

    os.path.splitext is used instead of splitting on the last dot so that
    a dot inside a directory component (``dir.v1/video``) or a leading dot
    (``.hidden``) is not mistaken for the start of an extension.
    """
    root = os.path.splitext(filename)[0]
    return root + '.' + sub_lang + '.' + sub_format
720 def date_from_str(date_str):
722 Return a datetime object from a string in the format YYYYMMDD or
723 (now|today)[+-][0-9](day|week|month|year)(s)?"""
724 today = datetime.date.today()
725 if date_str in ('now', 'today'):
727 if date_str == 'yesterday':
728 return today - datetime.timedelta(days=1)
729 match = re.match('(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
730 if match is not None:
731 sign = match.group('sign')
732 time = int(match.group('time'))
735 unit = match.group('unit')
736 # A bad approximation?
744 delta = datetime.timedelta(**{unit: time})
746 return datetime.datetime.strptime(date_str, "%Y%m%d").date()
749 def hyphenate_date(date_str):
751 Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
752 match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
753 if match is not None:
754 return '-'.join(match.groups())
759 class DateRange(object):
760 """Represents a time interval between two dates"""
762 def __init__(self, start=None, end=None):
763 """start and end must be strings in the format accepted by date"""
764 if start is not None:
765 self.start = date_from_str(start)
767 self.start = datetime.datetime.min.date()
769 self.end = date_from_str(end)
771 self.end = datetime.datetime.max.date()
772 if self.start > self.end:
773 raise ValueError('Date range: "%s" , the start date must be before the end date' % self)
777 """Returns a range that only contains the given day"""
780 def __contains__(self, date):
781 """Check if the date is in the range"""
782 if not isinstance(date, datetime.date):
783 date = date_from_str(date)
784 return self.start <= date <= self.end
787 return '%s - %s' % (self.start.isoformat(), self.end.isoformat())
791 """ Returns the platform name as a compat_str """
792 res = platform.platform()
793 if isinstance(res, bytes):
794 res = res.decode(preferredencoding())
796 assert isinstance(res, compat_str)
800 def _windows_write_string(s, out):
801 """ Returns True if the string was written using special methods,
802 False if it has yet to be written out."""
803 # Adapted from http://stackoverflow.com/a/3259271/35070
806 import ctypes.wintypes
814 fileno = out.fileno()
815 except AttributeError:
816 # If the output stream doesn't have a fileno, it's virtual
818 if fileno not in WIN_OUTPUT_IDS:
821 GetStdHandle = compat_WINFUNCTYPE(
822 ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
823 ("GetStdHandle", ctypes.windll.kernel32))
824 h = GetStdHandle(WIN_OUTPUT_IDS[fileno])
826 WriteConsoleW = compat_WINFUNCTYPE(
827 ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
828 ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
829 ctypes.wintypes.LPVOID)(("WriteConsoleW", ctypes.windll.kernel32))
830 written = ctypes.wintypes.DWORD(0)
832 GetFileType = compat_WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)(("GetFileType", ctypes.windll.kernel32))
833 FILE_TYPE_CHAR = 0x0002
834 FILE_TYPE_REMOTE = 0x8000
835 GetConsoleMode = compat_WINFUNCTYPE(
836 ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
837 ctypes.POINTER(ctypes.wintypes.DWORD))(
838 ("GetConsoleMode", ctypes.windll.kernel32))
839 INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value
841 def not_a_console(handle):
842 if handle == INVALID_HANDLE_VALUE or handle is None:
844 return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR
845 or GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)
850 def next_nonbmp_pos(s):
852 return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
853 except StopIteration:
857 count = min(next_nonbmp_pos(s), 1024)
860 h, s, count if count else 2, ctypes.byref(written), None)
862 raise OSError('Failed to write string')
863 if not count: # We just wrote a non-BMP character
864 assert written.value == 2
867 assert written.value > 0
868 s = s[written.value:]
872 def write_string(s, out=None, encoding=None):
875 assert type(s) == compat_str
877 if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
878 if _windows_write_string(s, out):
881 if ('b' in getattr(out, 'mode', '') or
882 sys.version_info[0] < 3): # Python 2 lies about mode of sys.stderr
883 byt = s.encode(encoding or preferredencoding(), 'ignore')
885 elif hasattr(out, 'buffer'):
886 enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
887 byt = s.encode(enc, 'ignore')
888 out.buffer.write(byt)
894 def bytes_to_intlist(bs):
897 if isinstance(bs[0], int): # Python 3
900 return [ord(c) for c in bs]
903 def intlist_to_bytes(xs):
906 return struct_pack('%dB' % len(xs), *xs)
909 # Cross-platform file locking
910 if sys.platform == 'win32':
911 import ctypes.wintypes
914 class OVERLAPPED(ctypes.Structure):
916 ('Internal', ctypes.wintypes.LPVOID),
917 ('InternalHigh', ctypes.wintypes.LPVOID),
918 ('Offset', ctypes.wintypes.DWORD),
919 ('OffsetHigh', ctypes.wintypes.DWORD),
920 ('hEvent', ctypes.wintypes.HANDLE),
923 kernel32 = ctypes.windll.kernel32
924 LockFileEx = kernel32.LockFileEx
925 LockFileEx.argtypes = [
926 ctypes.wintypes.HANDLE, # hFile
927 ctypes.wintypes.DWORD, # dwFlags
928 ctypes.wintypes.DWORD, # dwReserved
929 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
930 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
931 ctypes.POINTER(OVERLAPPED) # Overlapped
933 LockFileEx.restype = ctypes.wintypes.BOOL
934 UnlockFileEx = kernel32.UnlockFileEx
935 UnlockFileEx.argtypes = [
936 ctypes.wintypes.HANDLE, # hFile
937 ctypes.wintypes.DWORD, # dwReserved
938 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
939 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
940 ctypes.POINTER(OVERLAPPED) # Overlapped
942 UnlockFileEx.restype = ctypes.wintypes.BOOL
943 whole_low = 0xffffffff
944 whole_high = 0x7fffffff
946 def _lock_file(f, exclusive):
947 overlapped = OVERLAPPED()
948 overlapped.Offset = 0
949 overlapped.OffsetHigh = 0
950 overlapped.hEvent = 0
951 f._lock_file_overlapped_p = ctypes.pointer(overlapped)
952 handle = msvcrt.get_osfhandle(f.fileno())
953 if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
954 whole_low, whole_high, f._lock_file_overlapped_p):
955 raise OSError('Locking file failed: %r' % ctypes.FormatError())
958 assert f._lock_file_overlapped_p
959 handle = msvcrt.get_osfhandle(f.fileno())
960 if not UnlockFileEx(handle, 0,
961 whole_low, whole_high, f._lock_file_overlapped_p):
962 raise OSError('Unlocking file failed: %r' % ctypes.FormatError())
967 def _lock_file(f, exclusive):
968 fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)
971 fcntl.flock(f, fcntl.LOCK_UN)
974 class locked_file(object):
975 def __init__(self, filename, mode, encoding=None):
976 assert mode in ['r', 'a', 'w']
977 self.f = io.open(filename, mode, encoding=encoding)
981 exclusive = self.mode != 'r'
983 _lock_file(self.f, exclusive)
989 def __exit__(self, etype, value, traceback):
998 def write(self, *args):
999 return self.f.write(*args)
1001 def read(self, *args):
1002 return self.f.read(*args)
def get_filesystem_encoding():
    """Return sys.getfilesystemencoding(), or 'utf-8' when that is None."""
    fs_encoding = sys.getfilesystemencoding()
    if fs_encoding is None:
        return 'utf-8'
    return fs_encoding
1010 def shell_quote(args):
1012 encoding = get_filesystem_encoding()
1014 if isinstance(a, bytes):
1015 # We may get a filename encoded with 'encodeFilename'
1016 a = a.decode(encoding)
1017 quoted_args.append(pipes.quote(a))
1018 return ' '.join(quoted_args)
1021 def takewhile_inclusive(pred, seq):
1022 """ Like itertools.takewhile, but include the latest evaluated element
1023 (the first element so that Not pred(e)) """
def smuggle_url(url, data):
    """Attach extra data to a URL for internal use.

    *data* is serialised as JSON, url-encoded under the
    '__youtubedl_smuggle' key and appended to *url* after a '#'.
    """
    payload = {'__youtubedl_smuggle': json.dumps(data)}
    encoded_payload = compat_urllib_parse.urlencode(payload)
    return url + '#' + encoded_payload
1038 def unsmuggle_url(smug_url, default=None):
1039 if '#__youtubedl_smuggle' not in smug_url:
1040 return smug_url, default
1041 url, _, sdata = smug_url.rpartition('#')
1042 jsond = compat_parse_qs(sdata)['__youtubedl_smuggle'][0]
1043 data = json.loads(jsond)
1047 def format_bytes(bytes):
1050 if type(bytes) is str:
1051 bytes = float(bytes)
1055 exponent = int(math.log(bytes, 1024.0))
1056 suffix = ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB'][exponent]
1057 converted = float(bytes) / float(1024 ** exponent)
1058 return '%.2f%s' % (converted, suffix)
1061 def parse_filesize(s):
1065 # The lower-case forms are of course incorrect and unofficial,
1066 # but we support those too
1104 units_re = '|'.join(re.escape(u) for u in _UNIT_TABLE)
1106 r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)' % units_re, s)
1110 num_str = m.group('num').replace(',', '.')
1111 mult = _UNIT_TABLE[m.group('unit')]
1112 return int(float(num_str) * mult)
1115 def get_term_width():
1116 columns = compat_getenv('COLUMNS', None)
1121 sp = subprocess.Popen(
1123 stdout=subprocess.PIPE, stderr=subprocess.PIPE)
1124 out, err = sp.communicate()
1125 return int(out.split()[1])
1131 def month_by_name(name):
1132 """ Return the number of a month by (locale-independently) English name """
1135 'January', 'February', 'March', 'April', 'May', 'June',
1136 'July', 'August', 'September', 'October', 'November', 'December']
1138 return ENGLISH_NAMES.index(name) + 1
1143 def fix_xml_ampersands(xml_str):
1144 """Replace all the '&' by '&amp;' in XML"""
1146 r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
1151 def setproctitle(title):
1152 assert isinstance(title, compat_str)
1154 libc = ctypes.cdll.LoadLibrary("libc.so.6")
1157 title_bytes = title.encode('utf-8')
1158 buf = ctypes.create_string_buffer(len(title_bytes))
1159 buf.value = title_bytes
1161 libc.prctl(15, buf, 0, 0, 0)
1162 except AttributeError:
1163 return # Strange libc, just skip this
1166 def remove_start(s, start):
1167 if s.startswith(start):
1168 return s[len(start):]
1172 def remove_end(s, end):
1174 return s[:-len(end)]
def url_basename(url):
    """Return the last '/'-separated segment of the path component of *url*.

    Leading and trailing slashes of the path are ignored.
    """
    url_path = compat_urlparse.urlparse(url).path
    segments = url_path.strip('/').split('/')
    return segments[-1]
1183 class HEADRequest(compat_urllib_request.Request):
1184 def get_method(self):
1188 def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
1191 v = getattr(v, get_attr, None)
1194 return default if v is None else (int(v) * invscale // scale)
def str_or_none(v, default=None):
    """Return compat_str(v), or *default* when *v* is None."""
    if v is None:
        return default
    return compat_str(v)
1201 def str_to_int(int_str):
1202 """ A more relaxed version of int_or_none """
1205 int_str = re.sub(r'[,\.\+]', '', int_str)
def float_or_none(v, scale=1, invscale=1, default=None):
    """Convert *v* to a float, scaled by ``invscale / scale``.

    Returns *default* when *v* is None or cannot be converted to a float
    (for example a string such as '1.2.3' or a non-numeric object), so
    that callers feeding it loosely validated text do not crash.
    """
    if v is None:
        return default
    try:
        return float(v) * invscale / scale
    except (TypeError, ValueError):
        # Unparsable input is treated like a missing value.
        return default
1213 def parse_duration(s):
1222 (?P<only_mins>[0-9.]+)\s*(?:mins?|minutes?)\s*|
1223 (?P<only_hours>[0-9.]+)\s*(?:hours?)|
1226 (?:(?P<hours>[0-9]+)\s*(?:[:h]|hours?)\s*)?
1227 (?P<mins>[0-9]+)\s*(?:[:m]|mins?|minutes?)\s*
1229 (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*(?:s|secs?|seconds?)?
1234 if m.group('only_mins'):
1235 return float_or_none(m.group('only_mins'), invscale=60)
1236 if m.group('only_hours'):
1237 return float_or_none(m.group('only_hours'), invscale=60 * 60)
1239 res += int(m.group('secs'))
1241 res += int(m.group('mins')) * 60
1242 if m.group('hours'):
1243 res += int(m.group('hours')) * 60 * 60
1245 res += float(m.group('ms'))
def prepend_extension(filename, ext):
    """Insert *ext* in front of the existing extension of *filename*.

    The file name is split with os.path.splitext and rebuilt as
    ``<root>.<ext><original extension>``.
    """
    pieces = os.path.splitext(filename)
    root, original_ext = pieces
    return '{0}.{1}{2}'.format(root, ext, original_ext)
1254 def check_executable(exe, args=[]):
1255 """ Checks if the given binary is installed somewhere in PATH, and returns its name.
1256 args can be a list of arguments for a short output (like -version) """
1258 subprocess.Popen([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
1264 def get_exe_version(exe, args=['--version'],
1265 version_re=r'version\s+([0-9._-a-zA-Z]+)',
1266 unrecognized='present'):
1267 """ Returns the version of the specified executable,
1268 or False if the executable is not present """
1270 out, err = subprocess.Popen(
1272 stdout=subprocess.PIPE, stderr=subprocess.STDOUT).communicate()
1275 firstline = out.partition(b'\n')[0].decode('ascii', 'ignore')
1276 m = re.search(version_re, firstline)
1283 class PagedList(object):
1285 # This is only useful for tests
1286 return len(self.getslice())
1289 class OnDemandPagedList(PagedList):
1290 def __init__(self, pagefunc, pagesize):
1291 self._pagefunc = pagefunc
1292 self._pagesize = pagesize
1294 def getslice(self, start=0, end=None):
1296 for pagenum in itertools.count(start // self._pagesize):
1297 firstid = pagenum * self._pagesize
1298 nextfirstid = pagenum * self._pagesize + self._pagesize
1299 if start >= nextfirstid:
1302 page_results = list(self._pagefunc(pagenum))
1305 start % self._pagesize
1306 if firstid <= start < nextfirstid
1310 ((end - 1) % self._pagesize) + 1
1311 if (end is not None and firstid <= end <= nextfirstid)
1314 if startv != 0 or endv is not None:
1315 page_results = page_results[startv:endv]
1316 res.extend(page_results)
1318 # A little optimization - if current page is not "full", ie. does
1319 # not contain page_size videos then we can assume that this page
1320 # is the last one - there are no more ids on further pages -
1321 # i.e. no need to query again.
1322 if len(page_results) + startv < self._pagesize:
1325 # If we got the whole page, but the next page is not interesting,
1326 # break out early as well
1327 if end == nextfirstid:
1332 class InAdvancePagedList(PagedList):
1333 def __init__(self, pagefunc, pagecount, pagesize):
1334 self._pagefunc = pagefunc
1335 self._pagecount = pagecount
1336 self._pagesize = pagesize
1338 def getslice(self, start=0, end=None):
1340 start_page = start // self._pagesize
1342 self._pagecount if end is None else (end // self._pagesize + 1))
1343 skip_elems = start - start_page * self._pagesize
1344 only_more = None if end is None else end - start
1345 for pagenum in range(start_page, end_page):
1346 page = list(self._pagefunc(pagenum))
1348 page = page[skip_elems:]
1350 if only_more is not None:
1351 if len(page) < only_more:
1352 only_more -= len(page)
1354 page = page[:only_more]
1361 def uppercase_escape(s):
1362 unicode_escape = codecs.getdecoder('unicode_escape')
1364 r'\\U[0-9a-fA-F]{8}',
1365 lambda m: unicode_escape(m.group(0))[0],
def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986.

    On Python 2 a unicode string is first encoded as UTF-8; the value is
    then passed to compat_urllib_parse.quote with the RFC 3986 reserved
    characters (and '%') given as the safe set.
    """
    running_py2 = sys.version_info < (3, 0)
    if running_py2 and isinstance(s, unicode):
        s = s.encode('utf-8')
    safe_chars = b"%/;:@&=+$,!~*'()?#[]"
    return compat_urllib_parse.quote(s, safe_chars)
1376 def escape_url(url):
1377 """Escape URL as suggested by RFC 3986"""
1378 url_parsed = compat_urllib_parse_urlparse(url)
1379 return url_parsed._replace(
1380 path=escape_rfc3986(url_parsed.path),
1381 params=escape_rfc3986(url_parsed.params),
1382 query=escape_rfc3986(url_parsed.query),
1383 fragment=escape_rfc3986(url_parsed.fragment)
1387 struct.pack('!I', 0)
1389 # In Python 2.6 (and some 2.7 versions), struct requires a bytes argument
1390 def struct_pack(spec, *args):
1391 if isinstance(spec, compat_str):
1392 spec = spec.encode('ascii')
1393 return struct.pack(spec, *args)
1395 def struct_unpack(spec, *args):
1396 if isinstance(spec, compat_str):
1397 spec = spec.encode('ascii')
1398 return struct.unpack(spec, *args)
1400 struct_pack = struct.pack
1401 struct_unpack = struct.unpack
1404 def read_batch_urls(batch_fd):
1406 if not isinstance(url, compat_str):
1407 url = url.decode('utf-8', 'replace')
1408 BOM_UTF8 = '\xef\xbb\xbf'
1409 if url.startswith(BOM_UTF8):
1410 url = url[len(BOM_UTF8):]
1412 if url.startswith(('#', ';', ']')):
1416 with contextlib.closing(batch_fd) as fd:
1417 return [url for url in map(fixup, fd) if url]
def urlencode_postdata(*args, **kargs):
    """Url-encode the given data and return it as ASCII-encoded bytes.

    All arguments are forwarded unchanged to compat_urllib_parse.urlencode.
    """
    encoded_query = compat_urllib_parse.urlencode(*args, **kargs)
    return encoded_query.encode('ascii')
1425 etree_iter = xml.etree.ElementTree.Element.iter
1426 except AttributeError: # Python <=2.6
1427 etree_iter = lambda n: n.findall('.//*')
1431 class TreeBuilder(xml.etree.ElementTree.TreeBuilder):
1432 def doctype(self, name, pubid, system):
1433 pass # Ignore doctypes
1435 parser = xml.etree.ElementTree.XMLParser(target=TreeBuilder())
1436 kwargs = {'parser': parser} if sys.version_info >= (2, 7) else {}
1437 tree = xml.etree.ElementTree.XML(s.encode('utf-8'), **kwargs)
1438 # Fix up XML parser in Python 2.x
1439 if sys.version_info < (3, 0):
1440 for n in etree_iter(tree):
1441 if n.text is not None:
1442 if not isinstance(n.text, compat_str):
1443 n.text = n.text.decode('utf-8')
1456 def parse_age_limit(s):
1459 m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
1460 return int(m.group('age')) if m else US_RATINGS.get(s, None)
1463 def strip_jsonp(code):
1465 r'(?s)^[a-zA-Z0-9_]+\s*\(\s*(.*)\);?\s*?(?://[^\n]*)*$', r'\1', code)
1468 def js_to_json(code):
1471 if v in ('true', 'false', 'null'):
1473 if v.startswith('"'):
1475 if v.startswith("'"):
1477 v = re.sub(r"\\\\|\\'|\"", lambda m: {
1484 res = re.sub(r'''(?x)
1485 "(?:[^"\\]*(?:\\\\|\\")?)*"|
1486 '(?:[^'\\]*(?:\\\\|\\')?)*'|
1487 [a-zA-Z_][a-zA-Z_0-9]*
1489 res = re.sub(r',(\s*\])', lambda m: m.group(1), res)
1493 def qualities(quality_ids):
1494 """ Get a numeric quality value out of a list of possible values """
1497 return quality_ids.index(qid)
1503 DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
1506 def limit_length(s, length):
1507 """ Add ellipses to overly long strings """
1512 return s[:length - len(ELLIPSES)] + ELLIPSES
def version_tuple(v):
    """Split a version string on '.' and '-' and return its parts as ints.

    Raises ValueError if any component is not an integer.
    """
    components = re.split(r'[-.]', v)
    return tuple(map(int, components))
1520 def is_outdated_version(version, limit, assume_new=True):
1522 return not assume_new
1524 return version_tuple(version) < version_tuple(limit)
1526 return not assume_new
def ytdl_is_updateable():
    """Return whether youtube-dl can be updated with -U.

    True when this module was loaded through a zipimporter or when the
    interpreter has a ``sys.frozen`` attribute.
    """
    from zipimport import zipimporter

    module_loader = globals().get('__loader__')
    if isinstance(module_loader, zipimporter):
        return True
    return hasattr(sys, 'frozen')
def args_to_str(args):
    """Get a short string representation for a subprocess command.

    Each argument is quoted with shlex_quote and the results are joined
    with single spaces.
    """
    quoted_args = [shlex_quote(arg) for arg in args]
    return ' '.join(quoted_args)