2 # -*- coding: utf-8 -*-
4 from __future__ import unicode_literals
30 import xml.etree.ElementTree
41 compat_urllib_parse_urlparse,
42 compat_urllib_request,
48 # This is not clearly defined otherwise
49 compiled_regex_type = type(re.compile(''))
52 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0 (Chrome)',
53 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
54 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
55 'Accept-Encoding': 'gzip, deflate',
56 'Accept-Language': 'en-us,en;q=0.5',
60 def preferredencoding():
61 """Get preferred encoding.
63 Returns the best encoding scheme for the system, based on
64 locale.getpreferredencoding() and some further tweaks.
67 pref = locale.getpreferredencoding()
75 def write_json_file(obj, fn):
76 """ Encode obj as JSON and write it to fn, atomically if possible """
78 fn = encodeFilename(fn)
79 if sys.version_info < (3, 0) and sys.platform != 'win32':
80 encoding = get_filesystem_encoding()
81 # os.path.basename returns a bytes object, but NamedTemporaryFile
82 # will fail if the filename contains non ascii characters unless we
83 # use a unicode object
84 path_basename = lambda f: os.path.basename(fn).decode(encoding)
85 # the same for os.path.dirname
86 path_dirname = lambda f: os.path.dirname(fn).decode(encoding)
88 path_basename = os.path.basename
89 path_dirname = os.path.dirname
93 'prefix': path_basename(fn) + '.',
94 'dir': path_dirname(fn),
98 # In Python 2.x, json.dump expects a bytestream.
99 # In Python 3.x, it writes to a character stream
100 if sys.version_info < (3, 0):
108 tf = tempfile.NamedTemporaryFile(**args)
113 if sys.platform == 'win32':
114 # Need to remove existing file on Windows, else os.rename raises
115 # WindowsError or FileExistsError.
120 os.rename(tf.name, fn)
129 if sys.version_info >= (2, 7):
def find_xpath_attr(node, xpath, key, val):
    """ Find the first element matching xpath[@key=val] below node.

    key and val are checked against conservative character whitelists
    before they are interpolated into the XPath expression.
    """
    key_ok = re.match(r'^[a-zA-Z-]+$', key)
    assert key_ok
    val_ok = re.match(r'^[a-zA-Z0-9@\s:._-]*$', val)
    assert val_ok
    attr_predicate = "[@%s='%s']" % (key, val)
    return node.find(xpath + attr_predicate)
137 def find_xpath_attr(node, xpath, key, val):
138 # Here comes the crazy part: In 2.6, if the xpath is a unicode,
139 # .//node does not match if a node is a direct child of . !
140 if isinstance(xpath, unicode):
141 xpath = xpath.encode('ascii')
143 for f in node.findall(xpath):
144 if f.attrib.get(key) == val:
148 # On python2.6 the xml.etree.ElementTree.Element methods don't support
149 # the namespace parameter
152 def xpath_with_ns(path, ns_map):
153 components = [c.split(':') for c in path.split('/')]
157 replaced.append(c[0])
160 replaced.append('{%s}%s' % (ns_map[ns], tag))
161 return '/'.join(replaced)
164 def xpath_text(node, xpath, name=None, fatal=False):
165 if sys.version_info < (2, 7): # Crazy 2.6
166 xpath = xpath.encode('ascii')
169 if n is None or n.text is None:
171 name = xpath if name is None else name
172 raise ExtractorError('Could not find XML element %s' % name)
def get_element_by_id(id, html):
    """Return the content of the tag whose "id" attribute equals id in the passed HTML document.

    Thin convenience wrapper around get_element_by_attribute.
    """
    return get_element_by_attribute(attribute="id", value=id, html=html)
183 def get_element_by_attribute(attribute, value, html):
184 """Return the content of the tag with the specified attribute in the passed HTML document"""
186 m = re.search(r'''(?xs)
188 (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
190 (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
194 ''' % (re.escape(attribute), re.escape(value)), html)
198 res = m.group('content')
200 if res.startswith('"') or res.startswith("'"):
203 return unescapeHTML(res)
206 def clean_html(html):
207 """Clean an HTML snippet into a readable string"""
209 html = html.replace('\n', ' ')
210 html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
211 html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
213 html = re.sub('<.*?>', '', html)
214 # Replace html entities
215 html = unescapeHTML(html)
219 def sanitize_open(filename, open_mode):
220 """Try to open the given filename, and slightly tweak it if this fails.
222 Attempts to open the given filename. If this fails, it tries to change
223 the filename slightly, step by step, until it's either able to open it
224 or it fails and raises a final exception, like the standard open()
227 It returns the tuple (stream, definitive_file_name).
231 if sys.platform == 'win32':
233 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
234 return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
235 stream = open(encodeFilename(filename), open_mode)
236 return (stream, filename)
237 except (IOError, OSError) as err:
238 if err.errno in (errno.EACCES,):
241 # In case of error, try to remove win32 forbidden chars
242 alt_filename = os.path.join(
243 re.sub('[/<>:"\\|\\\\?\\*]', '#', path_part)
244 for path_part in os.path.split(filename)
246 if alt_filename == filename:
249 # An exception here should be caught in the caller
250 stream = open(encodeFilename(filename), open_mode)
251 return (stream, alt_filename)
254 def timeconvert(timestr):
255 """Convert RFC 2822 defined time string into system timestamp"""
257 timetuple = email.utils.parsedate_tz(timestr)
258 if timetuple is not None:
259 timestamp = email.utils.mktime_tz(timetuple)
263 def sanitize_filename(s, restricted=False, is_id=False):
264 """Sanitizes a string so it could be used as part of a filename.
265 If restricted is set, use a stricter subset of allowed characters.
266 Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
268 def replace_insane(char):
269 if char == '?' or ord(char) < 32 or ord(char) == 127:
272 return '' if restricted else '\''
274 return '_-' if restricted else ' -'
275 elif char in '\\/|*<>':
277 if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
279 if restricted and ord(char) > 127:
283 result = ''.join(map(replace_insane, s))
285 while '__' in result:
286 result = result.replace('__', '_')
287 result = result.strip('_')
288 # Common case of "Foreign band name - English song title"
289 if restricted and result.startswith('-_'):
296 def orderedSet(iterable):
297 """ Remove all duplicates from the input iterable """
305 def _htmlentity_transform(entity):
306 """Transforms an HTML entity to a character."""
307 # Known non-numeric HTML entity
308 if entity in compat_html_entities.name2codepoint:
309 return compat_chr(compat_html_entities.name2codepoint[entity])
311 mobj = re.match(r'#(x?[0-9]+)', entity)
313 numstr = mobj.group(1)
314 if numstr.startswith('x'):
316 numstr = '0%s' % numstr
319 return compat_chr(int(numstr, base))
321 # Unknown entity in name, return its literal representation
322 return ('&%s;' % entity)
328 assert type(s) == compat_str
331 r'&([^;]+);', lambda m: _htmlentity_transform(m.group(1)), s)
334 def encodeFilename(s, for_subprocess=False):
336 @param s The name of the file
339 assert type(s) == compat_str
341 # Python 3 has a Unicode API
342 if sys.version_info >= (3, 0):
345 if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
346 # Pass '' directly to use Unicode APIs on Windows 2000 and up
347 # (Detecting Windows NT 4 is tricky because 'major >= 4' would
348 # match Windows 9x series as well. Besides, NT 4 is obsolete.)
349 if not for_subprocess:
352 # For subprocess calls, encode with locale encoding
353 # Refer to http://stackoverflow.com/a/9951851/35070
354 encoding = preferredencoding()
356 encoding = sys.getfilesystemencoding()
359 return s.encode(encoding, 'ignore')
def encodeArgument(s):
    """Encode a command argument by delegating to encodeFilename with for_subprocess=True.

    Byte strings are still tolerated: they are decoded as ASCII first.
    """
    if isinstance(s, compat_str):
        return encodeFilename(s, True)
    # Legacy code that uses byte strings
    # Uncomment the following line after fixing all post processors
    # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
    decoded = s.decode('ascii')
    return encodeFilename(decoded, True)
371 def decodeOption(optval):
374 if isinstance(optval, bytes):
375 optval = optval.decode(preferredencoding())
377 assert isinstance(optval, compat_str)
381 def formatSeconds(secs):
383 return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
385 return '%d:%02d' % (secs // 60, secs % 60)
390 def make_HTTPS_handler(opts_no_check_certificate, **kwargs):
391 if hasattr(ssl, 'create_default_context'): # Python >= 3.4 or 2.7.9
392 context = ssl.create_default_context(ssl.Purpose.CLIENT_AUTH)
393 context.options &= ~ssl.OP_NO_SSLv3 # Allow older, not-as-secure SSLv3
394 if opts_no_check_certificate:
395 context.verify_mode = ssl.CERT_NONE
396 return compat_urllib_request.HTTPSHandler(context=context, **kwargs)
397 elif sys.version_info < (3, 2):
400 class HTTPSConnectionV3(httplib.HTTPSConnection):
401 def __init__(self, *args, **kwargs):
402 httplib.HTTPSConnection.__init__(self, *args, **kwargs)
405 sock = socket.create_connection((self.host, self.port), self.timeout)
406 if getattr(self, '_tunnel_host', False):
410 self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_TLSv1)
412 self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_SSLv23)
414 class HTTPSHandlerV3(compat_urllib_request.HTTPSHandler):
415 def https_open(self, req):
416 return self.do_open(HTTPSConnectionV3, req)
417 return HTTPSHandlerV3(**kwargs)
419 context = ssl.SSLContext(ssl.PROTOCOL_SSLv23)
420 context.verify_mode = (ssl.CERT_NONE
421 if opts_no_check_certificate
422 else ssl.CERT_REQUIRED)
423 context.set_default_verify_paths()
424 return compat_urllib_request.HTTPSHandler(context=context, **kwargs)
427 class ExtractorError(Exception):
428 """Error during info extraction."""
430 def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
431 """ tb, if given, is the original traceback (so that it can be printed out).
432 If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
435 if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
437 if video_id is not None:
438 msg = video_id + ': ' + msg
440 msg += ' (caused by %r)' % cause
442 if ytdl_is_updateable():
443 update_cmd = 'type youtube-dl -U to update'
445 update_cmd = 'see https://yt-dl.org/update on how to update'
446 msg += '; please report this issue on https://yt-dl.org/bug .'
447 msg += ' Make sure you are using the latest version; %s.' % update_cmd
448 msg += ' Be sure to call youtube-dl with the --verbose flag and include its complete output.'
449 super(ExtractorError, self).__init__(msg)
452 self.exc_info = sys.exc_info() # preserve original exception
454 self.video_id = video_id
456 def format_traceback(self):
457 if self.traceback is None:
459 return ''.join(traceback.format_tb(self.traceback))
462 class RegexNotFoundError(ExtractorError):
463 """Error when a regex didn't match"""
467 class DownloadError(Exception):
468 """Download Error exception.
470 This exception may be thrown by FileDownloader objects if they are not
471 configured to continue on errors. They will contain the appropriate
475 def __init__(self, msg, exc_info=None):
476 """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
477 super(DownloadError, self).__init__(msg)
478 self.exc_info = exc_info
481 class SameFileError(Exception):
482 """Same File exception.
484 This exception will be thrown by FileDownloader objects if they detect
485 multiple files would have to be downloaded to the same file on disk.
490 class PostProcessingError(Exception):
491 """Post Processing exception.
493 This exception may be raised by PostProcessor's .run() method to
494 indicate an error in the postprocessing task.
497 def __init__(self, msg):
501 class MaxDownloadsReached(Exception):
502 """ --max-downloads limit has been reached. """
506 class UnavailableVideoError(Exception):
507 """Unavailable Format exception.
509 This exception will be thrown when a video is requested
510 in a format that is not available for that video.
515 class ContentTooShortError(Exception):
516 """Content Too Short exception.
518 This exception may be raised by FileDownloader objects when a file they
519 download is too small for what the server announced first, indicating
520 the connection was probably interrupted.
526 def __init__(self, downloaded, expected):
527 self.downloaded = downloaded
528 self.expected = expected
531 class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
532 """Handler for HTTP requests and responses.
534 This class, when installed with an OpenerDirector, automatically adds
535 the standard headers to every HTTP request and handles gzipped and
536 deflated responses from web servers. If compression is to be avoided in
537 a particular request, the original request in the program code only has
538 to include the HTTP header "Youtubedl-No-Compression", which will be
539 removed before making the real request.
541 Part of this code was copied from:
543 http://techknack.net/python-urllib2-handlers/
545 Andrew Rowls, the author of that code, agreed to release it to the
552 return zlib.decompress(data, -zlib.MAX_WBITS)
554 return zlib.decompress(data)
557 def addinfourl_wrapper(stream, headers, url, code):
558 if hasattr(compat_urllib_request.addinfourl, 'getcode'):
559 return compat_urllib_request.addinfourl(stream, headers, url, code)
560 ret = compat_urllib_request.addinfourl(stream, headers, url)
564 def http_request(self, req):
565 for h, v in std_headers.items():
566 if h not in req.headers:
568 if 'Youtubedl-no-compression' in req.headers:
569 if 'Accept-encoding' in req.headers:
570 del req.headers['Accept-encoding']
571 del req.headers['Youtubedl-no-compression']
572 if 'Youtubedl-user-agent' in req.headers:
573 if 'User-agent' in req.headers:
574 del req.headers['User-agent']
575 req.headers['User-agent'] = req.headers['Youtubedl-user-agent']
576 del req.headers['Youtubedl-user-agent']
578 if sys.version_info < (2, 7) and '#' in req.get_full_url():
579 # Python 2.6 is brain-dead when it comes to fragments
580 req._Request__original = req._Request__original.partition('#')[0]
581 req._Request__r_type = req._Request__r_type.partition('#')[0]
585 def http_response(self, req, resp):
588 if resp.headers.get('Content-encoding', '') == 'gzip':
589 content = resp.read()
590 gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
592 uncompressed = io.BytesIO(gz.read())
593 except IOError as original_ioerror:
594 # There may be junk at the end of the file
595 # See http://stackoverflow.com/q/4928560/35070 for details
596 for i in range(1, 1024):
598 gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
599 uncompressed = io.BytesIO(gz.read())
604 raise original_ioerror
605 resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
606 resp.msg = old_resp.msg
608 if resp.headers.get('Content-encoding', '') == 'deflate':
609 gz = io.BytesIO(self.deflate(resp.read()))
610 resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
611 resp.msg = old_resp.msg
614 https_request = http_request
615 https_response = http_response
618 def parse_iso8601(date_str, delimiter='T'):
619 """ Return a UNIX timestamp from the given date """
625 r'(\.[0-9]+)?(?:Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
628 timezone = datetime.timedelta()
630 date_str = date_str[:-len(m.group(0))]
631 if not m.group('sign'):
632 timezone = datetime.timedelta()
634 sign = 1 if m.group('sign') == '+' else -1
635 timezone = datetime.timedelta(
636 hours=sign * int(m.group('hours')),
637 minutes=sign * int(m.group('minutes')))
638 date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
639 dt = datetime.datetime.strptime(date_str, date_format) - timezone
640 return calendar.timegm(dt.timetuple())
643 def unified_strdate(date_str, day_first=True):
644 """Return a string with the date in the format YYYYMMDD"""
650 date_str = date_str.replace(',', ' ')
651 # %z (UTC offset) is only supported in python>=3.2
652 date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str)
653 # Remove AM/PM + timezone
654 date_str = re.sub(r'(?i)\s*(?:AM|PM)\s+[A-Z]+', '', date_str)
656 format_expressions = [
661 '%b %dst %Y %I:%M%p',
662 '%b %dnd %Y %I:%M%p',
663 '%b %dth %Y %I:%M%p',
671 '%Y-%m-%d %H:%M:%S.%f',
674 '%Y-%m-%dT%H:%M:%SZ',
675 '%Y-%m-%dT%H:%M:%S.%fZ',
676 '%Y-%m-%dT%H:%M:%S.%f0Z',
678 '%Y-%m-%dT%H:%M:%S.%f',
682 format_expressions.extend([
686 format_expressions.extend([
689 for expression in format_expressions:
691 upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
694 if upload_date is None:
695 timetuple = email.utils.parsedate_tz(date_str)
697 upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
701 def determine_ext(url, default_ext='unknown_video'):
704 guess = url.partition('?')[0].rpartition('.')[2]
705 if re.match(r'^[A-Za-z0-9]+$', guess):
def subtitles_filename(filename, sub_lang, sub_format):
    """Return the subtitle file name for filename: <stem>.<sub_lang>.<sub_format>.

    The stem is filename without its extension. os.path.splitext is used
    so that only an extension on the last path component is removed; a
    dot inside a directory name (e.g. 'dir.v1/video') no longer truncates
    the path when the file itself has no extension.
    """
    stem = os.path.splitext(filename)[0]
    return stem + '.' + sub_lang + '.' + sub_format
715 def date_from_str(date_str):
717 Return a datetime object from a string in the format YYYYMMDD or
718 (now|today)[+-][0-9](day|week|month|year)(s)?"""
719 today = datetime.date.today()
720 if date_str in ('now', 'today'):
722 if date_str == 'yesterday':
723 return today - datetime.timedelta(days=1)
724 match = re.match('(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
725 if match is not None:
726 sign = match.group('sign')
727 time = int(match.group('time'))
730 unit = match.group('unit')
731 # A bad approximation?
739 delta = datetime.timedelta(**{unit: time})
741 return datetime.datetime.strptime(date_str, "%Y%m%d").date()
744 def hyphenate_date(date_str):
746 Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
747 match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
748 if match is not None:
749 return '-'.join(match.groups())
754 class DateRange(object):
755 """Represents a time interval between two dates"""
757 def __init__(self, start=None, end=None):
758 """start and end must be strings in the format accepted by date"""
759 if start is not None:
760 self.start = date_from_str(start)
762 self.start = datetime.datetime.min.date()
764 self.end = date_from_str(end)
766 self.end = datetime.datetime.max.date()
767 if self.start > self.end:
768 raise ValueError('Date range: "%s" , the start date must be before the end date' % self)
772 """Returns a range that only contains the given day"""
775 def __contains__(self, date):
776 """Check if the date is in the range"""
777 if not isinstance(date, datetime.date):
778 date = date_from_str(date)
779 return self.start <= date <= self.end
782 return '%s - %s' % (self.start.isoformat(), self.end.isoformat())
786 """ Returns the platform name as a compat_str """
787 res = platform.platform()
788 if isinstance(res, bytes):
789 res = res.decode(preferredencoding())
791 assert isinstance(res, compat_str)
795 def _windows_write_string(s, out):
796 """ Returns True if the string was written using special methods,
797 False if it has yet to be written out."""
798 # Adapted from http://stackoverflow.com/a/3259271/35070
801 import ctypes.wintypes
809 fileno = out.fileno()
810 except AttributeError:
811 # If the output stream doesn't have a fileno, it's virtual
813 if fileno not in WIN_OUTPUT_IDS:
816 GetStdHandle = ctypes.WINFUNCTYPE(
817 ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
818 (b"GetStdHandle", ctypes.windll.kernel32))
819 h = GetStdHandle(WIN_OUTPUT_IDS[fileno])
821 WriteConsoleW = ctypes.WINFUNCTYPE(
822 ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
823 ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
824 ctypes.wintypes.LPVOID)((b"WriteConsoleW", ctypes.windll.kernel32))
825 written = ctypes.wintypes.DWORD(0)
827 GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)((b"GetFileType", ctypes.windll.kernel32))
828 FILE_TYPE_CHAR = 0x0002
829 FILE_TYPE_REMOTE = 0x8000
830 GetConsoleMode = ctypes.WINFUNCTYPE(
831 ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
832 ctypes.POINTER(ctypes.wintypes.DWORD))(
833 (b"GetConsoleMode", ctypes.windll.kernel32))
834 INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value
836 def not_a_console(handle):
837 if handle == INVALID_HANDLE_VALUE or handle is None:
839 return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR
840 or GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)
845 def next_nonbmp_pos(s):
847 return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
848 except StopIteration:
852 count = min(next_nonbmp_pos(s), 1024)
855 h, s, count if count else 2, ctypes.byref(written), None)
857 raise OSError('Failed to write string')
858 if not count: # We just wrote a non-BMP character
859 assert written.value == 2
862 assert written.value > 0
863 s = s[written.value:]
867 def write_string(s, out=None, encoding=None):
870 assert type(s) == compat_str
872 if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
873 if _windows_write_string(s, out):
876 if ('b' in getattr(out, 'mode', '') or
877 sys.version_info[0] < 3): # Python 2 lies about mode of sys.stderr
878 byt = s.encode(encoding or preferredencoding(), 'ignore')
880 elif hasattr(out, 'buffer'):
881 enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
882 byt = s.encode(enc, 'ignore')
883 out.buffer.write(byt)
889 def bytes_to_intlist(bs):
892 if isinstance(bs[0], int): # Python 3
895 return [ord(c) for c in bs]
898 def intlist_to_bytes(xs):
901 return struct_pack('%dB' % len(xs), *xs)
904 # Cross-platform file locking
905 if sys.platform == 'win32':
906 import ctypes.wintypes
909 class OVERLAPPED(ctypes.Structure):
911 ('Internal', ctypes.wintypes.LPVOID),
912 ('InternalHigh', ctypes.wintypes.LPVOID),
913 ('Offset', ctypes.wintypes.DWORD),
914 ('OffsetHigh', ctypes.wintypes.DWORD),
915 ('hEvent', ctypes.wintypes.HANDLE),
918 kernel32 = ctypes.windll.kernel32
919 LockFileEx = kernel32.LockFileEx
920 LockFileEx.argtypes = [
921 ctypes.wintypes.HANDLE, # hFile
922 ctypes.wintypes.DWORD, # dwFlags
923 ctypes.wintypes.DWORD, # dwReserved
924 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
925 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
926 ctypes.POINTER(OVERLAPPED) # Overlapped
928 LockFileEx.restype = ctypes.wintypes.BOOL
929 UnlockFileEx = kernel32.UnlockFileEx
930 UnlockFileEx.argtypes = [
931 ctypes.wintypes.HANDLE, # hFile
932 ctypes.wintypes.DWORD, # dwReserved
933 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
934 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
935 ctypes.POINTER(OVERLAPPED) # Overlapped
937 UnlockFileEx.restype = ctypes.wintypes.BOOL
938 whole_low = 0xffffffff
939 whole_high = 0x7fffffff
def _lock_file(f, exclusive):
    """Lock the open file f through the Win32 LockFileEx call.

    exclusive selects the flag value handed to LockFileEx: 0x2 when
    true, 0x0 otherwise. Raises OSError if LockFileEx reports failure.
    """
    overlapped = OVERLAPPED()
    overlapped.Offset = 0
    overlapped.OffsetHigh = 0
    overlapped.hEvent = 0
    # Keep the pointer on the file object: the matching unlock call reads
    # f._lock_file_overlapped_p again.
    f._lock_file_overlapped_p = ctypes.pointer(overlapped)
    handle = msvcrt.get_osfhandle(f.fileno())
    # NOTE(review): 0x2 presumably is LOCKFILE_EXCLUSIVE_LOCK -- confirm
    # against the Win32 LockFileEx documentation.
    if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
                      whole_low, whole_high, f._lock_file_overlapped_p):
        raise OSError('Locking file failed: %r' % ctypes.FormatError())
953 assert f._lock_file_overlapped_p
954 handle = msvcrt.get_osfhandle(f.fileno())
955 if not UnlockFileEx(handle, 0,
956 whole_low, whole_high, f._lock_file_overlapped_p):
957 raise OSError('Unlocking file failed: %r' % ctypes.FormatError())
def _lock_file(f, exclusive):
    """Take an flock() lock on f: LOCK_EX when exclusive is true, LOCK_SH otherwise."""
    operation = fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH
    fcntl.flock(f, operation)
966 fcntl.flock(f, fcntl.LOCK_UN)
969 class locked_file(object):
970 def __init__(self, filename, mode, encoding=None):
971 assert mode in ['r', 'a', 'w']
972 self.f = io.open(filename, mode, encoding=encoding)
976 exclusive = self.mode != 'r'
978 _lock_file(self.f, exclusive)
984 def __exit__(self, etype, value, traceback):
993 def write(self, *args):
994 return self.f.write(*args)
996 def read(self, *args):
997 return self.f.read(*args)
def get_filesystem_encoding():
    """Return sys.getfilesystemencoding(), falling back to 'utf-8' when it is None."""
    fs_encoding = sys.getfilesystemencoding()
    if fs_encoding is None:
        return 'utf-8'
    return fs_encoding
1005 def shell_quote(args):
1007 encoding = get_filesystem_encoding()
1009 if isinstance(a, bytes):
1010 # We may get a filename encoded with 'encodeFilename'
1011 a = a.decode(encoding)
1012 quoted_args.append(pipes.quote(a))
1013 return ' '.join(quoted_args)
1016 def takewhile_inclusive(pred, seq):
1017 """ Like itertools.takewhile, but include the latest evaluated element
1018 (the first element so that Not pred(e)) """
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use.

    data is JSON-encoded, url-encoded under the '__youtubedl_smuggle'
    key and appended to url after a '#'.
    """
    payload = {'__youtubedl_smuggle': json.dumps(data)}
    return url + '#' + compat_urllib_parse.urlencode(payload)
1033 def unsmuggle_url(smug_url, default=None):
1034 if '#__youtubedl_smuggle' not in smug_url:
1035 return smug_url, default
1036 url, _, sdata = smug_url.rpartition('#')
1037 jsond = compat_parse_qs(sdata)['__youtubedl_smuggle'][0]
1038 data = json.loads(jsond)
1042 def format_bytes(bytes):
1045 if type(bytes) is str:
1046 bytes = float(bytes)
1050 exponent = int(math.log(bytes, 1024.0))
1051 suffix = ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB'][exponent]
1052 converted = float(bytes) / float(1024 ** exponent)
1053 return '%.2f%s' % (converted, suffix)
1056 def parse_filesize(s):
1060 # The lower-case forms are of course incorrect and unofficial,
1061 # but we support those too
1099 units_re = '|'.join(re.escape(u) for u in _UNIT_TABLE)
1101 r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)' % units_re, s)
1105 num_str = m.group('num').replace(',', '.')
1106 mult = _UNIT_TABLE[m.group('unit')]
1107 return int(float(num_str) * mult)
1110 def get_term_width():
1111 columns = compat_getenv('COLUMNS', None)
1116 sp = subprocess.Popen(
1118 stdout=subprocess.PIPE, stderr=subprocess.PIPE)
1119 out, err = sp.communicate()
1120 return int(out.split()[1])
1126 def month_by_name(name):
1127 """ Return the number of a month by (locale-independently) English name """
1130 'January', 'February', 'March', 'April', 'May', 'June',
1131 'July', 'August', 'September', 'October', 'November', 'December']
1133 return ENGLISH_NAMES.index(name) + 1
1138 def fix_xml_ampersands(xml_str):
1139 """Replace all the '&' by '&amp;' in XML"""
1141 r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
1146 def setproctitle(title):
1147 assert isinstance(title, compat_str)
1149 libc = ctypes.cdll.LoadLibrary("libc.so.6")
1152 title_bytes = title.encode('utf-8')
1153 buf = ctypes.create_string_buffer(len(title_bytes))
1154 buf.value = title_bytes
1156 libc.prctl(15, buf, 0, 0, 0)
1157 except AttributeError:
1158 return # Strange libc, just skip this
1161 def remove_start(s, start):
1162 if s.startswith(start):
1163 return s[len(start):]
1167 def remove_end(s, end):
1169 return s[:-len(end)]
def url_basename(url):
    """Return the last '/'-separated segment of the URL's path (leading and trailing slashes ignored)."""
    segments = compat_urlparse.urlparse(url).path.strip('/').split('/')
    return segments[-1]
1178 class HEADRequest(compat_urllib_request.Request):
1179 def get_method(self):
1183 def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
1186 v = getattr(v, get_attr, None)
1189 return default if v is None else (int(v) * invscale // scale)
def str_or_none(v, default=None):
    """Return compat_str(v), or default when v is None."""
    if v is None:
        return default
    return compat_str(v)
1196 def str_to_int(int_str):
1197 """ A more relaxed version of int_or_none """
1200 int_str = re.sub(r'[,\.\+]', '', int_str)
def float_or_none(v, scale=1, invscale=1, default=None):
    """Return float(v) * invscale / scale, or default when v is None."""
    if v is None:
        return default
    return float(v) * invscale / scale
1208 def parse_duration(s):
1217 (?P<only_mins>[0-9.]+)\s*(?:mins?|minutes?)\s*|
1218 (?P<only_hours>[0-9.]+)\s*(?:hours?)|
1221 (?:(?P<hours>[0-9]+)\s*(?:[:h]|hours?)\s*)?
1222 (?P<mins>[0-9]+)\s*(?:[:m]|mins?|minutes?)\s*
1224 (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*(?:s|secs?|seconds?)?
1229 if m.group('only_mins'):
1230 return float_or_none(m.group('only_mins'), invscale=60)
1231 if m.group('only_hours'):
1232 return float_or_none(m.group('only_hours'), invscale=60 * 60)
1234 res += int(m.group('secs'))
1236 res += int(m.group('mins')) * 60
1237 if m.group('hours'):
1238 res += int(m.group('hours')) * 60 * 60
1240 res += float(m.group('ms'))
def prepend_extension(filename, ext):
    """Insert ext in front of filename's existing extension ('a.mp4', 'temp' -> 'a.temp.mp4')."""
    stem, current_ext = os.path.splitext(filename)
    return '{0}.{1}{2}'.format(stem, ext, current_ext)
1249 def check_executable(exe, args=[]):
1250 """ Checks if the given binary is installed somewhere in PATH, and returns its name.
1251 args can be a list of arguments for a short output (like -version) """
1253 subprocess.Popen([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
1259 def get_exe_version(exe, args=['--version'],
1260 version_re=r'version\s+([0-9._-a-zA-Z]+)',
1261 unrecognized='present'):
1262 """ Returns the version of the specified executable,
1263 or False if the executable is not present """
1265 out, err = subprocess.Popen(
1267 stdout=subprocess.PIPE, stderr=subprocess.STDOUT).communicate()
1270 firstline = out.partition(b'\n')[0].decode('ascii', 'ignore')
1271 m = re.search(version_re, firstline)
1278 class PagedList(object):
1280 # This is only useful for tests
1281 return len(self.getslice())
1284 class OnDemandPagedList(PagedList):
def __init__(self, pagefunc, pagesize):
    """Store the page-fetching callable (invoked with a page number by getslice) and the page size."""
    self._pagefunc, self._pagesize = pagefunc, pagesize
1289 def getslice(self, start=0, end=None):
1291 for pagenum in itertools.count(start // self._pagesize):
1292 firstid = pagenum * self._pagesize
1293 nextfirstid = pagenum * self._pagesize + self._pagesize
1294 if start >= nextfirstid:
1297 page_results = list(self._pagefunc(pagenum))
1300 start % self._pagesize
1301 if firstid <= start < nextfirstid
1305 ((end - 1) % self._pagesize) + 1
1306 if (end is not None and firstid <= end <= nextfirstid)
1309 if startv != 0 or endv is not None:
1310 page_results = page_results[startv:endv]
1311 res.extend(page_results)
1313 # A little optimization - if current page is not "full", ie. does
1314 # not contain page_size videos then we can assume that this page
1315 # is the last one - there are no more ids on further pages -
1316 # i.e. no need to query again.
1317 if len(page_results) + startv < self._pagesize:
1320 # If we got the whole page, but the next page is not interesting,
1321 # break out early as well
1322 if end == nextfirstid:
1327 class InAdvancePagedList(PagedList):
def __init__(self, pagefunc, pagecount, pagesize):
    """Store the page-fetching callable (invoked with a page number by getslice), the page count and the page size."""
    self._pagefunc, self._pagecount, self._pagesize = pagefunc, pagecount, pagesize
1331 self._pagesize = pagesize
1333 def getslice(self, start=0, end=None):
1335 start_page = start // self._pagesize
1337 self._pagecount if end is None else (end // self._pagesize + 1))
1338 skip_elems = start - start_page * self._pagesize
1339 only_more = None if end is None else end - start
1340 for pagenum in range(start_page, end_page):
1341 page = list(self._pagefunc(pagenum))
1343 page = page[skip_elems:]
1345 if only_more is not None:
1346 if len(page) < only_more:
1347 only_more -= len(page)
1349 page = page[:only_more]
1356 def uppercase_escape(s):
1357 unicode_escape = codecs.getdecoder('unicode_escape')
1359 r'\\U[0-9a-fA-F]{8}',
1360 lambda m: unicode_escape(m.group(0))[0],
def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986"""
    # On Python 2, unicode input is encoded as UTF-8 before quoting
    is_py2_text = sys.version_info < (3, 0) and isinstance(s, unicode)
    if is_py2_text:
        s = s.encode('utf-8')
    # The second argument lists the characters that are left unquoted
    return compat_urllib_parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")
1371 def escape_url(url):
1372 """Escape URL as suggested by RFC 3986"""
1373 url_parsed = compat_urllib_parse_urlparse(url)
1374 return url_parsed._replace(
1375 path=escape_rfc3986(url_parsed.path),
1376 params=escape_rfc3986(url_parsed.params),
1377 query=escape_rfc3986(url_parsed.query),
1378 fragment=escape_rfc3986(url_parsed.fragment)
1382 struct.pack('!I', 0)
1384 # In Python 2.6 (and some 2.7 versions), struct requires a bytes argument
def struct_pack(spec, *args):
    """struct.pack wrapper that first encodes a compat_str format spec to ASCII bytes."""
    fmt = spec.encode('ascii') if isinstance(spec, compat_str) else spec
    return struct.pack(fmt, *args)
def struct_unpack(spec, *args):
    """struct.unpack wrapper that first encodes a compat_str format spec to ASCII bytes."""
    fmt = spec.encode('ascii') if isinstance(spec, compat_str) else spec
    return struct.unpack(fmt, *args)
1395 struct_pack = struct.pack
1396 struct_unpack = struct.unpack
1399 def read_batch_urls(batch_fd):
1401 if not isinstance(url, compat_str):
1402 url = url.decode('utf-8', 'replace')
1403 BOM_UTF8 = '\xef\xbb\xbf'
1404 if url.startswith(BOM_UTF8):
1405 url = url[len(BOM_UTF8):]
1407 if url.startswith(('#', ';', ']')):
1411 with contextlib.closing(batch_fd) as fd:
1412 return [url for url in map(fixup, fd) if url]
def urlencode_postdata(*args, **kargs):
    """Url-encode the arguments with compat_urllib_parse.urlencode and return the result as ASCII bytes."""
    encoded = compat_urllib_parse.urlencode(*args, **kargs)
    return encoded.encode('ascii')
1420 etree_iter = xml.etree.ElementTree.Element.iter
1421 except AttributeError: # Python <=2.6
1422 etree_iter = lambda n: n.findall('.//*')
1426 class TreeBuilder(xml.etree.ElementTree.TreeBuilder):
1427 def doctype(self, name, pubid, system):
1428 pass # Ignore doctypes
1430 parser = xml.etree.ElementTree.XMLParser(target=TreeBuilder())
1431 kwargs = {'parser': parser} if sys.version_info >= (2, 7) else {}
1432 tree = xml.etree.ElementTree.XML(s.encode('utf-8'), **kwargs)
1433 # Fix up XML parser in Python 2.x
1434 if sys.version_info < (3, 0):
1435 for n in etree_iter(tree):
1436 if n.text is not None:
1437 if not isinstance(n.text, compat_str):
1438 n.text = n.text.decode('utf-8')
1451 def parse_age_limit(s):
1454 m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
1455 return int(m.group('age')) if m else US_RATINGS.get(s, None)
1458 def strip_jsonp(code):
1460 r'(?s)^[a-zA-Z0-9_]+\s*\(\s*(.*)\);?\s*?(?://[^\n]*)*$', r'\1', code)
1463 def js_to_json(code):
1466 if v in ('true', 'false', 'null'):
1468 if v.startswith('"'):
1470 if v.startswith("'"):
1472 v = re.sub(r"\\\\|\\'|\"", lambda m: {
1479 res = re.sub(r'''(?x)
1480 "(?:[^"\\]*(?:\\\\|\\")?)*"|
1481 '(?:[^'\\]*(?:\\\\|\\')?)*'|
1482 [a-zA-Z_][a-zA-Z_0-9]*
1484 res = re.sub(r',(\s*\])', lambda m: m.group(1), res)
1488 def qualities(quality_ids):
1489 """ Get a numeric quality value out of a list of possible values """
1492 return quality_ids.index(qid)
1498 DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
1501 def limit_length(s, length):
1502 """ Add ellipses to overly long strings """
1507 return s[:length - len(ELLIPSES)] + ELLIPSES
def version_tuple(v):
    """Split a version string on '.' and '-' and return its parts as a tuple of ints."""
    parts = re.split(r'[-.]', v)
    return tuple(map(int, parts))
1515 def is_outdated_version(version, limit, assume_new=True):
1517 return not assume_new
1519 return version_tuple(version) < version_tuple(limit)
1521 return not assume_new
def ytdl_is_updateable():
    """ Returns if youtube-dl can be updated with -U """
    from zipimport import zipimporter

    # Updateable when this module was loaded by a zipimporter ...
    loaded_from_zip = isinstance(globals().get('__loader__'), zipimporter)
    # ... or when the interpreter reports itself as frozen.
    is_frozen = hasattr(sys, 'frozen')
    return loaded_from_zip or is_frozen
def args_to_str(args):
    """Get a short string representation for a subprocess command."""
    # Each argument is shell-quoted, then the pieces are joined by spaces
    return ' '.join(map(shlex_quote, args))