2 # -*- coding: utf-8 -*-
4 from __future__ import unicode_literals
30 import xml.etree.ElementTree
41 compat_urllib_parse_urlparse,
42 compat_urllib_request,
48 # This is not clearly defined otherwise
49 compiled_regex_type = type(re.compile(''))
52 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0 (Chrome)',
53 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
54 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
55 'Accept-Encoding': 'gzip, deflate',
56 'Accept-Language': 'en-us,en;q=0.5',
60 def preferredencoding():
61 """Get preferred encoding.
63 Returns the best encoding scheme for the system, based on
64 locale.getpreferredencoding() and some further tweaks.
67 pref = locale.getpreferredencoding()
75 def write_json_file(obj, fn):
76 """ Encode obj as JSON and write it to fn, atomically if possible """
78 fn = encodeFilename(fn)
79 if sys.version_info < (3, 0) and sys.platform != 'win32':
80 encoding = get_filesystem_encoding()
81 # os.path.basename returns a bytes object, but NamedTemporaryFile
82 # will fail if the filename contains non ascii characters unless we
83 # use a unicode object
84 path_basename = lambda f: os.path.basename(fn).decode(encoding)
85 # the same for os.path.dirname
86 path_dirname = lambda f: os.path.dirname(fn).decode(encoding)
88 path_basename = os.path.basename
89 path_dirname = os.path.dirname
93 'prefix': path_basename(fn) + '.',
94 'dir': path_dirname(fn),
98 # In Python 2.x, json.dump expects a bytestream.
99 # In Python 3.x, it writes to a character stream
100 if sys.version_info < (3, 0):
108 tf = tempfile.NamedTemporaryFile(**args)
113 if sys.platform == 'win32':
114 # Need to remove existing file on Windows, else os.rename raises
115 # WindowsError or FileExistsError.
120 os.rename(tf.name, fn)
129 if sys.version_info >= (2, 7):
130 def find_xpath_attr(node, xpath, key, val):
131 """ Find the xpath xpath[@key=val] """
132 assert re.match(r'^[a-zA-Z-]+$', key)
133 assert re.match(r'^[a-zA-Z0-9@\s:._-]*$', val)
134 expr = xpath + "[@%s='%s']" % (key, val)
135 return node.find(expr)
137 def find_xpath_attr(node, xpath, key, val):
138 # Here comes the crazy part: In 2.6, if the xpath is a unicode,
139 # .//node does not match if a node is a direct child of . !
140 if isinstance(xpath, unicode):
141 xpath = xpath.encode('ascii')
143 for f in node.findall(xpath):
144 if f.attrib.get(key) == val:
148 # On python2.6 the xml.etree.ElementTree.Element methods don't support
149 # the namespace parameter
152 def xpath_with_ns(path, ns_map):
153 components = [c.split(':') for c in path.split('/')]
157 replaced.append(c[0])
160 replaced.append('{%s}%s' % (ns_map[ns], tag))
161 return '/'.join(replaced)
164 def xpath_text(node, xpath, name=None, fatal=False):
165 if sys.version_info < (2, 7): # Crazy 2.6
166 xpath = xpath.encode('ascii')
169 if n is None or n.text is None:
171 name = xpath if name is None else name
172 raise ExtractorError('Could not find XML element %s' % name)
def get_element_by_id(id, html):
    """Return the content of the tag with the given ID in the HTML document.

    Convenience wrapper: the lookup is delegated to
    get_element_by_attribute() with the attribute name fixed to "id", and
    its result is returned unchanged.
    """
    attribute_name = "id"
    return get_element_by_attribute(attribute_name, id, html)
183 def get_element_by_attribute(attribute, value, html):
184 """Return the content of the tag with the specified attribute in the passed HTML document"""
186 m = re.search(r'''(?xs)
188 (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
190 (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
194 ''' % (re.escape(attribute), re.escape(value)), html)
198 res = m.group('content')
200 if res.startswith('"') or res.startswith("'"):
203 return unescapeHTML(res)
206 def clean_html(html):
207 """Clean an HTML snippet into a readable string"""
209 html = html.replace('\n', ' ')
210 html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
211 html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
213 html = re.sub('<.*?>', '', html)
214 # Replace html entities
215 html = unescapeHTML(html)
219 def sanitize_open(filename, open_mode):
220 """Try to open the given filename, and slightly tweak it if this fails.
222 Attempts to open the given filename. If this fails, it tries to change
223 the filename slightly, step by step, until it's either able to open it
224 or it fails and raises a final exception, like the standard open()
227 It returns the tuple (stream, definitive_file_name).
231 if sys.platform == 'win32':
233 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
234 return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
235 stream = open(encodeFilename(filename), open_mode)
236 return (stream, filename)
237 except (IOError, OSError) as err:
238 if err.errno in (errno.EACCES,):
241 # In case of error, try to remove win32 forbidden chars
242 alt_filename = os.path.join(
243 re.sub('[/<>:"\\|\\\\?\\*]', '#', path_part)
244 for path_part in os.path.split(filename)
246 if alt_filename == filename:
249 # An exception here should be caught in the caller
250 stream = open(encodeFilename(filename), open_mode)
251 return (stream, alt_filename)
254 def timeconvert(timestr):
255 """Convert RFC 2822 defined time string into system timestamp"""
257 timetuple = email.utils.parsedate_tz(timestr)
258 if timetuple is not None:
259 timestamp = email.utils.mktime_tz(timetuple)
263 def sanitize_filename(s, restricted=False, is_id=False):
264 """Sanitizes a string so it could be used as part of a filename.
265 If restricted is set, use a stricter subset of allowed characters.
266 Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
268 def replace_insane(char):
269 if char == '?' or ord(char) < 32 or ord(char) == 127:
272 return '' if restricted else '\''
274 return '_-' if restricted else ' -'
275 elif char in '\\/|*<>':
277 if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
279 if restricted and ord(char) > 127:
283 result = ''.join(map(replace_insane, s))
285 while '__' in result:
286 result = result.replace('__', '_')
287 result = result.strip('_')
288 # Common case of "Foreign band name - English song title"
289 if restricted and result.startswith('-_'):
296 def orderedSet(iterable):
297 """ Remove all duplicates from the input iterable """
305 def _htmlentity_transform(entity):
306 """Transforms an HTML entity to a character."""
307 # Known non-numeric HTML entity
308 if entity in compat_html_entities.name2codepoint:
309 return compat_chr(compat_html_entities.name2codepoint[entity])
311 mobj = re.match(r'#(x?[0-9]+)', entity)
313 numstr = mobj.group(1)
314 if numstr.startswith('x'):
316 numstr = '0%s' % numstr
319 return compat_chr(int(numstr, base))
321 # Unknown entity in name, return its literal representation
322 return ('&%s;' % entity)
328 assert type(s) == compat_str
331 r'&([^;]+);', lambda m: _htmlentity_transform(m.group(1)), s)
334 def encodeFilename(s, for_subprocess=False):
336 @param s The name of the file
339 assert type(s) == compat_str
341 # Python 3 has a Unicode API
342 if sys.version_info >= (3, 0):
345 if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
346 # Pass '' directly to use Unicode APIs on Windows 2000 and up
347 # (Detecting Windows NT 4 is tricky because 'major >= 4' would
348 # match Windows 9x series as well. Besides, NT 4 is obsolete.)
349 if not for_subprocess:
352 # For subprocess calls, encode with locale encoding
353 # Refer to http://stackoverflow.com/a/9951851/35070
354 encoding = preferredencoding()
356 encoding = sys.getfilesystemencoding()
359 return s.encode(encoding, 'ignore')
def encodeArgument(s):
    """Encode a single argument for use in a subprocess call.

    Text is passed to encodeFilename() with its second argument
    (for_subprocess) set to True. Anything that is not a compat_str is
    first decoded as ASCII.
    """
    # Legacy code that uses byte strings
    # Uncomment the following line after fixing all post processors
    # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
    text = s if isinstance(s, compat_str) else s.decode('ascii')
    return encodeFilename(text, True)
371 def decodeOption(optval):
374 if isinstance(optval, bytes):
375 optval = optval.decode(preferredencoding())
377 assert isinstance(optval, compat_str)
381 def formatSeconds(secs):
383 return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
385 return '%d:%02d' % (secs // 60, secs % 60)
390 def make_HTTPS_handler(opts_no_check_certificate, **kwargs):
391 if hasattr(ssl, 'create_default_context'): # Python >= 3.4 or 2.7.9
392 context = ssl.create_default_context(ssl.Purpose.CLIENT_AUTH)
393 if opts_no_check_certificate:
394 context.verify_mode = ssl.CERT_NONE
396 return compat_urllib_request.HTTPSHandler(context=context, **kwargs)
399 # (create_default_context present but HTTPSHandler has no context=)
402 if sys.version_info < (3, 2):
405 class HTTPSConnectionV3(httplib.HTTPSConnection):
406 def __init__(self, *args, **kwargs):
407 httplib.HTTPSConnection.__init__(self, *args, **kwargs)
410 sock = socket.create_connection((self.host, self.port), self.timeout)
411 if getattr(self, '_tunnel_host', False):
415 self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_TLSv1)
417 self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_SSLv23)
419 class HTTPSHandlerV3(compat_urllib_request.HTTPSHandler):
420 def https_open(self, req):
421 return self.do_open(HTTPSConnectionV3, req)
422 return HTTPSHandlerV3(**kwargs)
424 context = ssl.SSLContext(ssl.PROTOCOL_SSLv23)
425 context.verify_mode = (ssl.CERT_NONE
426 if opts_no_check_certificate
427 else ssl.CERT_REQUIRED)
428 context.set_default_verify_paths()
429 return compat_urllib_request.HTTPSHandler(context=context, **kwargs)
432 class ExtractorError(Exception):
433 """Error during info extraction."""
435 def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
436 """ tb, if given, is the original traceback (so that it can be printed out).
437 If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
440 if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
442 if video_id is not None:
443 msg = video_id + ': ' + msg
445 msg += ' (caused by %r)' % cause
447 if ytdl_is_updateable():
448 update_cmd = 'type youtube-dl -U to update'
450 update_cmd = 'see https://yt-dl.org/update on how to update'
451 msg += '; please report this issue on https://yt-dl.org/bug .'
452 msg += ' Make sure you are using the latest version; %s.' % update_cmd
453 msg += ' Be sure to call youtube-dl with the --verbose flag and include its complete output.'
454 super(ExtractorError, self).__init__(msg)
457 self.exc_info = sys.exc_info() # preserve original exception
459 self.video_id = video_id
461 def format_traceback(self):
462 if self.traceback is None:
464 return ''.join(traceback.format_tb(self.traceback))
467 class UnsupportedError(ExtractorError):
468 def __init__(self, url):
469 super(UnsupportedError, self).__init__(
470 'Unsupported URL: %s' % url, expected=True)
474 class RegexNotFoundError(ExtractorError):
475 """Error when a regex didn't match"""
479 class DownloadError(Exception):
480 """Download Error exception.
482 This exception may be thrown by FileDownloader objects if they are not
483 configured to continue on errors. They will contain the appropriate
487 def __init__(self, msg, exc_info=None):
488 """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
489 super(DownloadError, self).__init__(msg)
490 self.exc_info = exc_info
493 class SameFileError(Exception):
494 """Same File exception.
496 This exception will be thrown by FileDownloader objects if they detect
497 multiple files would have to be downloaded to the same file on disk.
502 class PostProcessingError(Exception):
503 """Post Processing exception.
505 This exception may be raised by PostProcessor's .run() method to
506 indicate an error in the postprocessing task.
509 def __init__(self, msg):
513 class MaxDownloadsReached(Exception):
514 """ --max-downloads limit has been reached. """
518 class UnavailableVideoError(Exception):
519 """Unavailable Format exception.
521 This exception will be thrown when a video is requested
522 in a format that is not available for that video.
527 class ContentTooShortError(Exception):
528 """Content Too Short exception.
530 This exception may be raised by FileDownloader objects when a file they
531 download is too small for what the server announced first, indicating
532 the connection was probably interrupted.
538 def __init__(self, downloaded, expected):
539 self.downloaded = downloaded
540 self.expected = expected
543 class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
544 """Handler for HTTP requests and responses.
546 This class, when installed with an OpenerDirector, automatically adds
547 the standard headers to every HTTP request and handles gzipped and
548 deflated responses from web servers. If compression is to be avoided in
549 a particular request, the original request in the program code only has
550 to include the HTTP header "Youtubedl-No-Compression", which will be
551 removed before making the real request.
553 Part of this code was copied from:
555 http://techknack.net/python-urllib2-handlers/
557 Andrew Rowls, the author of that code, agreed to release it to the
564 return zlib.decompress(data, -zlib.MAX_WBITS)
566 return zlib.decompress(data)
569 def addinfourl_wrapper(stream, headers, url, code):
570 if hasattr(compat_urllib_request.addinfourl, 'getcode'):
571 return compat_urllib_request.addinfourl(stream, headers, url, code)
572 ret = compat_urllib_request.addinfourl(stream, headers, url)
576 def http_request(self, req):
577 for h, v in std_headers.items():
578 if h not in req.headers:
580 if 'Youtubedl-no-compression' in req.headers:
581 if 'Accept-encoding' in req.headers:
582 del req.headers['Accept-encoding']
583 del req.headers['Youtubedl-no-compression']
584 if 'Youtubedl-user-agent' in req.headers:
585 if 'User-agent' in req.headers:
586 del req.headers['User-agent']
587 req.headers['User-agent'] = req.headers['Youtubedl-user-agent']
588 del req.headers['Youtubedl-user-agent']
590 if sys.version_info < (2, 7) and '#' in req.get_full_url():
591 # Python 2.6 is brain-dead when it comes to fragments
592 req._Request__original = req._Request__original.partition('#')[0]
593 req._Request__r_type = req._Request__r_type.partition('#')[0]
597 def http_response(self, req, resp):
600 if resp.headers.get('Content-encoding', '') == 'gzip':
601 content = resp.read()
602 gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
604 uncompressed = io.BytesIO(gz.read())
605 except IOError as original_ioerror:
606 # There may be junk at the end of the file
607 # See http://stackoverflow.com/q/4928560/35070 for details
608 for i in range(1, 1024):
610 gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
611 uncompressed = io.BytesIO(gz.read())
616 raise original_ioerror
617 resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
618 resp.msg = old_resp.msg
620 if resp.headers.get('Content-encoding', '') == 'deflate':
621 gz = io.BytesIO(self.deflate(resp.read()))
622 resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
623 resp.msg = old_resp.msg
626 https_request = http_request
627 https_response = http_response
630 def parse_iso8601(date_str, delimiter='T'):
631 """ Return a UNIX timestamp from the given date """
637 r'(\.[0-9]+)?(?:Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
640 timezone = datetime.timedelta()
642 date_str = date_str[:-len(m.group(0))]
643 if not m.group('sign'):
644 timezone = datetime.timedelta()
646 sign = 1 if m.group('sign') == '+' else -1
647 timezone = datetime.timedelta(
648 hours=sign * int(m.group('hours')),
649 minutes=sign * int(m.group('minutes')))
650 date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
651 dt = datetime.datetime.strptime(date_str, date_format) - timezone
652 return calendar.timegm(dt.timetuple())
655 def unified_strdate(date_str, day_first=True):
656 """Return a string with the date in the format YYYYMMDD"""
662 date_str = date_str.replace(',', ' ')
663 # %z (UTC offset) is only supported in python>=3.2
664 date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str)
665 # Remove AM/PM + timezone
666 date_str = re.sub(r'(?i)\s*(?:AM|PM)\s+[A-Z]+', '', date_str)
668 format_expressions = [
673 '%b %dst %Y %I:%M%p',
674 '%b %dnd %Y %I:%M%p',
675 '%b %dth %Y %I:%M%p',
683 '%Y-%m-%d %H:%M:%S.%f',
686 '%Y-%m-%dT%H:%M:%SZ',
687 '%Y-%m-%dT%H:%M:%S.%fZ',
688 '%Y-%m-%dT%H:%M:%S.%f0Z',
690 '%Y-%m-%dT%H:%M:%S.%f',
694 format_expressions.extend([
698 format_expressions.extend([
701 for expression in format_expressions:
703 upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
706 if upload_date is None:
707 timetuple = email.utils.parsedate_tz(date_str)
709 upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
713 def determine_ext(url, default_ext='unknown_video'):
716 guess = url.partition('?')[0].rpartition('.')[2]
717 if re.match(r'^[A-Za-z0-9]+$', guess):
def subtitles_filename(filename, sub_lang, sub_format):
    """Build the subtitle file name that belongs to *filename*.

    The extension of *filename* (if any) is replaced by
    ``<sub_lang>.<sub_format>``, e.g. ``video.mp4`` -> ``video.en.srt``.
    A name without an extension simply gets the suffix appended.

    @param filename    Name (optionally with a directory part) of the media file
    @param sub_lang    Language code inserted before the format
    @param sub_format  Subtitle format, used as the new extension
    """
    # os.path.splitext only inspects the last path component, so a dot in a
    # directory name ('my.dir/video') is never mistaken for the extension
    # separator; splitting the whole string on the last '.' would cut the
    # path off at that directory dot.
    stem = os.path.splitext(filename)[0]
    return stem + '.' + sub_lang + '.' + sub_format
727 def date_from_str(date_str):
729 Return a datetime object from a string in the format YYYYMMDD or
730 (now|today)[+-][0-9](day|week|month|year)(s)?"""
731 today = datetime.date.today()
732 if date_str in ('now', 'today'):
734 if date_str == 'yesterday':
735 return today - datetime.timedelta(days=1)
736 match = re.match('(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
737 if match is not None:
738 sign = match.group('sign')
739 time = int(match.group('time'))
742 unit = match.group('unit')
743 # A bad approximation?
751 delta = datetime.timedelta(**{unit: time})
753 return datetime.datetime.strptime(date_str, "%Y%m%d").date()
756 def hyphenate_date(date_str):
758 Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
759 match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
760 if match is not None:
761 return '-'.join(match.groups())
766 class DateRange(object):
767 """Represents a time interval between two dates"""
769 def __init__(self, start=None, end=None):
770 """start and end must be strings in the format accepted by date"""
771 if start is not None:
772 self.start = date_from_str(start)
774 self.start = datetime.datetime.min.date()
776 self.end = date_from_str(end)
778 self.end = datetime.datetime.max.date()
779 if self.start > self.end:
780 raise ValueError('Date range: "%s" , the start date must be before the end date' % self)
784 """Returns a range that only contains the given day"""
787 def __contains__(self, date):
788 """Check if the date is in the range"""
789 if not isinstance(date, datetime.date):
790 date = date_from_str(date)
791 return self.start <= date <= self.end
794 return '%s - %s' % (self.start.isoformat(), self.end.isoformat())
798 """ Returns the platform name as a compat_str """
799 res = platform.platform()
800 if isinstance(res, bytes):
801 res = res.decode(preferredencoding())
803 assert isinstance(res, compat_str)
807 def _windows_write_string(s, out):
808 """ Returns True if the string was written using special methods,
809 False if it has yet to be written out."""
810 # Adapted from http://stackoverflow.com/a/3259271/35070
813 import ctypes.wintypes
821 fileno = out.fileno()
822 except AttributeError:
823 # If the output stream doesn't have a fileno, it's virtual
825 if fileno not in WIN_OUTPUT_IDS:
828 GetStdHandle = ctypes.WINFUNCTYPE(
829 ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
830 (b"GetStdHandle", ctypes.windll.kernel32))
831 h = GetStdHandle(WIN_OUTPUT_IDS[fileno])
833 WriteConsoleW = ctypes.WINFUNCTYPE(
834 ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
835 ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
836 ctypes.wintypes.LPVOID)((b"WriteConsoleW", ctypes.windll.kernel32))
837 written = ctypes.wintypes.DWORD(0)
839 GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)((b"GetFileType", ctypes.windll.kernel32))
840 FILE_TYPE_CHAR = 0x0002
841 FILE_TYPE_REMOTE = 0x8000
842 GetConsoleMode = ctypes.WINFUNCTYPE(
843 ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
844 ctypes.POINTER(ctypes.wintypes.DWORD))(
845 (b"GetConsoleMode", ctypes.windll.kernel32))
846 INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value
848 def not_a_console(handle):
849 if handle == INVALID_HANDLE_VALUE or handle is None:
851 return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR
852 or GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)
857 def next_nonbmp_pos(s):
859 return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
860 except StopIteration:
864 count = min(next_nonbmp_pos(s), 1024)
867 h, s, count if count else 2, ctypes.byref(written), None)
869 raise OSError('Failed to write string')
870 if not count: # We just wrote a non-BMP character
871 assert written.value == 2
874 assert written.value > 0
875 s = s[written.value:]
879 def write_string(s, out=None, encoding=None):
882 assert type(s) == compat_str
884 if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
885 if _windows_write_string(s, out):
888 if ('b' in getattr(out, 'mode', '') or
889 sys.version_info[0] < 3): # Python 2 lies about mode of sys.stderr
890 byt = s.encode(encoding or preferredencoding(), 'ignore')
892 elif hasattr(out, 'buffer'):
893 enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
894 byt = s.encode(enc, 'ignore')
895 out.buffer.write(byt)
901 def bytes_to_intlist(bs):
904 if isinstance(bs[0], int): # Python 3
907 return [ord(c) for c in bs]
910 def intlist_to_bytes(xs):
913 return struct_pack('%dB' % len(xs), *xs)
916 # Cross-platform file locking
917 if sys.platform == 'win32':
918 import ctypes.wintypes
921 class OVERLAPPED(ctypes.Structure):
923 ('Internal', ctypes.wintypes.LPVOID),
924 ('InternalHigh', ctypes.wintypes.LPVOID),
925 ('Offset', ctypes.wintypes.DWORD),
926 ('OffsetHigh', ctypes.wintypes.DWORD),
927 ('hEvent', ctypes.wintypes.HANDLE),
930 kernel32 = ctypes.windll.kernel32
931 LockFileEx = kernel32.LockFileEx
932 LockFileEx.argtypes = [
933 ctypes.wintypes.HANDLE, # hFile
934 ctypes.wintypes.DWORD, # dwFlags
935 ctypes.wintypes.DWORD, # dwReserved
936 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
937 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
938 ctypes.POINTER(OVERLAPPED) # Overlapped
940 LockFileEx.restype = ctypes.wintypes.BOOL
941 UnlockFileEx = kernel32.UnlockFileEx
942 UnlockFileEx.argtypes = [
943 ctypes.wintypes.HANDLE, # hFile
944 ctypes.wintypes.DWORD, # dwReserved
945 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
946 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
947 ctypes.POINTER(OVERLAPPED) # Overlapped
949 UnlockFileEx.restype = ctypes.wintypes.BOOL
950 whole_low = 0xffffffff
951 whole_high = 0x7fffffff
953 def _lock_file(f, exclusive):
954 overlapped = OVERLAPPED()
955 overlapped.Offset = 0
956 overlapped.OffsetHigh = 0
957 overlapped.hEvent = 0
958 f._lock_file_overlapped_p = ctypes.pointer(overlapped)
959 handle = msvcrt.get_osfhandle(f.fileno())
960 if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
961 whole_low, whole_high, f._lock_file_overlapped_p):
962 raise OSError('Locking file failed: %r' % ctypes.FormatError())
965 assert f._lock_file_overlapped_p
966 handle = msvcrt.get_osfhandle(f.fileno())
967 if not UnlockFileEx(handle, 0,
968 whole_low, whole_high, f._lock_file_overlapped_p):
969 raise OSError('Unlocking file failed: %r' % ctypes.FormatError())
974 def _lock_file(f, exclusive):
975 fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)
978 fcntl.flock(f, fcntl.LOCK_UN)
981 class locked_file(object):
982 def __init__(self, filename, mode, encoding=None):
983 assert mode in ['r', 'a', 'w']
984 self.f = io.open(filename, mode, encoding=encoding)
988 exclusive = self.mode != 'r'
990 _lock_file(self.f, exclusive)
996 def __exit__(self, etype, value, traceback):
1005 def write(self, *args):
1006 return self.f.write(*args)
1008 def read(self, *args):
1009 return self.f.read(*args)
def get_filesystem_encoding():
    """Return sys.getfilesystemencoding(), or 'utf-8' when that is None."""
    fs_encoding = sys.getfilesystemencoding()
    if fs_encoding is None:
        return 'utf-8'
    return fs_encoding
1017 def shell_quote(args):
1019 encoding = get_filesystem_encoding()
1021 if isinstance(a, bytes):
1022 # We may get a filename encoded with 'encodeFilename'
1023 a = a.decode(encoding)
1024 quoted_args.append(pipes.quote(a))
1025 return ' '.join(quoted_args)
1028 def takewhile_inclusive(pred, seq):
1029 """ Like itertools.takewhile, but include the latest evaluated element
1030 (the first element so that Not pred(e)) """
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use. """
    # The data is JSON-serialized, URL-encoded under a fixed key and
    # appended to the URL after a '#'.
    payload = {'__youtubedl_smuggle': json.dumps(data)}
    encoded_payload = compat_urllib_parse.urlencode(payload)
    return url + '#' + encoded_payload
1045 def unsmuggle_url(smug_url, default=None):
1046 if '#__youtubedl_smuggle' not in smug_url:
1047 return smug_url, default
1048 url, _, sdata = smug_url.rpartition('#')
1049 jsond = compat_parse_qs(sdata)['__youtubedl_smuggle'][0]
1050 data = json.loads(jsond)
1054 def format_bytes(bytes):
1057 if type(bytes) is str:
1058 bytes = float(bytes)
1062 exponent = int(math.log(bytes, 1024.0))
1063 suffix = ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB'][exponent]
1064 converted = float(bytes) / float(1024 ** exponent)
1065 return '%.2f%s' % (converted, suffix)
1068 def parse_filesize(s):
1072 # The lower-case forms are of course incorrect and unofficial,
1073 # but we support those too
1111 units_re = '|'.join(re.escape(u) for u in _UNIT_TABLE)
1113 r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)' % units_re, s)
1117 num_str = m.group('num').replace(',', '.')
1118 mult = _UNIT_TABLE[m.group('unit')]
1119 return int(float(num_str) * mult)
1122 def get_term_width():
1123 columns = compat_getenv('COLUMNS', None)
1128 sp = subprocess.Popen(
1130 stdout=subprocess.PIPE, stderr=subprocess.PIPE)
1131 out, err = sp.communicate()
1132 return int(out.split()[1])
1138 def month_by_name(name):
1139 """ Return the number of a month by (locale-independently) English name """
1142 'January', 'February', 'March', 'April', 'May', 'June',
1143 'July', 'August', 'September', 'October', 'November', 'December']
1145 return ENGLISH_NAMES.index(name) + 1
1150 def fix_xml_ampersands(xml_str):
1151 """Replace all the '&' by '&' in XML"""
1153 r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
1158 def setproctitle(title):
1159 assert isinstance(title, compat_str)
1161 libc = ctypes.cdll.LoadLibrary("libc.so.6")
1164 title_bytes = title.encode('utf-8')
1165 buf = ctypes.create_string_buffer(len(title_bytes))
1166 buf.value = title_bytes
1168 libc.prctl(15, buf, 0, 0, 0)
1169 except AttributeError:
1170 return # Strange libc, just skip this
1173 def remove_start(s, start):
1174 if s.startswith(start):
1175 return s[len(start):]
1179 def remove_end(s, end):
1181 return s[:-len(end)]
def url_basename(url):
    """Return the last '/'-separated segment of the URL's path.

    Leading and trailing slashes of the path are stripped before
    splitting, so a trailing '/' does not yield an empty result.
    """
    parsed = compat_urlparse.urlparse(url)
    segments = parsed.path.strip('/').split('/')
    return segments[-1]
1190 class HEADRequest(compat_urllib_request.Request):
1191 def get_method(self):
1195 def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
1198 v = getattr(v, get_attr, None)
1201 return default if v is None else (int(v) * invscale // scale)
def str_or_none(v, default=None):
    """Return compat_str(v), or *default* when v is None."""
    if v is None:
        return default
    return compat_str(v)
1208 def str_to_int(int_str):
1209 """ A more relaxed version of int_or_none """
1212 int_str = re.sub(r'[,\.\+]', '', int_str)
def float_or_none(v, scale=1, invscale=1, default=None):
    """Return float(v) * invscale / scale, or *default* when v is None."""
    if v is None:
        return default
    number = float(v)
    return number * invscale / scale
1220 def parse_duration(s):
1229 (?P<only_mins>[0-9.]+)\s*(?:mins?|minutes?)\s*|
1230 (?P<only_hours>[0-9.]+)\s*(?:hours?)|
1233 (?:(?P<hours>[0-9]+)\s*(?:[:h]|hours?)\s*)?
1234 (?P<mins>[0-9]+)\s*(?:[:m]|mins?|minutes?)\s*
1236 (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*(?:s|secs?|seconds?)?
1241 if m.group('only_mins'):
1242 return float_or_none(m.group('only_mins'), invscale=60)
1243 if m.group('only_hours'):
1244 return float_or_none(m.group('only_hours'), invscale=60 * 60)
1246 res += int(m.group('secs'))
1248 res += int(m.group('mins')) * 60
1249 if m.group('hours'):
1250 res += int(m.group('hours')) * 60 * 60
1252 res += float(m.group('ms'))
def prepend_extension(filename, ext):
    """Insert *ext* in front of the existing extension of *filename*.

    'abc.ext' with ext 'temp' becomes 'abc.temp.ext'; a name without an
    extension just gets '.temp' appended.
    """
    parts = os.path.splitext(filename)
    stem = parts[0]
    original_ext = parts[1]
    return '{0}.{1}{2}'.format(stem, ext, original_ext)
1261 def check_executable(exe, args=[]):
1262 """ Checks if the given binary is installed somewhere in PATH, and returns its name.
1263 args can be a list of arguments for a short output (like -version) """
1265 subprocess.Popen([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
1271 def get_exe_version(exe, args=['--version'],
1272 version_re=None, unrecognized='present'):
1273 """ Returns the version of the specified executable,
1274 or False if the executable is not present """
1276 out, _ = subprocess.Popen(
1278 stdout=subprocess.PIPE, stderr=subprocess.STDOUT).communicate()
1281 if isinstance(out, bytes): # Python 2.x
1282 out = out.decode('ascii', 'ignore')
1283 return detect_exe_version(out, version_re, unrecognized)
1286 def detect_exe_version(output, version_re=None, unrecognized='present'):
1287 assert isinstance(output, compat_str)
1288 if version_re is None:
1289 version_re = r'version\s+([-0-9._a-zA-Z]+)'
1290 m = re.search(version_re, output)
1297 class PagedList(object):
1299 # This is only useful for tests
1300 return len(self.getslice())
1303 class OnDemandPagedList(PagedList):
1304 def __init__(self, pagefunc, pagesize):
1305 self._pagefunc = pagefunc
1306 self._pagesize = pagesize
1308 def getslice(self, start=0, end=None):
1310 for pagenum in itertools.count(start // self._pagesize):
1311 firstid = pagenum * self._pagesize
1312 nextfirstid = pagenum * self._pagesize + self._pagesize
1313 if start >= nextfirstid:
1316 page_results = list(self._pagefunc(pagenum))
1319 start % self._pagesize
1320 if firstid <= start < nextfirstid
1324 ((end - 1) % self._pagesize) + 1
1325 if (end is not None and firstid <= end <= nextfirstid)
1328 if startv != 0 or endv is not None:
1329 page_results = page_results[startv:endv]
1330 res.extend(page_results)
1332 # A little optimization - if current page is not "full", ie. does
1333 # not contain page_size videos then we can assume that this page
1334 # is the last one - there are no more ids on further pages -
1335 # i.e. no need to query again.
1336 if len(page_results) + startv < self._pagesize:
1339 # If we got the whole page, but the next page is not interesting,
1340 # break out early as well
1341 if end == nextfirstid:
1346 class InAdvancePagedList(PagedList):
1347 def __init__(self, pagefunc, pagecount, pagesize):
1348 self._pagefunc = pagefunc
1349 self._pagecount = pagecount
1350 self._pagesize = pagesize
1352 def getslice(self, start=0, end=None):
1354 start_page = start // self._pagesize
1356 self._pagecount if end is None else (end // self._pagesize + 1))
1357 skip_elems = start - start_page * self._pagesize
1358 only_more = None if end is None else end - start
1359 for pagenum in range(start_page, end_page):
1360 page = list(self._pagefunc(pagenum))
1362 page = page[skip_elems:]
1364 if only_more is not None:
1365 if len(page) < only_more:
1366 only_more -= len(page)
1368 page = page[:only_more]
1375 def uppercase_escape(s):
1376 unicode_escape = codecs.getdecoder('unicode_escape')
1378 r'\\U[0-9a-fA-F]{8}',
1379 lambda m: unicode_escape(m.group(0))[0],
def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986"""
    running_python2 = sys.version_info < (3, 0)
    # On Python 2 a unicode string is UTF-8 encoded before it is quoted.
    if running_python2 and isinstance(s, unicode):
        s = s.encode('utf-8')
    # Characters that quote() is told to leave unescaped.
    safe_chars = b"%/;:@&=+$,!~*'()?#[]"
    return compat_urllib_parse.quote(s, safe_chars)
1390 def escape_url(url):
1391 """Escape URL as suggested by RFC 3986"""
1392 url_parsed = compat_urllib_parse_urlparse(url)
1393 return url_parsed._replace(
1394 path=escape_rfc3986(url_parsed.path),
1395 params=escape_rfc3986(url_parsed.params),
1396 query=escape_rfc3986(url_parsed.query),
1397 fragment=escape_rfc3986(url_parsed.fragment)
1401 struct.pack('!I', 0)
1403 # In Python 2.6 (and some 2.7 versions), struct requires a bytes argument
1404 def struct_pack(spec, *args):
1405 if isinstance(spec, compat_str):
1406 spec = spec.encode('ascii')
1407 return struct.pack(spec, *args)
1409 def struct_unpack(spec, *args):
1410 if isinstance(spec, compat_str):
1411 spec = spec.encode('ascii')
1412 return struct.unpack(spec, *args)
1414 struct_pack = struct.pack
1415 struct_unpack = struct.unpack
1418 def read_batch_urls(batch_fd):
1420 if not isinstance(url, compat_str):
1421 url = url.decode('utf-8', 'replace')
1422 BOM_UTF8 = '\xef\xbb\xbf'
1423 if url.startswith(BOM_UTF8):
1424 url = url[len(BOM_UTF8):]
1426 if url.startswith(('#', ';', ']')):
1430 with contextlib.closing(batch_fd) as fd:
1431 return [url for url in map(fixup, fd) if url]
def urlencode_postdata(*args, **kargs):
    """URL-encode the given arguments and return the result as ASCII bytes.

    All arguments are forwarded unchanged to compat_urllib_parse.urlencode().
    """
    encoded = compat_urllib_parse.urlencode(*args, **kargs)
    return encoded.encode('ascii')
1439 etree_iter = xml.etree.ElementTree.Element.iter
1440 except AttributeError: # Python <=2.6
1441 etree_iter = lambda n: n.findall('.//*')
1445 class TreeBuilder(xml.etree.ElementTree.TreeBuilder):
1446 def doctype(self, name, pubid, system):
1447 pass # Ignore doctypes
1449 parser = xml.etree.ElementTree.XMLParser(target=TreeBuilder())
1450 kwargs = {'parser': parser} if sys.version_info >= (2, 7) else {}
1451 tree = xml.etree.ElementTree.XML(s.encode('utf-8'), **kwargs)
1452 # Fix up XML parser in Python 2.x
1453 if sys.version_info < (3, 0):
1454 for n in etree_iter(tree):
1455 if n.text is not None:
1456 if not isinstance(n.text, compat_str):
1457 n.text = n.text.decode('utf-8')
1470 def parse_age_limit(s):
1473 m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
1474 return int(m.group('age')) if m else US_RATINGS.get(s, None)
1477 def strip_jsonp(code):
1479 r'(?s)^[a-zA-Z0-9_]+\s*\(\s*(.*)\);?\s*?(?://[^\n]*)*$', r'\1', code)
1482 def js_to_json(code):
1485 if v in ('true', 'false', 'null'):
1487 if v.startswith('"'):
1489 if v.startswith("'"):
1491 v = re.sub(r"\\\\|\\'|\"", lambda m: {
1498 res = re.sub(r'''(?x)
1499 "(?:[^"\\]*(?:\\\\|\\")?)*"|
1500 '(?:[^'\\]*(?:\\\\|\\')?)*'|
1501 [a-zA-Z_][a-zA-Z_0-9]*
1503 res = re.sub(r',(\s*\])', lambda m: m.group(1), res)
1507 def qualities(quality_ids):
1508 """ Get a numeric quality value out of a list of possible values """
1511 return quality_ids.index(qid)
1517 DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
1520 def limit_length(s, length):
1521 """ Add ellipses to overly long strings """
1526 return s[:length - len(ELLIPSES)] + ELLIPSES
def version_tuple(v):
    """Split a version string on '-' and '.' and return its parts as a tuple of ints."""
    components = re.split(r'[-.]', v)
    return tuple(map(int, components))
1534 def is_outdated_version(version, limit, assume_new=True):
1536 return not assume_new
1538 return version_tuple(version) < version_tuple(limit)
1540 return not assume_new
def ytdl_is_updateable():
    """ Returns if youtube-dl can be updated with -U """
    from zipimport import zipimporter

    # True when this module was loaded through a zipimporter ...
    module_loader = globals().get('__loader__')
    if isinstance(module_loader, zipimporter):
        return True
    # ... or when sys carries a 'frozen' attribute.
    return hasattr(sys, 'frozen')
def args_to_str(args):
    """Return the arguments quoted with shlex_quote() and joined by single spaces."""
    # Get a short string representation for a subprocess command
    quoted_args = [shlex_quote(arg) for arg in args]
    return ' '.join(quoted_args)
1555 def urlhandle_detect_ext(url_handle):
1558 getheader = lambda h: url_handle.headers[h]
1559 except AttributeError: # Python < 3
1560 getheader = url_handle.info().getheader
1562 return getheader('Content-Type').split("/")[1]