2 # -*- coding: utf-8 -*-
4 from __future__ import unicode_literals
30 import xml.etree.ElementTree
41 compat_urllib_parse_urlparse,
42 compat_urllib_request,
48 # This is not clearly defined otherwise
49 compiled_regex_type = type(re.compile(''))
52 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0 (Chrome)',
53 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
54 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
55 'Accept-Encoding': 'gzip, deflate',
56 'Accept-Language': 'en-us,en;q=0.5',
60 def preferredencoding():
61 """Get preferred encoding.
63 Returns the best encoding scheme for the system, based on
64 locale.getpreferredencoding() and some further tweaks.
67 pref = locale.getpreferredencoding()
75 def write_json_file(obj, fn):
76 """ Encode obj as JSON and write it to fn, atomically if possible """
78 fn = encodeFilename(fn)
79 if sys.version_info < (3, 0) and sys.platform != 'win32':
80 encoding = get_filesystem_encoding()
81 # os.path.basename returns a bytes object, but NamedTemporaryFile
82 # will fail if the filename contains non ascii characters unless we
83 # use a unicode object
84 path_basename = lambda f: os.path.basename(fn).decode(encoding)
85 # the same for os.path.dirname
86 path_dirname = lambda f: os.path.dirname(fn).decode(encoding)
88 path_basename = os.path.basename
89 path_dirname = os.path.dirname
93 'prefix': path_basename(fn) + '.',
94 'dir': path_dirname(fn),
98 # In Python 2.x, json.dump expects a bytestream.
99 # In Python 3.x, it writes to a character stream
100 if sys.version_info < (3, 0):
108 tf = tempfile.NamedTemporaryFile(**args)
113 if sys.platform == 'win32':
114 # Need to remove existing file on Windows, else os.rename raises
115 # WindowsError or FileExistsError.
120 os.rename(tf.name, fn)
129 if sys.version_info >= (2, 7):
130 def find_xpath_attr(node, xpath, key, val):
131 """ Find the xpath xpath[@key=val] """
132 assert re.match(r'^[a-zA-Z-]+$', key)
133 assert re.match(r'^[a-zA-Z0-9@\s:._-]*$', val)
134 expr = xpath + "[@%s='%s']" % (key, val)
135 return node.find(expr)
137 def find_xpath_attr(node, xpath, key, val):
138 # Here comes the crazy part: In 2.6, if the xpath is a unicode,
139 # .//node does not match if a node is a direct child of . !
140 if isinstance(xpath, unicode):
141 xpath = xpath.encode('ascii')
143 for f in node.findall(xpath):
144 if f.attrib.get(key) == val:
148 # On python2.6 the xml.etree.ElementTree.Element methods don't support
149 # the namespace parameter
152 def xpath_with_ns(path, ns_map):
153 components = [c.split(':') for c in path.split('/')]
157 replaced.append(c[0])
160 replaced.append('{%s}%s' % (ns_map[ns], tag))
161 return '/'.join(replaced)
164 def xpath_text(node, xpath, name=None, fatal=False):
165 if sys.version_info < (2, 7): # Crazy 2.6
166 xpath = xpath.encode('ascii')
169 if n is None or n.text is None:
171 name = xpath if name is None else name
172 raise ExtractorError('Could not find XML element %s' % name)
def get_element_by_id(id, html):
    """Return the content of the tag whose id attribute equals *id*.

    Convenience wrapper: the lookup itself is delegated to
    get_element_by_attribute with the attribute name fixed to "id".
    """
    attribute_name = "id"
    return get_element_by_attribute(attribute_name, id, html)
183 def get_element_by_attribute(attribute, value, html):
184 """Return the content of the tag with the specified attribute in the passed HTML document"""
186 m = re.search(r'''(?xs)
188 (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
190 (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
194 ''' % (re.escape(attribute), re.escape(value)), html)
198 res = m.group('content')
200 if res.startswith('"') or res.startswith("'"):
203 return unescapeHTML(res)
206 def clean_html(html):
207 """Clean an HTML snippet into a readable string"""
209 html = html.replace('\n', ' ')
210 html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
211 html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
213 html = re.sub('<.*?>', '', html)
214 # Replace html entities
215 html = unescapeHTML(html)
219 def sanitize_open(filename, open_mode):
220 """Try to open the given filename, and slightly tweak it if this fails.
222 Attempts to open the given filename. If this fails, it tries to change
223 the filename slightly, step by step, until it's either able to open it
224 or it fails and raises a final exception, like the standard open()
227 It returns the tuple (stream, definitive_file_name).
231 if sys.platform == 'win32':
233 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
234 return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
235 stream = open(encodeFilename(filename), open_mode)
236 return (stream, filename)
237 except (IOError, OSError) as err:
238 if err.errno in (errno.EACCES,):
241 # In case of error, try to remove win32 forbidden chars
242 alt_filename = os.path.join(
243 re.sub('[/<>:"\\|\\\\?\\*]', '#', path_part)
244 for path_part in os.path.split(filename)
246 if alt_filename == filename:
249 # An exception here should be caught in the caller
250 stream = open(encodeFilename(filename), open_mode)
251 return (stream, alt_filename)
254 def timeconvert(timestr):
255 """Convert RFC 2822 defined time string into system timestamp"""
257 timetuple = email.utils.parsedate_tz(timestr)
258 if timetuple is not None:
259 timestamp = email.utils.mktime_tz(timetuple)
263 def sanitize_filename(s, restricted=False, is_id=False):
264 """Sanitizes a string so it could be used as part of a filename.
265 If restricted is set, use a stricter subset of allowed characters.
266 Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
268 def replace_insane(char):
269 if char == '?' or ord(char) < 32 or ord(char) == 127:
272 return '' if restricted else '\''
274 return '_-' if restricted else ' -'
275 elif char in '\\/|*<>':
277 if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
279 if restricted and ord(char) > 127:
283 result = ''.join(map(replace_insane, s))
285 while '__' in result:
286 result = result.replace('__', '_')
287 result = result.strip('_')
288 # Common case of "Foreign band name - English song title"
289 if restricted and result.startswith('-_'):
296 def orderedSet(iterable):
297 """ Remove all duplicates from the input iterable """
305 def _htmlentity_transform(entity):
306 """Transforms an HTML entity to a character."""
307 # Known non-numeric HTML entity
308 if entity in compat_html_entities.name2codepoint:
309 return compat_chr(compat_html_entities.name2codepoint[entity])
311 mobj = re.match(r'#(x?[0-9]+)', entity)
313 numstr = mobj.group(1)
314 if numstr.startswith('x'):
316 numstr = '0%s' % numstr
319 return compat_chr(int(numstr, base))
321 # Unknown entity in name, return its literal representation
322 return ('&%s;' % entity)
328 assert type(s) == compat_str
331 r'&([^;]+);', lambda m: _htmlentity_transform(m.group(1)), s)
334 def encodeFilename(s, for_subprocess=False):
336 @param s The name of the file
339 assert type(s) == compat_str
341 # Python 3 has a Unicode API
342 if sys.version_info >= (3, 0):
345 if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
346 # Pass '' directly to use Unicode APIs on Windows 2000 and up
347 # (Detecting Windows NT 4 is tricky because 'major >= 4' would
348 # match Windows 9x series as well. Besides, NT 4 is obsolete.)
349 if not for_subprocess:
352 # For subprocess calls, encode with locale encoding
353 # Refer to http://stackoverflow.com/a/9951851/35070
354 encoding = preferredencoding()
356 encoding = sys.getfilesystemencoding()
359 return s.encode(encoding, 'ignore')
def encodeArgument(s):
    """Encode a command-line argument for a subprocess call.

    Byte strings are still accepted for legacy callers and are decoded as
    ASCII first; the resulting text is handed to encodeFilename with
    for_subprocess enabled.
    """
    # Legacy code still passes byte strings here.  Once all post processors
    # are fixed, this should become a hard failure instead:
    # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
    text = s if isinstance(s, compat_str) else s.decode('ascii')
    return encodeFilename(text, True)
371 def decodeOption(optval):
374 if isinstance(optval, bytes):
375 optval = optval.decode(preferredencoding())
377 assert isinstance(optval, compat_str)
381 def formatSeconds(secs):
383 return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
385 return '%d:%02d' % (secs // 60, secs % 60)
390 def make_HTTPS_handler(opts_no_check_certificate, **kwargs):
391 if sys.version_info < (3, 2):
394 class HTTPSConnectionV3(httplib.HTTPSConnection):
395 def __init__(self, *args, **kwargs):
396 httplib.HTTPSConnection.__init__(self, *args, **kwargs)
399 sock = socket.create_connection((self.host, self.port), self.timeout)
400 if getattr(self, '_tunnel_host', False):
404 self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_TLSv1)
406 self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_SSLv23)
408 class HTTPSHandlerV3(compat_urllib_request.HTTPSHandler):
409 def https_open(self, req):
410 return self.do_open(HTTPSConnectionV3, req)
411 return HTTPSHandlerV3(**kwargs)
412 elif hasattr(ssl, 'create_default_context'): # Python >= 3.4
413 context = ssl.create_default_context(ssl.Purpose.CLIENT_AUTH)
414 context.options &= ~ssl.OP_NO_SSLv3 # Allow older, not-as-secure SSLv3
415 if opts_no_check_certificate:
416 context.verify_mode = ssl.CERT_NONE
417 return compat_urllib_request.HTTPSHandler(context=context, **kwargs)
419 context = ssl.SSLContext(ssl.PROTOCOL_SSLv23)
420 context.verify_mode = (ssl.CERT_NONE
421 if opts_no_check_certificate
422 else ssl.CERT_REQUIRED)
423 context.set_default_verify_paths()
425 context.load_default_certs()
426 except AttributeError:
428 return compat_urllib_request.HTTPSHandler(context=context, **kwargs)
431 class ExtractorError(Exception):
432 """Error during info extraction."""
434 def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
435 """ tb, if given, is the original traceback (so that it can be printed out).
436 If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
439 if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
441 if video_id is not None:
442 msg = video_id + ': ' + msg
444 msg += ' (caused by %r)' % cause
446 if ytdl_is_updateable():
447 update_cmd = 'type youtube-dl -U to update'
449 update_cmd = 'see https://yt-dl.org/update on how to update'
450 msg += '; please report this issue on https://yt-dl.org/bug .'
451 msg += ' Make sure you are using the latest version; %s.' % update_cmd
452 msg += ' Be sure to call youtube-dl with the --verbose flag and include its complete output.'
453 super(ExtractorError, self).__init__(msg)
456 self.exc_info = sys.exc_info() # preserve original exception
458 self.video_id = video_id
460 def format_traceback(self):
461 if self.traceback is None:
463 return ''.join(traceback.format_tb(self.traceback))
466 class RegexNotFoundError(ExtractorError):
467 """Error when a regex didn't match"""
471 class DownloadError(Exception):
472 """Download Error exception.
474 This exception may be thrown by FileDownloader objects if they are not
475 configured to continue on errors. They will contain the appropriate
479 def __init__(self, msg, exc_info=None):
480 """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
481 super(DownloadError, self).__init__(msg)
482 self.exc_info = exc_info
485 class SameFileError(Exception):
486 """Same File exception.
488 This exception will be thrown by FileDownloader objects if they detect
489 multiple files would have to be downloaded to the same file on disk.
494 class PostProcessingError(Exception):
495 """Post Processing exception.
497 This exception may be raised by PostProcessor's .run() method to
498 indicate an error in the postprocessing task.
501 def __init__(self, msg):
505 class MaxDownloadsReached(Exception):
506 """ --max-downloads limit has been reached. """
510 class UnavailableVideoError(Exception):
511 """Unavailable Format exception.
513 This exception will be thrown when a video is requested
514 in a format that is not available for that video.
519 class ContentTooShortError(Exception):
520 """Content Too Short exception.
522 This exception may be raised by FileDownloader objects when a file they
523 download is too small for what the server announced first, indicating
524 the connection was probably interrupted.
530 def __init__(self, downloaded, expected):
531 self.downloaded = downloaded
532 self.expected = expected
535 class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
536 """Handler for HTTP requests and responses.
538 This class, when installed with an OpenerDirector, automatically adds
539 the standard headers to every HTTP request and handles gzipped and
540 deflated responses from web servers. If compression is to be avoided in
541 a particular request, the original request in the program code only has
542 to include the HTTP header "Youtubedl-No-Compression", which will be
543 removed before making the real request.
545 Part of this code was copied from:
547 http://techknack.net/python-urllib2-handlers/
549 Andrew Rowls, the author of that code, agreed to release it to the
556 return zlib.decompress(data, -zlib.MAX_WBITS)
558 return zlib.decompress(data)
561 def addinfourl_wrapper(stream, headers, url, code):
562 if hasattr(compat_urllib_request.addinfourl, 'getcode'):
563 return compat_urllib_request.addinfourl(stream, headers, url, code)
564 ret = compat_urllib_request.addinfourl(stream, headers, url)
568 def http_request(self, req):
569 for h, v in std_headers.items():
570 if h not in req.headers:
572 if 'Youtubedl-no-compression' in req.headers:
573 if 'Accept-encoding' in req.headers:
574 del req.headers['Accept-encoding']
575 del req.headers['Youtubedl-no-compression']
576 if 'Youtubedl-user-agent' in req.headers:
577 if 'User-agent' in req.headers:
578 del req.headers['User-agent']
579 req.headers['User-agent'] = req.headers['Youtubedl-user-agent']
580 del req.headers['Youtubedl-user-agent']
582 if sys.version_info < (2, 7) and '#' in req.get_full_url():
583 # Python 2.6 is brain-dead when it comes to fragments
584 req._Request__original = req._Request__original.partition('#')[0]
585 req._Request__r_type = req._Request__r_type.partition('#')[0]
589 def http_response(self, req, resp):
592 if resp.headers.get('Content-encoding', '') == 'gzip':
593 content = resp.read()
594 gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
596 uncompressed = io.BytesIO(gz.read())
597 except IOError as original_ioerror:
598 # There may be junk at the end of the file
599 # See http://stackoverflow.com/q/4928560/35070 for details
600 for i in range(1, 1024):
602 gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
603 uncompressed = io.BytesIO(gz.read())
608 raise original_ioerror
609 resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
610 resp.msg = old_resp.msg
612 if resp.headers.get('Content-encoding', '') == 'deflate':
613 gz = io.BytesIO(self.deflate(resp.read()))
614 resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
615 resp.msg = old_resp.msg
618 https_request = http_request
619 https_response = http_response
622 def parse_iso8601(date_str, delimiter='T'):
623 """ Return a UNIX timestamp from the given date """
629 r'(\.[0-9]+)?(?:Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
632 timezone = datetime.timedelta()
634 date_str = date_str[:-len(m.group(0))]
635 if not m.group('sign'):
636 timezone = datetime.timedelta()
638 sign = 1 if m.group('sign') == '+' else -1
639 timezone = datetime.timedelta(
640 hours=sign * int(m.group('hours')),
641 minutes=sign * int(m.group('minutes')))
642 date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
643 dt = datetime.datetime.strptime(date_str, date_format) - timezone
644 return calendar.timegm(dt.timetuple())
647 def unified_strdate(date_str, day_first=True):
648 """Return a string with the date in the format YYYYMMDD"""
654 date_str = date_str.replace(',', ' ')
655 # %z (UTC offset) is only supported in python>=3.2
656 date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str)
657 # Remove AM/PM + timezone
658 date_str = re.sub(r'(?i)\s*(?:AM|PM)\s+[A-Z]+', '', date_str)
660 format_expressions = [
665 '%b %dst %Y %I:%M%p',
666 '%b %dnd %Y %I:%M%p',
667 '%b %dth %Y %I:%M%p',
675 '%Y-%m-%d %H:%M:%S.%f',
678 '%Y-%m-%dT%H:%M:%SZ',
679 '%Y-%m-%dT%H:%M:%S.%fZ',
680 '%Y-%m-%dT%H:%M:%S.%f0Z',
682 '%Y-%m-%dT%H:%M:%S.%f',
686 format_expressions.extend([
690 format_expressions.extend([
693 for expression in format_expressions:
695 upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
698 if upload_date is None:
699 timetuple = email.utils.parsedate_tz(date_str)
701 upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
705 def determine_ext(url, default_ext='unknown_video'):
708 guess = url.partition('?')[0].rpartition('.')[2]
709 if re.match(r'^[A-Za-z0-9]+$', guess):
def subtitles_filename(filename, sub_lang, sub_format):
    """Return the subtitle file name that belongs to a media file name.

    The extension of *filename* (if any) is replaced by
    "<sub_lang>.<sub_format>", e.g. "video.mp4" -> "video.en.srt".

    The extension is located with os.path.splitext, which only looks at the
    last path component.  A plain rsplit('.') would also split on a dot in a
    directory name when the file itself has no extension
    ("my.dir/video" would become "my.en.srt").
    """
    stem = os.path.splitext(filename)[0]
    return stem + '.' + sub_lang + '.' + sub_format
719 def date_from_str(date_str):
721 Return a datetime object from a string in the format YYYYMMDD or
722 (now|today)[+-][0-9](day|week|month|year)(s)?"""
723 today = datetime.date.today()
724 if date_str in ('now', 'today'):
726 if date_str == 'yesterday':
727 return today - datetime.timedelta(days=1)
728 match = re.match('(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
729 if match is not None:
730 sign = match.group('sign')
731 time = int(match.group('time'))
734 unit = match.group('unit')
735 # A bad approximation?
743 delta = datetime.timedelta(**{unit: time})
745 return datetime.datetime.strptime(date_str, "%Y%m%d").date()
748 def hyphenate_date(date_str):
750 Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
751 match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
752 if match is not None:
753 return '-'.join(match.groups())
758 class DateRange(object):
759 """Represents a time interval between two dates"""
761 def __init__(self, start=None, end=None):
762 """start and end must be strings in the format accepted by date"""
763 if start is not None:
764 self.start = date_from_str(start)
766 self.start = datetime.datetime.min.date()
768 self.end = date_from_str(end)
770 self.end = datetime.datetime.max.date()
771 if self.start > self.end:
772 raise ValueError('Date range: "%s" , the start date must be before the end date' % self)
776 """Returns a range that only contains the given day"""
779 def __contains__(self, date):
780 """Check if the date is in the range"""
781 if not isinstance(date, datetime.date):
782 date = date_from_str(date)
783 return self.start <= date <= self.end
786 return '%s - %s' % (self.start.isoformat(), self.end.isoformat())
790 """ Returns the platform name as a compat_str """
791 res = platform.platform()
792 if isinstance(res, bytes):
793 res = res.decode(preferredencoding())
795 assert isinstance(res, compat_str)
799 def _windows_write_string(s, out):
800 """ Returns True if the string was written using special methods,
801 False if it has yet to be written out."""
802 # Adapted from http://stackoverflow.com/a/3259271/35070
805 import ctypes.wintypes
813 fileno = out.fileno()
814 except AttributeError:
815 # If the output stream doesn't have a fileno, it's virtual
817 if fileno not in WIN_OUTPUT_IDS:
820 GetStdHandle = ctypes.WINFUNCTYPE(
821 ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
822 ("GetStdHandle", ctypes.windll.kernel32))
823 h = GetStdHandle(WIN_OUTPUT_IDS[fileno])
825 WriteConsoleW = ctypes.WINFUNCTYPE(
826 ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
827 ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
828 ctypes.wintypes.LPVOID)(("WriteConsoleW", ctypes.windll.kernel32))
829 written = ctypes.wintypes.DWORD(0)
831 GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)(("GetFileType", ctypes.windll.kernel32))
832 FILE_TYPE_CHAR = 0x0002
833 FILE_TYPE_REMOTE = 0x8000
834 GetConsoleMode = ctypes.WINFUNCTYPE(
835 ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
836 ctypes.POINTER(ctypes.wintypes.DWORD))(
837 ("GetConsoleMode", ctypes.windll.kernel32))
838 INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value
840 def not_a_console(handle):
841 if handle == INVALID_HANDLE_VALUE or handle is None:
843 return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR
844 or GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)
849 def next_nonbmp_pos(s):
851 return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
852 except StopIteration:
856 count = min(next_nonbmp_pos(s), 1024)
859 h, s, count if count else 2, ctypes.byref(written), None)
861 raise OSError('Failed to write string')
862 if not count: # We just wrote a non-BMP character
863 assert written.value == 2
866 assert written.value > 0
867 s = s[written.value:]
871 def write_string(s, out=None, encoding=None):
874 assert type(s) == compat_str
876 if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
877 if _windows_write_string(s, out):
880 if ('b' in getattr(out, 'mode', '') or
881 sys.version_info[0] < 3): # Python 2 lies about mode of sys.stderr
882 byt = s.encode(encoding or preferredencoding(), 'ignore')
884 elif hasattr(out, 'buffer'):
885 enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
886 byt = s.encode(enc, 'ignore')
887 out.buffer.write(byt)
893 def bytes_to_intlist(bs):
896 if isinstance(bs[0], int): # Python 3
899 return [ord(c) for c in bs]
902 def intlist_to_bytes(xs):
905 return struct_pack('%dB' % len(xs), *xs)
908 # Cross-platform file locking
909 if sys.platform == 'win32':
910 import ctypes.wintypes
913 class OVERLAPPED(ctypes.Structure):
915 ('Internal', ctypes.wintypes.LPVOID),
916 ('InternalHigh', ctypes.wintypes.LPVOID),
917 ('Offset', ctypes.wintypes.DWORD),
918 ('OffsetHigh', ctypes.wintypes.DWORD),
919 ('hEvent', ctypes.wintypes.HANDLE),
922 kernel32 = ctypes.windll.kernel32
923 LockFileEx = kernel32.LockFileEx
924 LockFileEx.argtypes = [
925 ctypes.wintypes.HANDLE, # hFile
926 ctypes.wintypes.DWORD, # dwFlags
927 ctypes.wintypes.DWORD, # dwReserved
928 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
929 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
930 ctypes.POINTER(OVERLAPPED) # Overlapped
932 LockFileEx.restype = ctypes.wintypes.BOOL
933 UnlockFileEx = kernel32.UnlockFileEx
934 UnlockFileEx.argtypes = [
935 ctypes.wintypes.HANDLE, # hFile
936 ctypes.wintypes.DWORD, # dwReserved
937 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
938 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
939 ctypes.POINTER(OVERLAPPED) # Overlapped
941 UnlockFileEx.restype = ctypes.wintypes.BOOL
942 whole_low = 0xffffffff
943 whole_high = 0x7fffffff
945 def _lock_file(f, exclusive):
946 overlapped = OVERLAPPED()
947 overlapped.Offset = 0
948 overlapped.OffsetHigh = 0
949 overlapped.hEvent = 0
950 f._lock_file_overlapped_p = ctypes.pointer(overlapped)
951 handle = msvcrt.get_osfhandle(f.fileno())
952 if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
953 whole_low, whole_high, f._lock_file_overlapped_p):
954 raise OSError('Locking file failed: %r' % ctypes.FormatError())
957 assert f._lock_file_overlapped_p
958 handle = msvcrt.get_osfhandle(f.fileno())
959 if not UnlockFileEx(handle, 0,
960 whole_low, whole_high, f._lock_file_overlapped_p):
961 raise OSError('Unlocking file failed: %r' % ctypes.FormatError())
966 def _lock_file(f, exclusive):
967 fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)
970 fcntl.flock(f, fcntl.LOCK_UN)
973 class locked_file(object):
974 def __init__(self, filename, mode, encoding=None):
975 assert mode in ['r', 'a', 'w']
976 self.f = io.open(filename, mode, encoding=encoding)
980 exclusive = self.mode != 'r'
982 _lock_file(self.f, exclusive)
988 def __exit__(self, etype, value, traceback):
997 def write(self, *args):
998 return self.f.write(*args)
1000 def read(self, *args):
1001 return self.f.read(*args)
def get_filesystem_encoding():
    """Return the file system encoding, falling back to 'utf-8'.

    The value comes from sys.getfilesystemencoding(); if that reports None,
    'utf-8' is returned instead.
    """
    fs_encoding = sys.getfilesystemencoding()
    if fs_encoding is None:
        return 'utf-8'
    return fs_encoding
1009 def shell_quote(args):
1011 encoding = get_filesystem_encoding()
1013 if isinstance(a, bytes):
1014 # We may get a filename encoded with 'encodeFilename'
1015 a = a.decode(encoding)
1016 quoted_args.append(pipes.quote(a))
1017 return ' '.join(quoted_args)
1020 def takewhile_inclusive(pred, seq):
1021 """ Like itertools.takewhile, but include the latest evaluated element
1022 (the first element so that Not pred(e)) """
def smuggle_url(url, data):
    """Pass additional data in a URL for internal use.

    *data* is JSON-encoded, stored under the '__youtubedl_smuggle' key of a
    URL-encoded query string, and appended to *url* after a '#'.
    """
    payload = {'__youtubedl_smuggle': json.dumps(data)}
    encoded_payload = compat_urllib_parse.urlencode(payload)
    return url + '#' + encoded_payload
1037 def unsmuggle_url(smug_url, default=None):
1038 if '#__youtubedl_smuggle' not in smug_url:
1039 return smug_url, default
1040 url, _, sdata = smug_url.rpartition('#')
1041 jsond = compat_parse_qs(sdata)['__youtubedl_smuggle'][0]
1042 data = json.loads(jsond)
1046 def format_bytes(bytes):
1049 if type(bytes) is str:
1050 bytes = float(bytes)
1054 exponent = int(math.log(bytes, 1024.0))
1055 suffix = ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB'][exponent]
1056 converted = float(bytes) / float(1024 ** exponent)
1057 return '%.2f%s' % (converted, suffix)
1060 def parse_filesize(s):
1064 # The lower-case forms are of course incorrect and unofficial,
1065 # but we support those too
1103 units_re = '|'.join(re.escape(u) for u in _UNIT_TABLE)
1105 r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)' % units_re, s)
1109 num_str = m.group('num').replace(',', '.')
1110 mult = _UNIT_TABLE[m.group('unit')]
1111 return int(float(num_str) * mult)
1114 def get_term_width():
1115 columns = compat_getenv('COLUMNS', None)
1120 sp = subprocess.Popen(
1122 stdout=subprocess.PIPE, stderr=subprocess.PIPE)
1123 out, err = sp.communicate()
1124 return int(out.split()[1])
1130 def month_by_name(name):
1131 """ Return the number of a month by (locale-independently) English name """
1134 'January', 'February', 'March', 'April', 'May', 'June',
1135 'July', 'August', 'September', 'October', 'November', 'December']
1137 return ENGLISH_NAMES.index(name) + 1
1142 def fix_xml_ampersands(xml_str):
1143 """Replace all the '&' by '&amp;' in XML"""
1145 r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
1150 def setproctitle(title):
1151 assert isinstance(title, compat_str)
1153 libc = ctypes.cdll.LoadLibrary("libc.so.6")
1156 title_bytes = title.encode('utf-8')
1157 buf = ctypes.create_string_buffer(len(title_bytes))
1158 buf.value = title_bytes
1160 libc.prctl(15, buf, 0, 0, 0)
1161 except AttributeError:
1162 return # Strange libc, just skip this
1165 def remove_start(s, start):
1166 if s.startswith(start):
1167 return s[len(start):]
1171 def remove_end(s, end):
1173 return s[:-len(end)]
def url_basename(url):
    """Return the last segment of the path component of *url*.

    Leading and trailing slashes of the path are stripped before it is
    split, so a trailing '/' does not produce an empty result.
    """
    parsed = compat_urlparse.urlparse(url)
    segments = parsed.path.strip('/').split('/')
    return segments[-1]
1182 class HEADRequest(compat_urllib_request.Request):
1183 def get_method(self):
1187 def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
1190 v = getattr(v, get_attr, None)
1193 return default if v is None else (int(v) * invscale // scale)
def str_or_none(v, default=None):
    """Convert *v* with compat_str, or return *default* when *v* is None."""
    if v is None:
        return default
    return compat_str(v)
1200 def str_to_int(int_str):
1201 """ A more relaxed version of int_or_none """
1204 int_str = re.sub(r'[,\.\+]', '', int_str)
def float_or_none(v, scale=1, invscale=1, default=None):
    """Convert *v* to a float scaled by invscale / scale.

    Returns *default* when *v* is None or cannot be converted to a float
    (for example a malformed numeric string such as '1.2.3', or a
    non-numeric object).  Previously such values raised ValueError or
    TypeError, which contradicts the "or none" contract of this helper.
    """
    if v is None:
        return default
    try:
        return float(v) * invscale / scale
    except (TypeError, ValueError):
        # Unparseable input is treated the same as missing input.
        return default
1212 def parse_duration(s):
1221 (?P<only_mins>[0-9.]+)\s*(?:mins?|minutes?)\s*|
1222 (?P<only_hours>[0-9.]+)\s*(?:hours?)|
1225 (?:(?P<hours>[0-9]+)\s*(?:[:h]|hours?)\s*)?
1226 (?P<mins>[0-9]+)\s*(?:[:m]|mins?|minutes?)\s*
1228 (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*(?:s|secs?|seconds?)?
1233 if m.group('only_mins'):
1234 return float_or_none(m.group('only_mins'), invscale=60)
1235 if m.group('only_hours'):
1236 return float_or_none(m.group('only_hours'), invscale=60 * 60)
1238 res += int(m.group('secs'))
1240 res += int(m.group('mins')) * 60
1241 if m.group('hours'):
1242 res += int(m.group('hours')) * 60 * 60
1244 res += float(m.group('ms'))
def prepend_extension(filename, ext):
    """Insert *ext* in front of the existing extension of *filename*.

    Example: prepend_extension('abc.ext', 'temp') -> 'abc.temp.ext'.  When
    *filename* has no extension, '.<ext>' is simply appended.
    """
    parts = os.path.splitext(filename)
    return '{0}.{1}{2}'.format(parts[0], ext, parts[1])
1253 def check_executable(exe, args=[]):
1254 """ Checks if the given binary is installed somewhere in PATH, and returns its name.
1255 args can be a list of arguments for a short output (like -version) """
1257 subprocess.Popen([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
1263 def get_exe_version(exe, args=['--version'],
1264 version_re=r'version\s+([0-9._-a-zA-Z]+)',
1265 unrecognized='present'):
1266 """ Returns the version of the specified executable,
1267 or False if the executable is not present """
1269 out, err = subprocess.Popen(
1271 stdout=subprocess.PIPE, stderr=subprocess.STDOUT).communicate()
1274 firstline = out.partition(b'\n')[0].decode('ascii', 'ignore')
1275 m = re.search(version_re, firstline)
1282 class PagedList(object):
1284 # This is only useful for tests
1285 return len(self.getslice())
1288 class OnDemandPagedList(PagedList):
1289 def __init__(self, pagefunc, pagesize):
1290 self._pagefunc = pagefunc
1291 self._pagesize = pagesize
1293 def getslice(self, start=0, end=None):
1295 for pagenum in itertools.count(start // self._pagesize):
1296 firstid = pagenum * self._pagesize
1297 nextfirstid = pagenum * self._pagesize + self._pagesize
1298 if start >= nextfirstid:
1301 page_results = list(self._pagefunc(pagenum))
1304 start % self._pagesize
1305 if firstid <= start < nextfirstid
1309 ((end - 1) % self._pagesize) + 1
1310 if (end is not None and firstid <= end <= nextfirstid)
1313 if startv != 0 or endv is not None:
1314 page_results = page_results[startv:endv]
1315 res.extend(page_results)
1317 # A little optimization - if current page is not "full", ie. does
1318 # not contain page_size videos then we can assume that this page
1319 # is the last one - there are no more ids on further pages -
1320 # i.e. no need to query again.
1321 if len(page_results) + startv < self._pagesize:
1324 # If we got the whole page, but the next page is not interesting,
1325 # break out early as well
1326 if end == nextfirstid:
1331 class InAdvancePagedList(PagedList):
1332 def __init__(self, pagefunc, pagecount, pagesize):
1333 self._pagefunc = pagefunc
1334 self._pagecount = pagecount
1335 self._pagesize = pagesize
1337 def getslice(self, start=0, end=None):
1339 start_page = start // self._pagesize
1341 self._pagecount if end is None else (end // self._pagesize + 1))
1342 skip_elems = start - start_page * self._pagesize
1343 only_more = None if end is None else end - start
1344 for pagenum in range(start_page, end_page):
1345 page = list(self._pagefunc(pagenum))
1347 page = page[skip_elems:]
1349 if only_more is not None:
1350 if len(page) < only_more:
1351 only_more -= len(page)
1353 page = page[:only_more]
1360 def uppercase_escape(s):
1361 unicode_escape = codecs.getdecoder('unicode_escape')
1363 r'\\U[0-9a-fA-F]{8}',
1364 lambda m: unicode_escape(m.group(0))[0],
def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986.

    On Python 2 a unicode string is encoded to UTF-8 bytes before quoting.
    The characters in the safe set below are passed through unchanged.
    """
    running_py2 = sys.version_info < (3, 0)
    # The short-circuit keeps the Python 2-only name `unicode` from being
    # evaluated on Python 3.
    if running_py2 and isinstance(s, unicode):
        s = s.encode('utf-8')
    return compat_urllib_parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")
1375 def escape_url(url):
1376 """Escape URL as suggested by RFC 3986"""
1377 url_parsed = compat_urllib_parse_urlparse(url)
1378 return url_parsed._replace(
1379 path=escape_rfc3986(url_parsed.path),
1380 params=escape_rfc3986(url_parsed.params),
1381 query=escape_rfc3986(url_parsed.query),
1382 fragment=escape_rfc3986(url_parsed.fragment)
1386 struct.pack('!I', 0)
1388 # In Python 2.6 (and some 2.7 versions), struct requires a bytes argument
1389 def struct_pack(spec, *args):
1390 if isinstance(spec, compat_str):
1391 spec = spec.encode('ascii')
1392 return struct.pack(spec, *args)
1394 def struct_unpack(spec, *args):
1395 if isinstance(spec, compat_str):
1396 spec = spec.encode('ascii')
1397 return struct.unpack(spec, *args)
1399 struct_pack = struct.pack
1400 struct_unpack = struct.unpack
1403 def read_batch_urls(batch_fd):
1405 if not isinstance(url, compat_str):
1406 url = url.decode('utf-8', 'replace')
1407 BOM_UTF8 = '\xef\xbb\xbf'
1408 if url.startswith(BOM_UTF8):
1409 url = url[len(BOM_UTF8):]
1411 if url.startswith(('#', ';', ']')):
1415 with contextlib.closing(batch_fd) as fd:
1416 return [url for url in map(fixup, fd) if url]
def urlencode_postdata(*args, **kargs):
    """URL-encode the given data and return the result as ASCII bytes.

    All arguments are forwarded unchanged to compat_urllib_parse.urlencode.
    """
    encoded = compat_urllib_parse.urlencode(*args, **kargs)
    return encoded.encode('ascii')
1424 etree_iter = xml.etree.ElementTree.Element.iter
1425 except AttributeError: # Python <=2.6
1426 etree_iter = lambda n: n.findall('.//*')
1430 class TreeBuilder(xml.etree.ElementTree.TreeBuilder):
1431 def doctype(self, name, pubid, system):
1432 pass # Ignore doctypes
1434 parser = xml.etree.ElementTree.XMLParser(target=TreeBuilder())
1435 kwargs = {'parser': parser} if sys.version_info >= (2, 7) else {}
1436 tree = xml.etree.ElementTree.XML(s.encode('utf-8'), **kwargs)
1437 # Fix up XML parser in Python 2.x
1438 if sys.version_info < (3, 0):
1439 for n in etree_iter(tree):
1440 if n.text is not None:
1441 if not isinstance(n.text, compat_str):
1442 n.text = n.text.decode('utf-8')
1455 def parse_age_limit(s):
1458 m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
1459 return int(m.group('age')) if m else US_RATINGS.get(s, None)
1462 def strip_jsonp(code):
1464 r'(?s)^[a-zA-Z0-9_]+\s*\(\s*(.*)\);?\s*?(?://[^\n]*)*$', r'\1', code)
1467 def js_to_json(code):
1470 if v in ('true', 'false', 'null'):
1472 if v.startswith('"'):
1474 if v.startswith("'"):
1476 v = re.sub(r"\\\\|\\'|\"", lambda m: {
1483 res = re.sub(r'''(?x)
1484 "(?:[^"\\]*(?:\\\\|\\")?)*"|
1485 '(?:[^'\\]*(?:\\\\|\\')?)*'|
1486 [a-zA-Z_][a-zA-Z_0-9]*
1488 res = re.sub(r',(\s*\])', lambda m: m.group(1), res)
1492 def qualities(quality_ids):
1493 """ Get a numeric quality value out of a list of possible values """
1496 return quality_ids.index(qid)
1502 DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
1505 def limit_length(s, length):
1506 """ Add ellipses to overly long strings """
1511 return s[:length - len(ELLIPSES)] + ELLIPSES
def version_tuple(v):
    """Split version string *v* on '.' and '-' and return the parts as ints.

    Raises ValueError (from int()) if any part is not a decimal number.
    """
    parts = re.split(r'[-.]', v)
    return tuple(map(int, parts))
1519 def is_outdated_version(version, limit, assume_new=True):
1521 return not assume_new
1523 return version_tuple(version) < version_tuple(limit)
1525 return not assume_new
def ytdl_is_updateable():
    """Return whether youtube-dl can be updated with -U.

    True when this module's __loader__ is a zipimporter, or when sys has a
    'frozen' attribute.
    """
    from zipimport import zipimporter

    if isinstance(globals().get('__loader__'), zipimporter):
        return True
    return hasattr(sys, 'frozen')
def args_to_str(args):
    """Get a short string representation for a subprocess command.

    Each argument is quoted with shlex_quote and the results are joined
    with single spaces.
    """
    quoted = [shlex_quote(arg) for arg in args]
    return ' '.join(quoted)