2 # -*- coding: utf-8 -*-
4 from __future__ import unicode_literals
30 import xml.etree.ElementTree
41 compat_urllib_parse_urlparse,
42 compat_urllib_request,
47 # This is not clearly defined otherwise
48 compiled_regex_type = type(re.compile(''))
51 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0 (Chrome)',
52 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
53 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
54 'Accept-Encoding': 'gzip, deflate',
55 'Accept-Language': 'en-us,en;q=0.5',
58 def preferredencoding():
59 """Get preferred encoding.
61 Returns the best encoding scheme for the system, based on
62 locale.getpreferredencoding() and some further tweaks.
65 pref = locale.getpreferredencoding()
73 def write_json_file(obj, fn):
74 """ Encode obj as JSON and write it to fn, atomically """
76 if sys.version_info < (3, 0):
77 encoding = get_filesystem_encoding()
78 # os.path.basename returns a bytes object, but NamedTemporaryFile
79 # will fail if the filename contains non ascii characters unless we
80 # use a unicode object
81 path_basename = lambda f: os.path.basename(fn).decode(encoding)
82 # the same for os.path.dirname
83 path_dirname = lambda f: os.path.dirname(fn).decode(encoding)
85 path_basename = os.path.basename
86 path_dirname = os.path.dirname
90 'prefix': path_basename(fn) + '.',
91 'dir': path_dirname(fn),
95 # In Python 2.x, json.dump expects a bytestream.
96 # In Python 3.x, it writes to a character stream
97 if sys.version_info < (3, 0):
105 tf = tempfile.NamedTemporaryFile(**args)
110 os.rename(tf.name, fn)
119 if sys.version_info >= (2, 7):
120 def find_xpath_attr(node, xpath, key, val):
121 """ Find the xpath xpath[@key=val] """
122 assert re.match(r'^[a-zA-Z-]+$', key)
123 assert re.match(r'^[a-zA-Z0-9@\s:._-]*$', val)
124 expr = xpath + u"[@%s='%s']" % (key, val)
125 return node.find(expr)
127 def find_xpath_attr(node, xpath, key, val):
128 # Here comes the crazy part: In 2.6, if the xpath is a unicode,
129 # .//node does not match if a node is a direct child of . !
130 if isinstance(xpath, unicode):
131 xpath = xpath.encode('ascii')
133 for f in node.findall(xpath):
134 if f.attrib.get(key) == val:
138 # On python2.6 the xml.etree.ElementTree.Element methods don't support
139 # the namespace parameter
140 def xpath_with_ns(path, ns_map):
141 components = [c.split(':') for c in path.split('/')]
145 replaced.append(c[0])
148 replaced.append('{%s}%s' % (ns_map[ns], tag))
149 return '/'.join(replaced)
152 def xpath_text(node, xpath, name=None, fatal=False):
153 if sys.version_info < (2, 7): # Crazy 2.6
154 xpath = xpath.encode('ascii')
159 name = xpath if name is None else name
160 raise ExtractorError('Could not find XML element %s' % name)
def get_element_by_id(id, html):
    """Return the content of the tag with the specified ID in the passed HTML document.

    Thin wrapper: delegates to get_element_by_attribute with the attribute
    name fixed to "id" and returns its result unchanged.
    """
    attribute_name = "id"
    return get_element_by_attribute(attribute_name, id, html)
171 def get_element_by_attribute(attribute, value, html):
172 """Return the content of the tag with the specified attribute in the passed HTML document"""
174 m = re.search(r'''(?xs)
176 (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
178 (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
182 ''' % (re.escape(attribute), re.escape(value)), html)
186 res = m.group('content')
188 if res.startswith('"') or res.startswith("'"):
191 return unescapeHTML(res)
194 def clean_html(html):
195 """Clean an HTML snippet into a readable string"""
197 html = html.replace('\n', ' ')
198 html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
199 html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
201 html = re.sub('<.*?>', '', html)
202 # Replace html entities
203 html = unescapeHTML(html)
207 def sanitize_open(filename, open_mode):
208 """Try to open the given filename, and slightly tweak it if this fails.
210 Attempts to open the given filename. If this fails, it tries to change
211 the filename slightly, step by step, until it's either able to open it
212 or it fails and raises a final exception, like the standard open()
215 It returns the tuple (stream, definitive_file_name).
219 if sys.platform == 'win32':
221 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
222 return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
223 stream = open(encodeFilename(filename), open_mode)
224 return (stream, filename)
225 except (IOError, OSError) as err:
226 if err.errno in (errno.EACCES,):
229 # In case of error, try to remove win32 forbidden chars
230 alt_filename = os.path.join(
231 re.sub('[/<>:"\\|\\\\?\\*]', '#', path_part)
232 for path_part in os.path.split(filename)
234 if alt_filename == filename:
237 # An exception here should be caught in the caller
238 stream = open(encodeFilename(filename), open_mode)
239 return (stream, alt_filename)
242 def timeconvert(timestr):
243 """Convert RFC 2822 defined time string into system timestamp"""
245 timetuple = email.utils.parsedate_tz(timestr)
246 if timetuple is not None:
247 timestamp = email.utils.mktime_tz(timetuple)
250 def sanitize_filename(s, restricted=False, is_id=False):
251 """Sanitizes a string so it could be used as part of a filename.
252 If restricted is set, use a stricter subset of allowed characters.
253 Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
255 def replace_insane(char):
256 if char == '?' or ord(char) < 32 or ord(char) == 127:
259 return '' if restricted else '\''
261 return '_-' if restricted else ' -'
262 elif char in '\\/|*<>':
264 if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
266 if restricted and ord(char) > 127:
270 result = ''.join(map(replace_insane, s))
272 while '__' in result:
273 result = result.replace('__', '_')
274 result = result.strip('_')
275 # Common case of "Foreign band name - English song title"
276 if restricted and result.startswith('-_'):
282 def orderedSet(iterable):
283 """ Remove all duplicates from the input iterable """
291 def _htmlentity_transform(entity):
292 """Transforms an HTML entity to a character."""
293 # Known non-numeric HTML entity
294 if entity in compat_html_entities.name2codepoint:
295 return compat_chr(compat_html_entities.name2codepoint[entity])
297 mobj = re.match(r'#(x?[0-9]+)', entity)
299 numstr = mobj.group(1)
300 if numstr.startswith('x'):
302 numstr = '0%s' % numstr
305 return compat_chr(int(numstr, base))
307 # Unknown entity in name, return its literal representation
308 return ('&%s;' % entity)
314 assert type(s) == compat_str
317 r'&([^;]+);', lambda m: _htmlentity_transform(m.group(1)), s)
320 def encodeFilename(s, for_subprocess=False):
322 @param s The name of the file
325 assert type(s) == compat_str
327 # Python 3 has a Unicode API
328 if sys.version_info >= (3, 0):
331 if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
332 # Pass '' directly to use Unicode APIs on Windows 2000 and up
333 # (Detecting Windows NT 4 is tricky because 'major >= 4' would
334 # match Windows 9x series as well. Besides, NT 4 is obsolete.)
335 if not for_subprocess:
338 # For subprocess calls, encode with locale encoding
339 # Refer to http://stackoverflow.com/a/9951851/35070
340 encoding = preferredencoding()
342 encoding = sys.getfilesystemencoding()
345 return s.encode(encoding, 'ignore')
def encodeArgument(s):
    """Encode s via encodeFilename with for_subprocess enabled.

    Byte strings are still tolerated for legacy callers: they are decoded
    as ASCII first and then encoded like any other text argument.
    """
    if isinstance(s, compat_str):
        return encodeFilename(s, True)

    # Legacy code that uses byte strings
    # Uncomment the following line after fixing all post processors
    #assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
    decoded = s.decode('ascii')
    return encodeFilename(decoded, True)
357 def decodeOption(optval):
360 if isinstance(optval, bytes):
361 optval = optval.decode(preferredencoding())
363 assert isinstance(optval, compat_str)
366 def formatSeconds(secs):
368 return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
370 return '%d:%02d' % (secs // 60, secs % 60)
375 def make_HTTPS_handler(opts_no_check_certificate, **kwargs):
376 if sys.version_info < (3, 2):
379 class HTTPSConnectionV3(httplib.HTTPSConnection):
380 def __init__(self, *args, **kwargs):
381 httplib.HTTPSConnection.__init__(self, *args, **kwargs)
384 sock = socket.create_connection((self.host, self.port), self.timeout)
385 if getattr(self, '_tunnel_host', False):
389 self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_TLSv1)
391 self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_SSLv23)
393 class HTTPSHandlerV3(compat_urllib_request.HTTPSHandler):
394 def https_open(self, req):
395 return self.do_open(HTTPSConnectionV3, req)
396 return HTTPSHandlerV3(**kwargs)
397 elif hasattr(ssl, 'create_default_context'): # Python >= 3.4
398 context = ssl.create_default_context(ssl.Purpose.CLIENT_AUTH)
399 context.options &= ~ssl.OP_NO_SSLv3 # Allow older, not-as-secure SSLv3
400 if opts_no_check_certificate:
401 context.verify_mode = ssl.CERT_NONE
402 return compat_urllib_request.HTTPSHandler(context=context, **kwargs)
404 context = ssl.SSLContext(ssl.PROTOCOL_SSLv23)
405 context.verify_mode = (ssl.CERT_NONE
406 if opts_no_check_certificate
407 else ssl.CERT_REQUIRED)
408 context.set_default_verify_paths()
410 context.load_default_certs()
411 except AttributeError:
413 return compat_urllib_request.HTTPSHandler(context=context, **kwargs)
415 class ExtractorError(Exception):
416 """Error during info extraction."""
417 def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
418 """ tb, if given, is the original traceback (so that it can be printed out).
419 If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
422 if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
424 if video_id is not None:
425 msg = video_id + ': ' + msg
427 msg += ' (caused by %r)' % cause
429 msg = msg + '; please report this issue on https://yt-dl.org/bug . Be sure to call youtube-dl with the --verbose flag and include its complete output. Make sure you are using the latest version; type youtube-dl -U to update.'
430 super(ExtractorError, self).__init__(msg)
433 self.exc_info = sys.exc_info() # preserve original exception
435 self.video_id = video_id
437 def format_traceback(self):
438 if self.traceback is None:
440 return ''.join(traceback.format_tb(self.traceback))
443 class RegexNotFoundError(ExtractorError):
444 """Error when a regex didn't match"""
448 class DownloadError(Exception):
449 """Download Error exception.
451 This exception may be thrown by FileDownloader objects if they are not
452 configured to continue on errors. They will contain the appropriate
455 def __init__(self, msg, exc_info=None):
456 """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
457 super(DownloadError, self).__init__(msg)
458 self.exc_info = exc_info
461 class SameFileError(Exception):
462 """Same File exception.
464 This exception will be thrown by FileDownloader objects if they detect
465 multiple files would have to be downloaded to the same file on disk.
470 class PostProcessingError(Exception):
471 """Post Processing exception.
473 This exception may be raised by PostProcessor's .run() method to
474 indicate an error in the postprocessing task.
476 def __init__(self, msg):
479 class MaxDownloadsReached(Exception):
480 """ --max-downloads limit has been reached. """
484 class UnavailableVideoError(Exception):
485 """Unavailable Format exception.
487 This exception will be thrown when a video is requested
488 in a format that is not available for that video.
493 class ContentTooShortError(Exception):
494 """Content Too Short exception.
496 This exception may be raised by FileDownloader objects when a file they
497 download is too small for what the server announced first, indicating
498 the connection was probably interrupted.
504 def __init__(self, downloaded, expected):
505 self.downloaded = downloaded
506 self.expected = expected
508 class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
509 """Handler for HTTP requests and responses.
511 This class, when installed with an OpenerDirector, automatically adds
512 the standard headers to every HTTP request and handles gzipped and
513 deflated responses from web servers. If compression is to be avoided in
514 a particular request, the original request in the program code only has
515 to include the HTTP header "Youtubedl-No-Compression", which will be
516 removed before making the real request.
518 Part of this code was copied from:
520 http://techknack.net/python-urllib2-handlers/
522 Andrew Rowls, the author of that code, agreed to release it to the
529 return zlib.decompress(data, -zlib.MAX_WBITS)
531 return zlib.decompress(data)
534 def addinfourl_wrapper(stream, headers, url, code):
535 if hasattr(compat_urllib_request.addinfourl, 'getcode'):
536 return compat_urllib_request.addinfourl(stream, headers, url, code)
537 ret = compat_urllib_request.addinfourl(stream, headers, url)
541 def http_request(self, req):
542 for h, v in std_headers.items():
543 if h not in req.headers:
545 if 'Youtubedl-no-compression' in req.headers:
546 if 'Accept-encoding' in req.headers:
547 del req.headers['Accept-encoding']
548 del req.headers['Youtubedl-no-compression']
549 if 'Youtubedl-user-agent' in req.headers:
550 if 'User-agent' in req.headers:
551 del req.headers['User-agent']
552 req.headers['User-agent'] = req.headers['Youtubedl-user-agent']
553 del req.headers['Youtubedl-user-agent']
555 if sys.version_info < (2, 7) and '#' in req.get_full_url():
556 # Python 2.6 is brain-dead when it comes to fragments
557 req._Request__original = req._Request__original.partition('#')[0]
558 req._Request__r_type = req._Request__r_type.partition('#')[0]
562 def http_response(self, req, resp):
565 if resp.headers.get('Content-encoding', '') == 'gzip':
566 content = resp.read()
567 gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
569 uncompressed = io.BytesIO(gz.read())
570 except IOError as original_ioerror:
571 # There may be junk at the end of the file
572 # See http://stackoverflow.com/q/4928560/35070 for details
573 for i in range(1, 1024):
575 gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
576 uncompressed = io.BytesIO(gz.read())
581 raise original_ioerror
582 resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
583 resp.msg = old_resp.msg
585 if resp.headers.get('Content-encoding', '') == 'deflate':
586 gz = io.BytesIO(self.deflate(resp.read()))
587 resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
588 resp.msg = old_resp.msg
591 https_request = http_request
592 https_response = http_response
595 def parse_iso8601(date_str, delimiter='T'):
596 """ Return a UNIX timestamp from the given date """
602 r'(\.[0-9]+)?(?:Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
605 timezone = datetime.timedelta()
607 date_str = date_str[:-len(m.group(0))]
608 if not m.group('sign'):
609 timezone = datetime.timedelta()
611 sign = 1 if m.group('sign') == '+' else -1
612 timezone = datetime.timedelta(
613 hours=sign * int(m.group('hours')),
614 minutes=sign * int(m.group('minutes')))
615 date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
616 dt = datetime.datetime.strptime(date_str, date_format) - timezone
617 return calendar.timegm(dt.timetuple())
620 def unified_strdate(date_str):
621 """Return a string with the date in the format YYYYMMDD"""
628 date_str = date_str.replace(',', ' ')
629 # %z (UTC offset) is only supported in python>=3.2
630 date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str)
631 format_expressions = [
636 '%b %dst %Y %I:%M%p',
637 '%b %dnd %Y %I:%M%p',
638 '%b %dth %Y %I:%M%p',
647 '%Y-%m-%d %H:%M:%S.%f',
650 '%Y-%m-%dT%H:%M:%SZ',
651 '%Y-%m-%dT%H:%M:%S.%fZ',
652 '%Y-%m-%dT%H:%M:%S.%f0Z',
654 '%Y-%m-%dT%H:%M:%S.%f',
657 for expression in format_expressions:
659 upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
662 if upload_date is None:
663 timetuple = email.utils.parsedate_tz(date_str)
665 upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
668 def determine_ext(url, default_ext='unknown_video'):
671 guess = url.partition('?')[0].rpartition('.')[2]
672 if re.match(r'^[A-Za-z0-9]+$', guess):
def subtitles_filename(filename, sub_lang, sub_format):
    """Return the subtitle file name that belongs to filename.

    The extension of filename (if any) is replaced by
    "<sub_lang>.<sub_format>", e.g. "clip.mp4" -> "clip.en.srt".

    Only the extension of the last path component is stripped, so a dot in
    a directory name ("my.dir/clip") or the leading dot of a hidden file
    (".clip") is kept instead of truncating the path at that dot.
    """
    # os.path.splitext only splits on a dot inside the final path component
    # and never treats a leading dot as an extension separator.
    base_name = os.path.splitext(filename)[0]
    return base_name + '.' + sub_lang + '.' + sub_format
680 def date_from_str(date_str):
682 Return a datetime object from a string in the format YYYYMMDD or
683 (now|today)[+-][0-9](day|week|month|year)(s)?"""
684 today = datetime.date.today()
685 if date_str == 'now'or date_str == 'today':
687 match = re.match('(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
688 if match is not None:
689 sign = match.group('sign')
690 time = int(match.group('time'))
693 unit = match.group('unit')
702 delta = datetime.timedelta(**{unit: time})
704 return datetime.datetime.strptime(date_str, "%Y%m%d").date()
706 def hyphenate_date(date_str):
708 Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
709 match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
710 if match is not None:
711 return '-'.join(match.groups())
715 class DateRange(object):
716 """Represents a time interval between two dates"""
717 def __init__(self, start=None, end=None):
718 """start and end must be strings in the format accepted by date"""
719 if start is not None:
720 self.start = date_from_str(start)
722 self.start = datetime.datetime.min.date()
724 self.end = date_from_str(end)
726 self.end = datetime.datetime.max.date()
727 if self.start > self.end:
728 raise ValueError('Date range: "%s" , the start date must be before the end date' % self)
731 """Returns a range that only contains the given day"""
733 def __contains__(self, date):
734 """Check if the date is in the range"""
735 if not isinstance(date, datetime.date):
736 date = date_from_str(date)
737 return self.start <= date <= self.end
739 return '%s - %s' % ( self.start.isoformat(), self.end.isoformat())
743 """ Returns the platform name as a compat_str """
744 res = platform.platform()
745 if isinstance(res, bytes):
746 res = res.decode(preferredencoding())
748 assert isinstance(res, compat_str)
752 def _windows_write_string(s, out):
753 """ Returns True if the string was written using special methods,
754 False if it has yet to be written out."""
755 # Adapted from http://stackoverflow.com/a/3259271/35070
758 import ctypes.wintypes
766 fileno = out.fileno()
767 except AttributeError:
768 # If the output stream doesn't have a fileno, it's virtual
770 if fileno not in WIN_OUTPUT_IDS:
773 GetStdHandle = ctypes.WINFUNCTYPE(
774 ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
775 ("GetStdHandle", ctypes.windll.kernel32))
776 h = GetStdHandle(WIN_OUTPUT_IDS[fileno])
778 WriteConsoleW = ctypes.WINFUNCTYPE(
779 ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
780 ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
781 ctypes.wintypes.LPVOID)(("WriteConsoleW", ctypes.windll.kernel32))
782 written = ctypes.wintypes.DWORD(0)
784 GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)(("GetFileType", ctypes.windll.kernel32))
785 FILE_TYPE_CHAR = 0x0002
786 FILE_TYPE_REMOTE = 0x8000
787 GetConsoleMode = ctypes.WINFUNCTYPE(
788 ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
789 ctypes.POINTER(ctypes.wintypes.DWORD))(
790 ("GetConsoleMode", ctypes.windll.kernel32))
791 INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value
793 def not_a_console(handle):
794 if handle == INVALID_HANDLE_VALUE or handle is None:
796 return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR
797 or GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)
802 def next_nonbmp_pos(s):
804 return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
805 except StopIteration:
809 count = min(next_nonbmp_pos(s), 1024)
812 h, s, count if count else 2, ctypes.byref(written), None)
814 raise OSError('Failed to write string')
815 if not count: # We just wrote a non-BMP character
816 assert written.value == 2
819 assert written.value > 0
820 s = s[written.value:]
824 def write_string(s, out=None, encoding=None):
827 assert type(s) == compat_str
829 if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
830 if _windows_write_string(s, out):
833 if ('b' in getattr(out, 'mode', '') or
834 sys.version_info[0] < 3): # Python 2 lies about mode of sys.stderr
835 byt = s.encode(encoding or preferredencoding(), 'ignore')
837 elif hasattr(out, 'buffer'):
838 enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
839 byt = s.encode(enc, 'ignore')
840 out.buffer.write(byt)
846 def bytes_to_intlist(bs):
849 if isinstance(bs[0], int): # Python 3
852 return [ord(c) for c in bs]
855 def intlist_to_bytes(xs):
858 return struct_pack('%dB' % len(xs), *xs)
861 # Cross-platform file locking
862 if sys.platform == 'win32':
863 import ctypes.wintypes
866 class OVERLAPPED(ctypes.Structure):
868 ('Internal', ctypes.wintypes.LPVOID),
869 ('InternalHigh', ctypes.wintypes.LPVOID),
870 ('Offset', ctypes.wintypes.DWORD),
871 ('OffsetHigh', ctypes.wintypes.DWORD),
872 ('hEvent', ctypes.wintypes.HANDLE),
875 kernel32 = ctypes.windll.kernel32
876 LockFileEx = kernel32.LockFileEx
877 LockFileEx.argtypes = [
878 ctypes.wintypes.HANDLE, # hFile
879 ctypes.wintypes.DWORD, # dwFlags
880 ctypes.wintypes.DWORD, # dwReserved
881 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
882 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
883 ctypes.POINTER(OVERLAPPED) # Overlapped
885 LockFileEx.restype = ctypes.wintypes.BOOL
886 UnlockFileEx = kernel32.UnlockFileEx
887 UnlockFileEx.argtypes = [
888 ctypes.wintypes.HANDLE, # hFile
889 ctypes.wintypes.DWORD, # dwReserved
890 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
891 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
892 ctypes.POINTER(OVERLAPPED) # Overlapped
894 UnlockFileEx.restype = ctypes.wintypes.BOOL
895 whole_low = 0xffffffff
896 whole_high = 0x7fffffff
898 def _lock_file(f, exclusive):
899 overlapped = OVERLAPPED()
900 overlapped.Offset = 0
901 overlapped.OffsetHigh = 0
902 overlapped.hEvent = 0
903 f._lock_file_overlapped_p = ctypes.pointer(overlapped)
904 handle = msvcrt.get_osfhandle(f.fileno())
905 if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
906 whole_low, whole_high, f._lock_file_overlapped_p):
907 raise OSError('Locking file failed: %r' % ctypes.FormatError())
910 assert f._lock_file_overlapped_p
911 handle = msvcrt.get_osfhandle(f.fileno())
912 if not UnlockFileEx(handle, 0,
913 whole_low, whole_high, f._lock_file_overlapped_p):
914 raise OSError('Unlocking file failed: %r' % ctypes.FormatError())
919 def _lock_file(f, exclusive):
920 fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)
923 fcntl.flock(f, fcntl.LOCK_UN)
926 class locked_file(object):
927 def __init__(self, filename, mode, encoding=None):
928 assert mode in ['r', 'a', 'w']
929 self.f = io.open(filename, mode, encoding=encoding)
933 exclusive = self.mode != 'r'
935 _lock_file(self.f, exclusive)
941 def __exit__(self, etype, value, traceback):
950 def write(self, *args):
951 return self.f.write(*args)
953 def read(self, *args):
954 return self.f.read(*args)
def get_filesystem_encoding():
    """Return sys.getfilesystemencoding(), or 'utf-8' when it reports None."""
    fs_encoding = sys.getfilesystemencoding()
    if fs_encoding is None:
        return 'utf-8'
    return fs_encoding
962 def shell_quote(args):
964 encoding = get_filesystem_encoding()
966 if isinstance(a, bytes):
967 # We may get a filename encoded with 'encodeFilename'
968 a = a.decode(encoding)
969 quoted_args.append(pipes.quote(a))
970 return ' '.join(quoted_args)
973 def takewhile_inclusive(pred, seq):
974 """ Like itertools.takewhile, but include the latest evaluated element
975 (the first element so that Not pred(e)) """
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use.

    data is serialized with json.dumps, URL-encoded under the
    '__youtubedl_smuggle' key and appended to url after a '#'.
    """
    payload = {'__youtubedl_smuggle': json.dumps(data)}
    fragment = compat_urllib_parse.urlencode(payload)
    return url + '#' + fragment
990 def unsmuggle_url(smug_url, default=None):
991 if not '#__youtubedl_smuggle' in smug_url:
992 return smug_url, default
993 url, _, sdata = smug_url.rpartition('#')
994 jsond = compat_parse_qs(sdata)['__youtubedl_smuggle'][0]
995 data = json.loads(jsond)
999 def format_bytes(bytes):
1002 if type(bytes) is str:
1003 bytes = float(bytes)
1007 exponent = int(math.log(bytes, 1024.0))
1008 suffix = ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB'][exponent]
1009 converted = float(bytes) / float(1024 ** exponent)
1010 return '%.2f%s' % (converted, suffix)
1013 def get_term_width():
1014 columns = compat_getenv('COLUMNS', None)
1019 sp = subprocess.Popen(
1021 stdout=subprocess.PIPE, stderr=subprocess.PIPE)
1022 out, err = sp.communicate()
1023 return int(out.split()[1])
1029 def month_by_name(name):
1030 """ Return the number of a month by (locale-independently) English name """
1033 'January', 'February', 'March', 'April', 'May', 'June',
1034 'July', 'August', 'September', 'October', 'November', 'December']
1036 return ENGLISH_NAMES.index(name) + 1
1041 def fix_xml_ampersands(xml_str):
1042 """Replace all the '&' by '&amp;' in XML"""
1044 r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
1049 def setproctitle(title):
1050 assert isinstance(title, compat_str)
1052 libc = ctypes.cdll.LoadLibrary("libc.so.6")
1055 title_bytes = title.encode('utf-8')
1056 buf = ctypes.create_string_buffer(len(title_bytes))
1057 buf.value = title_bytes
1059 libc.prctl(15, buf, 0, 0, 0)
1060 except AttributeError:
1061 return # Strange libc, just skip this
1064 def remove_start(s, start):
1065 if s.startswith(start):
1066 return s[len(start):]
1070 def remove_end(s, end):
1072 return s[:-len(end)]
def url_basename(url):
    """Return the last segment of the path component of url.

    The path is taken from compat_urlparse.urlparse(url).path; leading and
    trailing slashes are stripped before the final segment is picked.
    """
    trimmed_path = compat_urlparse.urlparse(url).path.strip('/')
    # Everything after the last '/', or the whole path when there is none
    return trimmed_path.rpartition('/')[2]
1081 class HEADRequest(compat_urllib_request.Request):
1082 def get_method(self):
1086 def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
1089 v = getattr(v, get_attr, None)
1092 return default if v is None else (int(v) * invscale // scale)
def str_or_none(v, default=None):
    """Return compat_str(v), or default when v is None."""
    if v is None:
        return default
    return compat_str(v)
1099 def str_to_int(int_str):
1100 """ A more relaxed version of int_or_none """
1103 int_str = re.sub(r'[,\.\+]', '', int_str)
def float_or_none(v, scale=1, invscale=1, default=None):
    """Convert v to a float and rescale it, falling back to default.

    Returns float(v) * invscale / scale. default is returned when v is
    None or when v cannot be converted to a float (e.g. an empty or
    non-numeric string, or an object of an unsupported type), so callers
    do not have to guard against malformed input themselves.
    """
    if v is None:
        return default
    try:
        number = float(v)
    except (TypeError, ValueError):
        # Unparsable value: behave like the "None" case instead of raising
        return default
    return number * invscale / scale
1111 def parse_duration(s):
1120 (?:(?P<hours>[0-9]+)\s*(?:[:h]|hours?)\s*)?
1121 (?P<mins>[0-9]+)\s*(?:[:m]|mins?|minutes?)\s*
1123 (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*(?:s|secs?|seconds?)?$''', s)
1126 res = int(m.group('secs'))
1128 res += int(m.group('mins')) * 60
1129 if m.group('hours'):
1130 res += int(m.group('hours')) * 60 * 60
1132 res += float(m.group('ms'))
def prepend_extension(filename, ext):
    """Insert ext in front of the existing extension of filename.

    For example 'abc.ext' with ext 'temp' becomes 'abc.temp.ext'; a name
    without an extension simply gets '.temp' appended.
    """
    root, real_ext = os.path.splitext(filename)
    return '%s.%s%s' % (root, ext, real_ext)
1141 def check_executable(exe, args=[]):
1142 """ Checks if the given binary is installed somewhere in PATH, and returns its name.
1143 args can be a list of arguments for a short output (like -version) """
1145 subprocess.Popen([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
1151 def get_exe_version(exe, args=['--version'],
1152 version_re=r'version\s+([0-9._-a-zA-Z]+)',
1153 unrecognized='present'):
1154 """ Returns the version of the specified executable,
1155 or False if the executable is not present """
1157 out, err = subprocess.Popen(
1159 stdout=subprocess.PIPE, stderr=subprocess.STDOUT).communicate()
1162 firstline = out.partition(b'\n')[0].decode('ascii', 'ignore')
1163 m = re.search(version_re, firstline)
1170 class PagedList(object):
1172 # This is only useful for tests
1173 return len(self.getslice())
1176 class OnDemandPagedList(PagedList):
1177 def __init__(self, pagefunc, pagesize):
1178 self._pagefunc = pagefunc
1179 self._pagesize = pagesize
1181 def getslice(self, start=0, end=None):
1183 for pagenum in itertools.count(start // self._pagesize):
1184 firstid = pagenum * self._pagesize
1185 nextfirstid = pagenum * self._pagesize + self._pagesize
1186 if start >= nextfirstid:
1189 page_results = list(self._pagefunc(pagenum))
1192 start % self._pagesize
1193 if firstid <= start < nextfirstid
1197 ((end - 1) % self._pagesize) + 1
1198 if (end is not None and firstid <= end <= nextfirstid)
1201 if startv != 0 or endv is not None:
1202 page_results = page_results[startv:endv]
1203 res.extend(page_results)
1205 # A little optimization - if current page is not "full", ie. does
1206 # not contain page_size videos then we can assume that this page
1207 # is the last one - there are no more ids on further pages -
1208 # i.e. no need to query again.
1209 if len(page_results) + startv < self._pagesize:
1212 # If we got the whole page, but the next page is not interesting,
1213 # break out early as well
1214 if end == nextfirstid:
1219 class InAdvancePagedList(PagedList):
1220 def __init__(self, pagefunc, pagecount, pagesize):
1221 self._pagefunc = pagefunc
1222 self._pagecount = pagecount
1223 self._pagesize = pagesize
1225 def getslice(self, start=0, end=None):
1227 start_page = start // self._pagesize
1229 self._pagecount if end is None else (end // self._pagesize + 1))
1230 skip_elems = start - start_page * self._pagesize
1231 only_more = None if end is None else end - start
1232 for pagenum in range(start_page, end_page):
1233 page = list(self._pagefunc(pagenum))
1235 page = page[skip_elems:]
1237 if only_more is not None:
1238 if len(page) < only_more:
1239 only_more -= len(page)
1241 page = page[:only_more]
1248 def uppercase_escape(s):
1249 unicode_escape = codecs.getdecoder('unicode_escape')
1251 r'\\U[0-9a-fA-F]{8}',
1252 lambda m: unicode_escape(m.group(0))[0],
def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986"""
    on_python2 = sys.version_info < (3, 0)
    # `unicode` only exists on Python 2; the version check must come first
    # so the name is never evaluated on Python 3.
    if on_python2 and isinstance(s, unicode):
        s = s.encode('utf-8')
    # Passed as the second argument to quote()
    safe_chars = b"%/;:@&=+$,!~*'()?#[]"
    return compat_urllib_parse.quote(s, safe_chars)
1263 def escape_url(url):
1264 """Escape URL as suggested by RFC 3986"""
1265 url_parsed = compat_urllib_parse_urlparse(url)
1266 return url_parsed._replace(
1267 path=escape_rfc3986(url_parsed.path),
1268 params=escape_rfc3986(url_parsed.params),
1269 query=escape_rfc3986(url_parsed.query),
1270 fragment=escape_rfc3986(url_parsed.fragment)
1274 struct.pack('!I', 0)
1276 # In Python 2.6 (and some 2.7 versions), struct requires a bytes argument
1277 def struct_pack(spec, *args):
1278 if isinstance(spec, compat_str):
1279 spec = spec.encode('ascii')
1280 return struct.pack(spec, *args)
1282 def struct_unpack(spec, *args):
1283 if isinstance(spec, compat_str):
1284 spec = spec.encode('ascii')
1285 return struct.unpack(spec, *args)
1287 struct_pack = struct.pack
1288 struct_unpack = struct.unpack
1291 def read_batch_urls(batch_fd):
1293 if not isinstance(url, compat_str):
1294 url = url.decode('utf-8', 'replace')
1295 BOM_UTF8 = '\xef\xbb\xbf'
1296 if url.startswith(BOM_UTF8):
1297 url = url[len(BOM_UTF8):]
1299 if url.startswith(('#', ';', ']')):
1303 with contextlib.closing(batch_fd) as fd:
1304 return [url for url in map(fixup, fd) if url]
def urlencode_postdata(*args, **kargs):
    """URL-encode the given arguments and return the result as ASCII bytes.

    All arguments are forwarded unchanged to compat_urllib_parse.urlencode.
    """
    encoded = compat_urllib_parse.urlencode(*args, **kargs)
    return encoded.encode('ascii')
1312 etree_iter = xml.etree.ElementTree.Element.iter
1313 except AttributeError: # Python <=2.6
1314 etree_iter = lambda n: n.findall('.//*')
1318 class TreeBuilder(xml.etree.ElementTree.TreeBuilder):
1319 def doctype(self, name, pubid, system):
1320 pass # Ignore doctypes
1322 parser = xml.etree.ElementTree.XMLParser(target=TreeBuilder())
1323 kwargs = {'parser': parser} if sys.version_info >= (2, 7) else {}
1324 tree = xml.etree.ElementTree.XML(s.encode('utf-8'), **kwargs)
1325 # Fix up XML parser in Python 2.x
1326 if sys.version_info < (3, 0):
1327 for n in etree_iter(tree):
1328 if n.text is not None:
1329 if not isinstance(n.text, compat_str):
1330 n.text = n.text.decode('utf-8')
1343 def parse_age_limit(s):
1346 m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
1347 return int(m.group('age')) if m else US_RATINGS.get(s, None)
1350 def strip_jsonp(code):
1352 r'(?s)^[a-zA-Z0-9_]+\s*\(\s*(.*)\);?\s*?(?://[^\n]*)*$', r'\1', code)
1355 def js_to_json(code):
1358 if v in ('true', 'false', 'null'):
1360 if v.startswith('"'):
1362 if v.startswith("'"):
1364 v = re.sub(r"\\\\|\\'|\"", lambda m: {
1371 res = re.sub(r'''(?x)
1372 "(?:[^"\\]*(?:\\\\|\\")?)*"|
1373 '(?:[^'\\]*(?:\\\\|\\')?)*'|
1374 [a-zA-Z_][a-zA-Z_0-9]*
1376 res = re.sub(r',(\s*\])', lambda m: m.group(1), res)
1380 def qualities(quality_ids):
1381 """ Get a numeric quality value out of a list of possible values """
1384 return quality_ids.index(qid)
1390 DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
1393 def limit_length(s, length):
1394 """ Add ellipses to overly long strings """
1399 return s[:length - len(ELLIPSES)] + ELLIPSES
def version_tuple(v):
    """Split a dotted version string into its integer components.

    Despite the name the result is a list, e.g. '2014.11.27' becomes
    [2014, 11, 27]. A component that is not an integer makes int() raise
    ValueError.
    """
    components = v.split('.')
    return list(map(int, components))
1407 def is_outdated_version(version, limit, assume_new=True):
1409 return not assume_new
1411 return version_tuple(version) < version_tuple(limit)
1413 return not assume_new