utils.py

   1 #!/usr/bin/env python
   2 # -*- coding: utf-8 -*-
   3
   4 from __future__ import unicode_literals
   5
   6 import calendar
   7 import codecs
   8 import contextlib
   9 import ctypes
  10 import datetime
  11 import email.utils
  12 import errno
  13 import functools
  14 import gzip
  15 import itertools
  16 import io
  17 import json
  18 import locale
  19 import math
  20 import operator
  21 import os
  22 import pipes
  23 import platform
  24 import re
  25 import ssl
  26 import socket
  27 import struct
  28 import subprocess
  29 import sys
  30 import tempfile
  31 import traceback
  32 import xml.etree.ElementTree
  33 import zlib
  34
  35 from .compat import (
  36     compat_basestring,
  37     compat_chr,
  38     compat_getenv,
  39     compat_html_entities,
  40     compat_http_client,
  41     compat_parse_qs,
  42     compat_socket_create_connection,
  43     compat_str,
  44     compat_urllib_error,
  45     compat_urllib_parse,
  46     compat_urllib_parse_urlparse,
  47     compat_urllib_request,
  48     compat_urlparse,
  49     shlex_quote,
  50 )
  51
  52
  53 # This is not clearly defined otherwise
  54 compiled_regex_type = type(re.compile(''))
  55
  56 std_headers = {
  57     'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0 (Chrome)',
  58     'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
  59     'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
  60     'Accept-Encoding': 'gzip, deflate',
  61     'Accept-Language': 'en-us,en;q=0.5',
  62 }
  63
  64
  65 def preferredencoding():
  66     """Get preferred encoding.
  67
  68     Returns the best encoding scheme for the system, based on
  69     locale.getpreferredencoding() and some further tweaks.
  70     """
  71     try:
  72         pref = locale.getpreferredencoding()
  73         'TEST'.encode(pref)
  74     except:
  75         pref = 'UTF-8'
  76
  77     return pref
  78
  79
  80 def write_json_file(obj, fn):
  81     """ Encode obj as JSON and write it to fn, atomically if possible """
  82
  83     fn = encodeFilename(fn)
  84     if sys.version_info < (3, 0) and sys.platform != 'win32':
  85         encoding = get_filesystem_encoding()
  86         # os.path.basename returns a bytes object, but NamedTemporaryFile
  87         # will fail if the filename contains non ascii characters unless we
  88         # use a unicode object
  89         path_basename = lambda f: os.path.basename(fn).decode(encoding)
  90         # the same for os.path.dirname
  91         path_dirname = lambda f: os.path.dirname(fn).decode(encoding)
  92     else:
  93         path_basename = os.path.basename
  94         path_dirname = os.path.dirname
  95
  96     args = {
  97         'suffix': '.tmp',
  98         'prefix': path_basename(fn) + '.',
  99         'dir': path_dirname(fn),
 100         'delete': False,
 101     }
 102
 103     # In Python 2.x, json.dump expects a bytestream.
 104     # In Python 3.x, it writes to a character stream
 105     if sys.version_info < (3, 0):
 106         args['mode'] = 'wb'
 107     else:
 108         args.update({
 109             'mode': 'w',
 110             'encoding': 'utf-8',
 111         })
 112
 113     tf = tempfile.NamedTemporaryFile(**args)
 114
 115     try:
 116         with tf:
 117             json.dump(obj, tf)
 118         if sys.platform == 'win32':
 119             # Need to remove existing file on Windows, else os.rename raises
 120             # WindowsError or FileExistsError.
 121             try:
 122                 os.unlink(fn)
 123             except OSError:
 124                 pass
 125         os.rename(tf.name, fn)
 126     except:
 127         try:
 128             os.remove(tf.name)
 129         except OSError:
 130             pass
 131         raise
 132
 133
 134 if sys.version_info >= (2, 7):
 135     def find_xpath_attr(node, xpath, key, val):
 136         """ Find the xpath xpath[@key=val] """
 137         assert re.match(r'^[a-zA-Z-]+$', key)
 138         assert re.match(r'^[a-zA-Z0-9@\s:._-]*$', val)
 139         expr = xpath + "[@%s='%s']" % (key, val)
 140         return node.find(expr)
 141 else:
 142     def find_xpath_attr(node, xpath, key, val):
 143         # Here comes the crazy part: In 2.6, if the xpath is a unicode,
 144         # .//node does not match if a node is a direct child of . !
 145         if isinstance(xpath, compat_str):
 146             xpath = xpath.encode('ascii')
 147
 148         for f in node.findall(xpath):
 149             if f.attrib.get(key) == val:
 150                 return f
 151         return None
 152
 153 # On python2.6 the xml.etree.ElementTree.Element methods don't support
 154 # the namespace parameter
 155
 156
 157 def xpath_with_ns(path, ns_map):
 158     components = [c.split(':') for c in path.split('/')]
 159     replaced = []
 160     for c in components:
 161         if len(c) == 1:
 162             replaced.append(c[0])
 163         else:
 164             ns, tag = c
 165             replaced.append('{%s}%s' % (ns_map[ns], tag))
 166     return '/'.join(replaced)
 167
 168
 169 def xpath_text(node, xpath, name=None, fatal=False):
 170     if sys.version_info < (2, 7):  # Crazy 2.6
 171         xpath = xpath.encode('ascii')
 172
 173     n = node.find(xpath)
 174     if n is None or n.text is None:
 175         if fatal:
 176             name = xpath if name is None else name
 177             raise ExtractorError('Could not find XML element %s' % name)
 178         else:
 179             return None
 180     return n.text
 181
 182
 183 def get_element_by_id(id, html):
 184     """Return the content of the tag with the specified ID in the passed HTML document"""
 185     return get_element_by_attribute("id", id, html)
 186
 187
 188 def get_element_by_attribute(attribute, value, html):
 189     """Return the content of the tag with the specified attribute in the passed HTML document"""
 190
 191     m = re.search(r'''(?xs)
 192         <([a-zA-Z0-9:._-]+)
 193          (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
 194          \s+%s=['"]?%s['"]?
 195          (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
 196         \s*>
 197         (?P<content>.*?)
 198         </\1>
 199     ''' % (re.escape(attribute), re.escape(value)), html)
 200
 201     if not m:
 202         return None
 203     res = m.group('content')
 204
 205     if res.startswith('"') or res.startswith("'"):
 206         res = res[1:-1]
 207
 208     return unescapeHTML(res)
 209
 210
 211 def clean_html(html):
 212     """Clean an HTML snippet into a readable string"""
 213
 214     if html is None:  # Convenience for sanitizing descriptions etc.
 215         return html
 216
 217     # Newline vs <br />
 218     html = html.replace('\n', ' ')
 219     html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
 220     html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
 221     # Strip html tags
 222     html = re.sub('<.*?>', '', html)
 223     # Replace html entities
 224     html = unescapeHTML(html)
 225     return html.strip()
 226
 227
 228 def sanitize_open(filename, open_mode):
 229     """Try to open the given filename, and slightly tweak it if this fails.
 230
 231     Attempts to open the given filename. If this fails, it tries to change
 232     the filename slightly, step by step, until it's either able to open it
 233     or it fails and raises a final exception, like the standard open()
 234     function.
 235
 236     It returns the tuple (stream, definitive_file_name).
 237     """
 238     try:
 239         if filename == '-':
 240             if sys.platform == 'win32':
 241                 import msvcrt
 242                 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
 243             return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
 244         stream = open(encodeFilename(filename), open_mode)
 245         return (stream, filename)
 246     except (IOError, OSError) as err:
 247         if err.errno in (errno.EACCES,):
 248             raise
 249
 250         # In case of error, try to remove win32 forbidden chars
 251         alt_filename = os.path.join(
 252             re.sub('[/<>:"\\|\\\\?\\*]', '#', path_part)
 253             for path_part in os.path.split(filename)
 254         )
 255         if alt_filename == filename:
 256             raise
 257         else:
 258             # An exception here should be caught in the caller
 259             stream = open(encodeFilename(filename), open_mode)
 260             return (stream, alt_filename)
 261
 262
 263 def timeconvert(timestr):
 264     """Convert RFC 2822 defined time string into system timestamp"""
 265     timestamp = None
 266     timetuple = email.utils.parsedate_tz(timestr)
 267     if timetuple is not None:
 268         timestamp = email.utils.mktime_tz(timetuple)
 269     return timestamp
 270
 271
 272 def sanitize_filename(s, restricted=False, is_id=False):
 273     """Sanitizes a string so it could be used as part of a filename.
 274     If restricted is set, use a stricter subset of allowed characters.
 275     Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
 276     """
 277     def replace_insane(char):
 278         if char == '?' or ord(char) < 32 or ord(char) == 127:
 279             return ''
 280         elif char == '"':
 281             return '' if restricted else '\''
 282         elif char == ':':
 283             return '_-' if restricted else ' -'
 284         elif char in '\\/|*<>':
 285             return '_'
 286         if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
 287             return '_'
 288         if restricted and ord(char) > 127:
 289             return '_'
 290         return char
 291
 292     # Handle timestamps
 293     s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)
 294     result = ''.join(map(replace_insane, s))
 295     if not is_id:
 296         while '__' in result:
 297             result = result.replace('__', '_')
 298         result = result.strip('_')
 299         # Common case of "Foreign band name - English song title"
 300         if restricted and result.startswith('-_'):
 301             result = result[2:]
 302         if not result:
 303             result = '_'
 304     return result
 305
 306
 307 def orderedSet(iterable):
 308     """ Remove all duplicates from the input iterable """
 309     res = []
 310     for el in iterable:
 311         if el not in res:
 312             res.append(el)
 313     return res
 314
 315
 316 def _htmlentity_transform(entity):
 317     """Transforms an HTML entity to a character."""
 318     # Known non-numeric HTML entity
 319     if entity in compat_html_entities.name2codepoint:
 320         return compat_chr(compat_html_entities.name2codepoint[entity])
 321
 322     mobj = re.match(r'#(x?[0-9]+)', entity)
 323     if mobj is not None:
 324         numstr = mobj.group(1)
 325         if numstr.startswith('x'):
 326             base = 16
 327             numstr = '0%s' % numstr
 328         else:
 329             base = 10
 330         return compat_chr(int(numstr, base))
 331
 332     # Unknown entity in name, return its literal representation
 333     return ('&%s;' % entity)
 334
 335
 336 def unescapeHTML(s):
 337     if s is None:
 338         return None
 339     assert type(s) == compat_str
 340
 341     return re.sub(
 342         r'&([^;]+);', lambda m: _htmlentity_transform(m.group(1)), s)
 343
 344
 345 def encodeFilename(s, for_subprocess=False):
 346     """
 347     @param s The name of the file
 348     """
 349
 350     assert type(s) == compat_str
 351
 352     # Python 3 has a Unicode API
 353     if sys.version_info >= (3, 0):
 354         return s
 355
 356     if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
 357         # Pass '' directly to use Unicode APIs on Windows 2000 and up
 358         # (Detecting Windows NT 4 is tricky because 'major >= 4' would
 359         # match Windows 9x series as well. Besides, NT 4 is obsolete.)
 360         if not for_subprocess:
 361             return s
 362         else:
 363             # For subprocess calls, encode with locale encoding
 364             # Refer to http://stackoverflow.com/a/9951851/35070
 365             encoding = preferredencoding()
 366     else:
 367         encoding = sys.getfilesystemencoding()
 368     if encoding is None:
 369         encoding = 'utf-8'
 370     return s.encode(encoding, 'ignore')
 371
 372
 373 def encodeArgument(s):
 374     if not isinstance(s, compat_str):
 375         # Legacy code that uses byte strings
 376         # Uncomment the following line after fixing all post processors
 377         # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
 378         s = s.decode('ascii')
 379     return encodeFilename(s, True)
 380
 381
 382 def decodeOption(optval):
 383     if optval is None:
 384         return optval
 385     if isinstance(optval, bytes):
 386         optval = optval.decode(preferredencoding())
 387
 388     assert isinstance(optval, compat_str)
 389     return optval
 390
 391
 392 def formatSeconds(secs):
 393     if secs > 3600:
 394         return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
 395     elif secs > 60:
 396         return '%d:%02d' % (secs // 60, secs % 60)
 397     else:
 398         return '%d' % secs
 399
 400
 401 def make_HTTPS_handler(params, **kwargs):
 402     opts_no_check_certificate = params.get('nocheckcertificate', False)
 403     if hasattr(ssl, 'create_default_context'):  # Python >= 3.4 or 2.7.9
 404         context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH)
 405         if opts_no_check_certificate:
 406             context.check_hostname = False
 407             context.verify_mode = ssl.CERT_NONE
 408         try:
 409             return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
 410         except TypeError:
 411             # Python 2.7.8
 412             # (create_default_context present but HTTPSHandler has no context=)
 413             pass
 414
 415     if sys.version_info < (3, 2):
 416         return YoutubeDLHTTPSHandler(params, **kwargs)
 417     else:  # Python < 3.4
 418         context = ssl.SSLContext(ssl.PROTOCOL_TLSv1)
 419         context.verify_mode = (ssl.CERT_NONE
 420                                if opts_no_check_certificate
 421                                else ssl.CERT_REQUIRED)
 422         context.set_default_verify_paths()
 423         return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
 424
 425
 426 class ExtractorError(Exception):
 427     """Error during info extraction."""
 428
 429     def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
 430         """ tb, if given, is the original traceback (so that it can be printed out).
 431         If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
 432         """
 433
 434         if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
 435             expected = True
 436         if video_id is not None:
 437             msg = video_id + ': ' + msg
 438         if cause:
 439             msg += ' (caused by %r)' % cause
 440         if not expected:
 441             if ytdl_is_updateable():
 442                 update_cmd = 'type  youtube-dl -U  to update'
 443             else:
 444                 update_cmd = 'see  https://yt-dl.org/update  on how to update'
 445             msg += '; please report this issue on https://yt-dl.org/bug .'
 446             msg += ' Make sure you are using the latest version; %s.' % update_cmd
 447             msg += ' Be sure to call youtube-dl with the --verbose flag and include its complete output.'
 448         super(ExtractorError, self).__init__(msg)
 449
 450         self.traceback = tb
 451         self.exc_info = sys.exc_info()  # preserve original exception
 452         self.cause = cause
 453         self.video_id = video_id
 454
 455     def format_traceback(self):
 456         if self.traceback is None:
 457             return None
 458         return ''.join(traceback.format_tb(self.traceback))
 459
 460
 461 class UnsupportedError(ExtractorError):
 462     def __init__(self, url):
 463         super(UnsupportedError, self).__init__(
 464             'Unsupported URL: %s' % url, expected=True)
 465         self.url = url
 466
 467
 468 class RegexNotFoundError(ExtractorError):
 469     """Error when a regex didn't match"""
 470     pass
 471
 472
 473 class DownloadError(Exception):
 474     """Download Error exception.
 475
 476     This exception may be thrown by FileDownloader objects if they are not
 477     configured to continue on errors. They will contain the appropriate
 478     error message.
 479     """
 480
 481     def __init__(self, msg, exc_info=None):
 482         """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
 483         super(DownloadError, self).__init__(msg)
 484         self.exc_info = exc_info
 485
 486
 487 class SameFileError(Exception):
 488     """Same File exception.
 489
 490     This exception will be thrown by FileDownloader objects if they detect
 491     multiple files would have to be downloaded to the same file on disk.
 492     """
 493     pass
 494
 495
 496 class PostProcessingError(Exception):
 497     """Post Processing exception.
 498
 499     This exception may be raised by PostProcessor's .run() method to
 500     indicate an error in the postprocessing task.
 501     """
 502
 503     def __init__(self, msg):
 504         self.msg = msg
 505
 506
 507 class MaxDownloadsReached(Exception):
 508     """ --max-downloads limit has been reached. """
 509     pass
 510
 511
 512 class UnavailableVideoError(Exception):
 513     """Unavailable Format exception.
 514
 515     This exception will be thrown when a video is requested
 516     in a format that is not available for that video.
 517     """
 518     pass
 519
 520
 521 class ContentTooShortError(Exception):
 522     """Content Too Short exception.
 523
 524     This exception may be raised by FileDownloader objects when a file they
 525     download is too small for what the server announced first, indicating
 526     the connection was probably interrupted.
 527     """
 528     # Both in bytes
 529     downloaded = None
 530     expected = None
 531
 532     def __init__(self, downloaded, expected):
 533         self.downloaded = downloaded
 534         self.expected = expected
 535
 536
 537 def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
 538     hc = http_class(*args, **kwargs)
 539     source_address = ydl_handler._params.get('source_address')
 540     if source_address is not None:
 541         sa = (source_address, 0)
 542         if hasattr(hc, 'source_address'):  # Python 2.7+
 543             hc.source_address = sa
 544         else:  # Python 2.6
 545             def _hc_connect(self, *args, **kwargs):
 546                 sock = compat_socket_create_connection(
 547                     (self.host, self.port), self.timeout, sa)
 548                 if is_https:
 549                     self.sock = ssl.wrap_socket(
 550                         sock, self.key_file, self.cert_file,
 551                         ssl_version=ssl.PROTOCOL_TLSv1)
 552                 else:
 553                     self.sock = sock
 554             hc.connect = functools.partial(_hc_connect, hc)
 555
 556     return hc
 557
 558
 559 class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
 560     """Handler for HTTP requests and responses.
 561
 562     This class, when installed with an OpenerDirector, automatically adds
 563     the standard headers to every HTTP request and handles gzipped and
 564     deflated responses from web servers. If compression is to be avoided in
 565     a particular request, the original request in the program code only has
 566     to include the HTTP header "Youtubedl-No-Compression", which will be
 567     removed before making the real request.
 568
 569     Part of this code was copied from:
 570
 571     http://techknack.net/python-urllib2-handlers/
 572
 573     Andrew Rowls, the author of that code, agreed to release it to the
 574     public domain.
 575     """
 576
 577     def __init__(self, params, *args, **kwargs):
 578         compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
 579         self._params = params
 580
 581     def http_open(self, req):
 582         return self.do_open(functools.partial(
 583             _create_http_connection, self, compat_http_client.HTTPConnection, False),
 584             req)
 585
 586     @staticmethod
 587     def deflate(data):
 588         try:
 589             return zlib.decompress(data, -zlib.MAX_WBITS)
 590         except zlib.error:
 591             return zlib.decompress(data)
 592
 593     @staticmethod
 594     def addinfourl_wrapper(stream, headers, url, code):
 595         if hasattr(compat_urllib_request.addinfourl, 'getcode'):
 596             return compat_urllib_request.addinfourl(stream, headers, url, code)
 597         ret = compat_urllib_request.addinfourl(stream, headers, url)
 598         ret.code = code
 599         return ret
 600
 601     def http_request(self, req):
 602         for h, v in std_headers.items():
 603             # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
 604             # The dict keys are capitalized because of this bug by urllib
 605             if h.capitalize() not in req.headers:
 606                 req.add_header(h, v)
 607         if 'Youtubedl-no-compression' in req.headers:
 608             if 'Accept-encoding' in req.headers:
 609                 del req.headers['Accept-encoding']
 610             del req.headers['Youtubedl-no-compression']
 611
 612         if sys.version_info < (2, 7) and '#' in req.get_full_url():
 613             # Python 2.6 is brain-dead when it comes to fragments
 614             req._Request__original = req._Request__original.partition('#')[0]
 615             req._Request__r_type = req._Request__r_type.partition('#')[0]
 616
 617         return req
 618
 619     def http_response(self, req, resp):
 620         old_resp = resp
 621         # gzip
 622         if resp.headers.get('Content-encoding', '') == 'gzip':
 623             content = resp.read()
 624             gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
 625             try:
 626                 uncompressed = io.BytesIO(gz.read())
 627             except IOError as original_ioerror:
 628                 # There may be junk add the end of the file
 629                 # See http://stackoverflow.com/q/4928560/35070 for details
 630                 for i in range(1, 1024):
 631                     try:
 632                         gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
 633                         uncompressed = io.BytesIO(gz.read())
 634                     except IOError:
 635                         continue
 636                     break
 637                 else:
 638                     raise original_ioerror
 639             resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
 640             resp.msg = old_resp.msg
 641         # deflate
 642         if resp.headers.get('Content-encoding', '') == 'deflate':
 643             gz = io.BytesIO(self.deflate(resp.read()))
 644             resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
 645             resp.msg = old_resp.msg
 646         return resp
 647
 648     https_request = http_request
 649     https_response = http_response
 650
 651
 652 class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
 653     def __init__(self, params, https_conn_class=None, *args, **kwargs):
 654         compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
 655         self._https_conn_class = https_conn_class or compat_http_client.HTTPSConnection
 656         self._params = params
 657
 658     def https_open(self, req):
 659         kwargs = {}
 660         if hasattr(self, '_context'):  # python > 2.6
 661             kwargs['context'] = self._context
 662         if hasattr(self, '_check_hostname'):  # python 3.x
 663             kwargs['check_hostname'] = self._check_hostname
 664         return self.do_open(functools.partial(
 665             _create_http_connection, self, self._https_conn_class, True),
 666             req, **kwargs)
 667
 668
 669 def parse_iso8601(date_str, delimiter='T', timezone=None):
 670     """ Return a UNIX timestamp from the given date """
 671
 672     if date_str is None:
 673         return None
 674
 675     if timezone is None:
 676         m = re.search(
 677             r'(\.[0-9]+)?(?:Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
 678             date_str)
 679         if not m:
 680             timezone = datetime.timedelta()
 681         else:
 682             date_str = date_str[:-len(m.group(0))]
 683             if not m.group('sign'):
 684                 timezone = datetime.timedelta()
 685             else:
 686                 sign = 1 if m.group('sign') == '+' else -1
 687                 timezone = datetime.timedelta(
 688                     hours=sign * int(m.group('hours')),
 689                     minutes=sign * int(m.group('minutes')))
 690     date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
 691     dt = datetime.datetime.strptime(date_str, date_format) - timezone
 692     return calendar.timegm(dt.timetuple())
 693
 694
 695 def unified_strdate(date_str, day_first=True):
 696     """Return a string with the date in the format YYYYMMDD"""
 697
 698     if date_str is None:
 699         return None
 700     upload_date = None
 701     # Replace commas
 702     date_str = date_str.replace(',', ' ')
 703     # %z (UTC offset) is only supported in python>=3.2
 704     date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str)
 705     # Remove AM/PM + timezone
 706     date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
 707
 708     format_expressions = [
 709         '%d %B %Y',
 710         '%d %b %Y',
 711         '%B %d %Y',
 712         '%b %d %Y',
 713         '%b %dst %Y %I:%M%p',
 714         '%b %dnd %Y %I:%M%p',
 715         '%b %dth %Y %I:%M%p',
 716         '%Y %m %d',
 717         '%Y-%m-%d',
 718         '%Y/%m/%d',
 719         '%Y/%m/%d %H:%M:%S',
 720         '%Y-%m-%d %H:%M:%S',
 721         '%Y-%m-%d %H:%M:%S.%f',
 722         '%d.%m.%Y %H:%M',
 723         '%d.%m.%Y %H.%M',
 724         '%Y-%m-%dT%H:%M:%SZ',
 725         '%Y-%m-%dT%H:%M:%S.%fZ',
 726         '%Y-%m-%dT%H:%M:%S.%f0Z',
 727         '%Y-%m-%dT%H:%M:%S',
 728         '%Y-%m-%dT%H:%M:%S.%f',
 729         '%Y-%m-%dT%H:%M',
 730     ]
 731     if day_first:
 732         format_expressions.extend([
 733             '%d.%m.%Y',
 734             '%d/%m/%Y',
 735             '%d/%m/%y',
 736             '%d/%m/%Y %H:%M:%S',
 737         ])
 738     else:
 739         format_expressions.extend([
 740             '%m.%d.%Y',
 741             '%m/%d/%Y',
 742             '%m/%d/%y',
 743             '%m/%d/%Y %H:%M:%S',
 744         ])
 745     for expression in format_expressions:
 746         try:
 747             upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
 748         except ValueError:
 749             pass
 750     if upload_date is None:
 751         timetuple = email.utils.parsedate_tz(date_str)
 752         if timetuple:
 753             upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
 754     return upload_date
 755
 756
 757 def determine_ext(url, default_ext='unknown_video'):
 758     if url is None:
 759         return default_ext
 760     guess = url.partition('?')[0].rpartition('.')[2]
 761     if re.match(r'^[A-Za-z0-9]+$', guess):
 762         return guess
 763     else:
 764         return default_ext
 765
 766
 767 def subtitles_filename(filename, sub_lang, sub_format):
 768     return filename.rsplit('.', 1)[0] + '.' + sub_lang + '.' + sub_format
 769
 770
 771 def date_from_str(date_str):
 772     """
 773     Return a datetime object from a string in the format YYYYMMDD or
 774     (now|today)[+-][0-9](day|week|month|year)(s)?"""
 775     today = datetime.date.today()
 776     if date_str in ('now', 'today'):
 777         return today
 778     if date_str == 'yesterday':
 779         return today - datetime.timedelta(days=1)
 780     match = re.match('(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
 781     if match is not None:
 782         sign = match.group('sign')
 783         time = int(match.group('time'))
 784         if sign == '-':
 785             time = -time
 786         unit = match.group('unit')
 787         # A bad aproximation?
 788         if unit == 'month':
 789             unit = 'day'
 790             time *= 30
 791         elif unit == 'year':
 792             unit = 'day'
 793             time *= 365
 794         unit += 's'
 795         delta = datetime.timedelta(**{unit: time})
 796         return today + delta
 797     return datetime.datetime.strptime(date_str, "%Y%m%d").date()
 798
 799
 800 def hyphenate_date(date_str):
 801     """
 802     Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
 803     match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
 804     if match is not None:
 805         return '-'.join(match.groups())
 806     else:
 807         return date_str
 808
 809
 810 class DateRange(object):
 811     """Represents a time interval between two dates"""
 812
 813     def __init__(self, start=None, end=None):
 814         """start and end must be strings in the format accepted by date"""
 815         if start is not None:
 816             self.start = date_from_str(start)
 817         else:
 818             self.start = datetime.datetime.min.date()
 819         if end is not None:
 820             self.end = date_from_str(end)
 821         else:
 822             self.end = datetime.datetime.max.date()
 823         if self.start > self.end:
 824             raise ValueError('Date range: "%s" , the start date must be before the end date' % self)
 825
 826     @classmethod
 827     def day(cls, day):
 828         """Returns a range that only contains the given day"""
 829         return cls(day, day)
 830
 831     def __contains__(self, date):
 832         """Check if the date is in the range"""
 833         if not isinstance(date, datetime.date):
 834             date = date_from_str(date)
 835         return self.start <= date <= self.end
 836
 837     def __str__(self):
 838         return '%s - %s' % (self.start.isoformat(), self.end.isoformat())
 839
 840
 841 def platform_name():
 842     """ Returns the platform name as a compat_str """
 843     res = platform.platform()
 844     if isinstance(res, bytes):
 845         res = res.decode(preferredencoding())
 846
 847     assert isinstance(res, compat_str)
 848     return res
 849
 850
 851 def _windows_write_string(s, out):
 852     """ Returns True if the string was written using special methods,
 853     False if it has yet to be written out."""
 854     # Adapted from http://stackoverflow.com/a/3259271/35070
 855
 856     import ctypes
 857     import ctypes.wintypes
 858
 859     WIN_OUTPUT_IDS = {
 860         1: -11,
 861         2: -12,
 862     }
 863
 864     try:
 865         fileno = out.fileno()
 866     except AttributeError:
 867         # If the output stream doesn't have a fileno, it's virtual
 868         return False
 869     except io.UnsupportedOperation:
 870         # Some strange Windows pseudo files?
 871         return False
 872     if fileno not in WIN_OUTPUT_IDS:
 873         return False
 874
 875     GetStdHandle = ctypes.WINFUNCTYPE(
 876         ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
 877         (b"GetStdHandle", ctypes.windll.kernel32))
 878     h = GetStdHandle(WIN_OUTPUT_IDS[fileno])
 879
 880     WriteConsoleW = ctypes.WINFUNCTYPE(
 881         ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
 882         ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
 883         ctypes.wintypes.LPVOID)((b"WriteConsoleW", ctypes.windll.kernel32))
 884     written = ctypes.wintypes.DWORD(0)
 885
 886     GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)((b"GetFileType", ctypes.windll.kernel32))
 887     FILE_TYPE_CHAR = 0x0002
 888     FILE_TYPE_REMOTE = 0x8000
 889     GetConsoleMode = ctypes.WINFUNCTYPE(
 890         ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
 891         ctypes.POINTER(ctypes.wintypes.DWORD))(
 892         (b"GetConsoleMode", ctypes.windll.kernel32))
 893     INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value
 894
 895     def not_a_console(handle):
 896         if handle == INVALID_HANDLE_VALUE or handle is None:
 897             return True
 898         return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR
 899                 or GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)
 900
 901     if not_a_console(h):
 902         return False
 903
 904     def next_nonbmp_pos(s):
 905         try:
 906             return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
 907         except StopIteration:
 908             return len(s)
 909
 910     while s:
 911         count = min(next_nonbmp_pos(s), 1024)
 912
 913         ret = WriteConsoleW(
 914             h, s, count if count else 2, ctypes.byref(written), None)
 915         if ret == 0:
 916             raise OSError('Failed to write string')
 917         if not count:  # We just wrote a non-BMP character
 918             assert written.value == 2
 919             s = s[1:]
 920         else:
 921             assert written.value > 0
 922             s = s[written.value:]
 923     return True
 924
 925
 926 def write_string(s, out=None, encoding=None):
 927     if out is None:
 928         out = sys.stderr
 929     assert type(s) == compat_str
 930
 931     if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
 932         if _windows_write_string(s, out):
 933             return
 934
 935     if ('b' in getattr(out, 'mode', '') or
 936             sys.version_info[0] < 3):  # Python 2 lies about mode of sys.stderr
 937         byt = s.encode(encoding or preferredencoding(), 'ignore')
 938         out.write(byt)
 939     elif hasattr(out, 'buffer'):
 940         enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
 941         byt = s.encode(enc, 'ignore')
 942         out.buffer.write(byt)
 943     else:
 944         out.write(s)
 945     out.flush()
 946
 947
 948 def bytes_to_intlist(bs):
 949     if not bs:
 950         return []
 951     if isinstance(bs[0], int):  # Python 3
 952         return list(bs)
 953     else:
 954         return [ord(c) for c in bs]
 955
 956
 957 def intlist_to_bytes(xs):
 958     if not xs:
 959         return b''
 960     return struct_pack('%dB' % len(xs), *xs)
 961
 962
 963 # Cross-platform file locking
 964 if sys.platform == 'win32':
 965     import ctypes.wintypes
 966     import msvcrt
 967
 968     class OVERLAPPED(ctypes.Structure):
 969         _fields_ = [
 970             ('Internal', ctypes.wintypes.LPVOID),
 971             ('InternalHigh', ctypes.wintypes.LPVOID),
 972             ('Offset', ctypes.wintypes.DWORD),
 973             ('OffsetHigh', ctypes.wintypes.DWORD),
 974             ('hEvent', ctypes.wintypes.HANDLE),
 975         ]
 976
 977     kernel32 = ctypes.windll.kernel32
 978     LockFileEx = kernel32.LockFileEx
 979     LockFileEx.argtypes = [
 980         ctypes.wintypes.HANDLE,     # hFile
 981         ctypes.wintypes.DWORD,      # dwFlags
 982         ctypes.wintypes.DWORD,      # dwReserved
 983         ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
 984         ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
 985         ctypes.POINTER(OVERLAPPED)  # Overlapped
 986     ]
 987     LockFileEx.restype = ctypes.wintypes.BOOL
 988     UnlockFileEx = kernel32.UnlockFileEx
 989     UnlockFileEx.argtypes = [
 990         ctypes.wintypes.HANDLE,     # hFile
 991         ctypes.wintypes.DWORD,      # dwReserved
 992         ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
 993         ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
 994         ctypes.POINTER(OVERLAPPED)  # Overlapped
 995     ]
 996     UnlockFileEx.restype = ctypes.wintypes.BOOL
 997     whole_low = 0xffffffff
 998     whole_high = 0x7fffffff
 999
1000     def _lock_file(f, exclusive):
1001         overlapped = OVERLAPPED()
1002         overlapped.Offset = 0
1003         overlapped.OffsetHigh = 0
1004         overlapped.hEvent = 0
1005         f._lock_file_overlapped_p = ctypes.pointer(overlapped)
1006         handle = msvcrt.get_osfhandle(f.fileno())
1007         if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
1008                           whole_low, whole_high, f._lock_file_overlapped_p):
1009             raise OSError('Locking file failed: %r' % ctypes.FormatError())
1010
1011     def _unlock_file(f):
1012         assert f._lock_file_overlapped_p
1013         handle = msvcrt.get_osfhandle(f.fileno())
1014         if not UnlockFileEx(handle, 0,
1015                             whole_low, whole_high, f._lock_file_overlapped_p):
1016             raise OSError('Unlocking file failed: %r' % ctypes.FormatError())
1017
1018 else:
1019     import fcntl
1020
1021     def _lock_file(f, exclusive):
1022         fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)
1023
1024     def _unlock_file(f):
1025         fcntl.flock(f, fcntl.LOCK_UN)
1026
1027
1028 class locked_file(object):
1029     def __init__(self, filename, mode, encoding=None):
1030         assert mode in ['r', 'a', 'w']
1031         self.f = io.open(filename, mode, encoding=encoding)
1032         self.mode = mode
1033
1034     def __enter__(self):
1035         exclusive = self.mode != 'r'
1036         try:
1037             _lock_file(self.f, exclusive)
1038         except IOError:
1039             self.f.close()
1040             raise
1041         return self
1042
1043     def __exit__(self, etype, value, traceback):
1044         try:
1045             _unlock_file(self.f)
1046         finally:
1047             self.f.close()
1048
1049     def __iter__(self):
1050         return iter(self.f)
1051
1052     def write(self, *args):
1053         return self.f.write(*args)
1054
1055     def read(self, *args):
1056         return self.f.read(*args)
1057
1058
1059 def get_filesystem_encoding():
1060     encoding = sys.getfilesystemencoding()
1061     return encoding if encoding is not None else 'utf-8'
1062
1063
1064 def shell_quote(args):
1065     quoted_args = []
1066     encoding = get_filesystem_encoding()
1067     for a in args:
1068         if isinstance(a, bytes):
1069             # We may get a filename encoded with 'encodeFilename'
1070             a = a.decode(encoding)
1071         quoted_args.append(pipes.quote(a))
1072     return ' '.join(quoted_args)
1073
1074
1075 def takewhile_inclusive(pred, seq):
1076     """ Like itertools.takewhile, but include the latest evaluated element
1077         (the first element so that Not pred(e)) """
1078     for e in seq:
1079         yield e
1080         if not pred(e):
1081             return
1082
1083
1084 def smuggle_url(url, data):
1085     """ Pass additional data in a URL for internal use. """
1086
1087     sdata = compat_urllib_parse.urlencode(
1088         {'__youtubedl_smuggle': json.dumps(data)})
1089     return url + '#' + sdata
1090
1091
1092 def unsmuggle_url(smug_url, default=None):
1093     if '#__youtubedl_smuggle' not in smug_url:
1094         return smug_url, default
1095     url, _, sdata = smug_url.rpartition('#')
1096     jsond = compat_parse_qs(sdata)['__youtubedl_smuggle'][0]
1097     data = json.loads(jsond)
1098     return url, data
1099
1100
1101 def format_bytes(bytes):
1102     if bytes is None:
1103         return 'N/A'
1104     if type(bytes) is str:
1105         bytes = float(bytes)
1106     if bytes == 0.0:
1107         exponent = 0
1108     else:
1109         exponent = int(math.log(bytes, 1024.0))
1110     suffix = ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB'][exponent]
1111     converted = float(bytes) / float(1024 ** exponent)
1112     return '%.2f%s' % (converted, suffix)
1113
1114
1115 def parse_filesize(s):
1116     if s is None:
1117         return None
1118
1119     # The lower-case forms are of course incorrect and inofficial,
1120     # but we support those too
1121     _UNIT_TABLE = {
1122         'B': 1,
1123         'b': 1,
1124         'KiB': 1024,
1125         'KB': 1000,
1126         'kB': 1024,
1127         'Kb': 1000,
1128         'MiB': 1024 ** 2,
1129         'MB': 1000 ** 2,
1130         'mB': 1024 ** 2,
1131         'Mb': 1000 ** 2,
1132         'GiB': 1024 ** 3,
1133         'GB': 1000 ** 3,
1134         'gB': 1024 ** 3,
1135         'Gb': 1000 ** 3,
1136         'TiB': 1024 ** 4,
1137         'TB': 1000 ** 4,
1138         'tB': 1024 ** 4,
1139         'Tb': 1000 ** 4,
1140         'PiB': 1024 ** 5,
1141         'PB': 1000 ** 5,
1142         'pB': 1024 ** 5,
1143         'Pb': 1000 ** 5,
1144         'EiB': 1024 ** 6,
1145         'EB': 1000 ** 6,
1146         'eB': 1024 ** 6,
1147         'Eb': 1000 ** 6,
1148         'ZiB': 1024 ** 7,
1149         'ZB': 1000 ** 7,
1150         'zB': 1024 ** 7,
1151         'Zb': 1000 ** 7,
1152         'YiB': 1024 ** 8,
1153         'YB': 1000 ** 8,
1154         'yB': 1024 ** 8,
1155         'Yb': 1000 ** 8,
1156     }
1157
1158     units_re = '|'.join(re.escape(u) for u in _UNIT_TABLE)
1159     m = re.match(
1160         r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)' % units_re, s)
1161     if not m:
1162         return None
1163
1164     num_str = m.group('num').replace(',', '.')
1165     mult = _UNIT_TABLE[m.group('unit')]
1166     return int(float(num_str) * mult)
1167
1168
1169 def get_term_width():
1170     columns = compat_getenv('COLUMNS', None)
1171     if columns:
1172         return int(columns)
1173
1174     try:
1175         sp = subprocess.Popen(
1176             ['stty', 'size'],
1177             stdout=subprocess.PIPE, stderr=subprocess.PIPE)
1178         out, err = sp.communicate()
1179         return int(out.split()[1])
1180     except:
1181         pass
1182     return None
1183
1184
1185 def month_by_name(name):
1186     """ Return the number of a month by (locale-independently) English name """
1187
1188     ENGLISH_NAMES = [
1189         'January', 'February', 'March', 'April', 'May', 'June',
1190         'July', 'August', 'September', 'October', 'November', 'December']
1191     try:
1192         return ENGLISH_NAMES.index(name) + 1
1193     except ValueError:
1194         return None
1195
1196
1197 def fix_xml_ampersands(xml_str):
1198     """Replace all the '&' by '&amp;' in XML"""
1199     return re.sub(
1200         r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
1201         '&amp;',
1202         xml_str)
1203
1204
1205 def setproctitle(title):
1206     assert isinstance(title, compat_str)
1207     try:
1208         libc = ctypes.cdll.LoadLibrary("libc.so.6")
1209     except OSError:
1210         return
1211     title_bytes = title.encode('utf-8')
1212     buf = ctypes.create_string_buffer(len(title_bytes))
1213     buf.value = title_bytes
1214     try:
1215         libc.prctl(15, buf, 0, 0, 0)
1216     except AttributeError:
1217         return  # Strange libc, just skip this
1218
1219
1220 def remove_start(s, start):
1221     if s.startswith(start):
1222         return s[len(start):]
1223     return s
1224
1225
1226 def remove_end(s, end):
1227     if s.endswith(end):
1228         return s[:-len(end)]
1229     return s
1230
1231
1232 def url_basename(url):
1233     path = compat_urlparse.urlparse(url).path
1234     return path.strip('/').split('/')[-1]
1235
1236
1237 class HEADRequest(compat_urllib_request.Request):
1238     def get_method(self):
1239         return "HEAD"
1240
1241
1242 def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
1243     if get_attr:
1244         if v is not None:
1245             v = getattr(v, get_attr, None)
1246     if v == '':
1247         v = None
1248     return default if v is None else (int(v) * invscale // scale)
1249
1250
1251 def str_or_none(v, default=None):
1252     return default if v is None else compat_str(v)
1253
1254
1255 def str_to_int(int_str):
1256     """ A more relaxed version of int_or_none """
1257     if int_str is None:
1258         return None
1259     int_str = re.sub(r'[,\.\+]', '', int_str)
1260     return int(int_str)
1261
1262
1263 def float_or_none(v, scale=1, invscale=1, default=None):
1264     return default if v is None else (float(v) * invscale / scale)
1265
1266
1267 def parse_duration(s):
1268     if not isinstance(s, compat_basestring):
1269         return None
1270
1271     s = s.strip()
1272
1273     m = re.match(
1274         r'''(?ix)(?:P?T)?
1275         (?:
1276             (?P<only_mins>[0-9.]+)\s*(?:mins?|minutes?)\s*|
1277             (?P<only_hours>[0-9.]+)\s*(?:hours?)|
1278
1279             (?:
1280                 (?:
1281                     (?:(?P<days>[0-9]+)\s*(?:[:d]|days?)\s*)?
1282                     (?P<hours>[0-9]+)\s*(?:[:h]|hours?)\s*
1283                 )?
1284                 (?P<mins>[0-9]+)\s*(?:[:m]|mins?|minutes?)\s*
1285             )?
1286             (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*(?:s|secs?|seconds?)?
1287         )$''', s)
1288     if not m:
1289         return None
1290     res = 0
1291     if m.group('only_mins'):
1292         return float_or_none(m.group('only_mins'), invscale=60)
1293     if m.group('only_hours'):
1294         return float_or_none(m.group('only_hours'), invscale=60 * 60)
1295     if m.group('secs'):
1296         res += int(m.group('secs'))
1297     if m.group('mins'):
1298         res += int(m.group('mins')) * 60
1299     if m.group('hours'):
1300         res += int(m.group('hours')) * 60 * 60
1301     if m.group('days'):
1302         res += int(m.group('days')) * 24 * 60 * 60
1303     if m.group('ms'):
1304         res += float(m.group('ms'))
1305     return res
1306
1307
1308 def prepend_extension(filename, ext):
1309     name, real_ext = os.path.splitext(filename)
1310     return '{0}.{1}{2}'.format(name, ext, real_ext)
1311
1312
1313 def check_executable(exe, args=[]):
1314     """ Checks if the given binary is installed somewhere in PATH, and returns its name.
1315     args can be a list of arguments for a short output (like -version) """
1316     try:
1317         subprocess.Popen([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
1318     except OSError:
1319         return False
1320     return exe
1321
1322
1323 def get_exe_version(exe, args=['--version'],
1324                     version_re=None, unrecognized='present'):
1325     """ Returns the version of the specified executable,
1326     or False if the executable is not present """
1327     try:
1328         out, _ = subprocess.Popen(
1329             [exe] + args,
1330             stdout=subprocess.PIPE, stderr=subprocess.STDOUT).communicate()
1331     except OSError:
1332         return False
1333     if isinstance(out, bytes):  # Python 2.x
1334         out = out.decode('ascii', 'ignore')
1335     return detect_exe_version(out, version_re, unrecognized)
1336
1337
1338 def detect_exe_version(output, version_re=None, unrecognized='present'):
1339     assert isinstance(output, compat_str)
1340     if version_re is None:
1341         version_re = r'version\s+([-0-9._a-zA-Z]+)'
1342     m = re.search(version_re, output)
1343     if m:
1344         return m.group(1)
1345     else:
1346         return unrecognized
1347
1348
1349 class PagedList(object):
1350     def __len__(self):
1351         # This is only useful for tests
1352         return len(self.getslice())
1353
1354
1355 class OnDemandPagedList(PagedList):
1356     def __init__(self, pagefunc, pagesize):
1357         self._pagefunc = pagefunc
1358         self._pagesize = pagesize
1359
1360     def getslice(self, start=0, end=None):
1361         res = []
1362         for pagenum in itertools.count(start // self._pagesize):
1363             firstid = pagenum * self._pagesize
1364             nextfirstid = pagenum * self._pagesize + self._pagesize
1365             if start >= nextfirstid:
1366                 continue
1367
1368             page_results = list(self._pagefunc(pagenum))
1369
1370             startv = (
1371                 start % self._pagesize
1372                 if firstid <= start < nextfirstid
1373                 else 0)
1374
1375             endv = (
1376                 ((end - 1) % self._pagesize) + 1
1377                 if (end is not None and firstid <= end <= nextfirstid)
1378                 else None)
1379
1380             if startv != 0 or endv is not None:
1381                 page_results = page_results[startv:endv]
1382             res.extend(page_results)
1383
1384             # A little optimization - if current page is not "full", ie. does
1385             # not contain page_size videos then we can assume that this page
1386             # is the last one - there are no more ids on further pages -
1387             # i.e. no need to query again.
1388             if len(page_results) + startv < self._pagesize:
1389                 break
1390
1391             # If we got the whole page, but the next page is not interesting,
1392             # break out early as well
1393             if end == nextfirstid:
1394                 break
1395         return res
1396
1397
1398 class InAdvancePagedList(PagedList):
1399     def __init__(self, pagefunc, pagecount, pagesize):
1400         self._pagefunc = pagefunc
1401         self._pagecount = pagecount
1402         self._pagesize = pagesize
1403
1404     def getslice(self, start=0, end=None):
1405         res = []
1406         start_page = start // self._pagesize
1407         end_page = (
1408             self._pagecount if end is None else (end // self._pagesize + 1))
1409         skip_elems = start - start_page * self._pagesize
1410         only_more = None if end is None else end - start
1411         for pagenum in range(start_page, end_page):
1412             page = list(self._pagefunc(pagenum))
1413             if skip_elems:
1414                 page = page[skip_elems:]
1415                 skip_elems = None
1416             if only_more is not None:
1417                 if len(page) < only_more:
1418                     only_more -= len(page)
1419                 else:
1420                     page = page[:only_more]
1421                     res.extend(page)
1422                     break
1423             res.extend(page)
1424         return res
1425
1426
1427 def uppercase_escape(s):
1428     unicode_escape = codecs.getdecoder('unicode_escape')
1429     return re.sub(
1430         r'\\U[0-9a-fA-F]{8}',
1431         lambda m: unicode_escape(m.group(0))[0],
1432         s)
1433
1434
1435 def escape_rfc3986(s):
1436     """Escape non-ASCII characters as suggested by RFC 3986"""
1437     if sys.version_info < (3, 0) and isinstance(s, compat_str):
1438         s = s.encode('utf-8')
1439     return compat_urllib_parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")
1440
1441
1442 def escape_url(url):
1443     """Escape URL as suggested by RFC 3986"""
1444     url_parsed = compat_urllib_parse_urlparse(url)
1445     return url_parsed._replace(
1446         path=escape_rfc3986(url_parsed.path),
1447         params=escape_rfc3986(url_parsed.params),
1448         query=escape_rfc3986(url_parsed.query),
1449         fragment=escape_rfc3986(url_parsed.fragment)
1450     ).geturl()
1451
1452 try:
1453     struct.pack('!I', 0)
1454 except TypeError:
1455     # In Python 2.6 (and some 2.7 versions), struct requires a bytes argument
1456     def struct_pack(spec, *args):
1457         if isinstance(spec, compat_str):
1458             spec = spec.encode('ascii')
1459         return struct.pack(spec, *args)
1460
1461     def struct_unpack(spec, *args):
1462         if isinstance(spec, compat_str):
1463             spec = spec.encode('ascii')
1464         return struct.unpack(spec, *args)
1465 else:
1466     struct_pack = struct.pack
1467     struct_unpack = struct.unpack
1468
1469
1470 def read_batch_urls(batch_fd):
1471     def fixup(url):
1472         if not isinstance(url, compat_str):
1473             url = url.decode('utf-8', 'replace')
1474         BOM_UTF8 = '\xef\xbb\xbf'
1475         if url.startswith(BOM_UTF8):
1476             url = url[len(BOM_UTF8):]
1477         url = url.strip()
1478         if url.startswith(('#', ';', ']')):
1479             return False
1480         return url
1481
1482     with contextlib.closing(batch_fd) as fd:
1483         return [url for url in map(fixup, fd) if url]
1484
1485
1486 def urlencode_postdata(*args, **kargs):
1487     return compat_urllib_parse.urlencode(*args, **kargs).encode('ascii')
1488
1489
1490 try:
1491     etree_iter = xml.etree.ElementTree.Element.iter
1492 except AttributeError:  # Python <=2.6
1493     etree_iter = lambda n: n.findall('.//*')
1494
1495
1496 def parse_xml(s):
1497     class TreeBuilder(xml.etree.ElementTree.TreeBuilder):
1498         def doctype(self, name, pubid, system):
1499             pass  # Ignore doctypes
1500
1501     parser = xml.etree.ElementTree.XMLParser(target=TreeBuilder())
1502     kwargs = {'parser': parser} if sys.version_info >= (2, 7) else {}
1503     tree = xml.etree.ElementTree.XML(s.encode('utf-8'), **kwargs)
1504     # Fix up XML parser in Python 2.x
1505     if sys.version_info < (3, 0):
1506         for n in etree_iter(tree):
1507             if n.text is not None:
1508                 if not isinstance(n.text, compat_str):
1509                     n.text = n.text.decode('utf-8')
1510     return tree
1511
1512
1513 US_RATINGS = {
1514     'G': 0,
1515     'PG': 10,
1516     'PG-13': 13,
1517     'R': 16,
1518     'NC': 18,
1519 }
1520
1521
1522 def parse_age_limit(s):
1523     if s is None:
1524         return None
1525     m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
1526     return int(m.group('age')) if m else US_RATINGS.get(s, None)
1527
1528
1529 def strip_jsonp(code):
1530     return re.sub(
1531         r'(?s)^[a-zA-Z0-9_]+\s*\(\s*(.*)\);?\s*?(?://[^\n]*)*$', r'\1', code)
1532
1533
1534 def js_to_json(code):
1535     def fix_kv(m):
1536         v = m.group(0)
1537         if v in ('true', 'false', 'null'):
1538             return v
1539         if v.startswith('"'):
1540             return v
1541         if v.startswith("'"):
1542             v = v[1:-1]
1543             v = re.sub(r"\\\\|\\'|\"", lambda m: {
1544                 '\\\\': '\\\\',
1545                 "\\'": "'",
1546                 '"': '\\"',
1547             }[m.group(0)], v)
1548         return '"%s"' % v
1549
1550     res = re.sub(r'''(?x)
1551         "(?:[^"\\]*(?:\\\\|\\")?)*"|
1552         '(?:[^'\\]*(?:\\\\|\\')?)*'|
1553         [a-zA-Z_][.a-zA-Z_0-9]*
1554         ''', fix_kv, code)
1555     res = re.sub(r',(\s*\])', lambda m: m.group(1), res)
1556     return res
1557
1558
1559 def qualities(quality_ids):
1560     """ Get a numeric quality value out of a list of possible values """
1561     def q(qid):
1562         try:
1563             return quality_ids.index(qid)
1564         except ValueError:
1565             return -1
1566     return q
1567
1568
1569 DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
1570
1571
1572 def limit_length(s, length):
1573     """ Add ellipses to overly long strings """
1574     if s is None:
1575         return None
1576     ELLIPSES = '...'
1577     if len(s) > length:
1578         return s[:length - len(ELLIPSES)] + ELLIPSES
1579     return s
1580
1581
1582 def version_tuple(v):
1583     return tuple(int(e) for e in re.split(r'[-.]', v))
1584
1585
1586 def is_outdated_version(version, limit, assume_new=True):
1587     if not version:
1588         return not assume_new
1589     try:
1590         return version_tuple(version) < version_tuple(limit)
1591     except ValueError:
1592         return not assume_new
1593
1594
1595 def ytdl_is_updateable():
1596     """ Returns if youtube-dl can be updated with -U """
1597     from zipimport import zipimporter
1598
1599     return isinstance(globals().get('__loader__'), zipimporter) or hasattr(sys, 'frozen')
1600
1601
1602 def args_to_str(args):
1603     # Get a short string representation for a subprocess command
1604     return ' '.join(shlex_quote(a) for a in args)
1605
1606
1607 def urlhandle_detect_ext(url_handle):
1608     try:
1609         url_handle.headers
1610         getheader = lambda h: url_handle.headers[h]
1611     except AttributeError:  # Python < 3
1612         getheader = url_handle.info().getheader
1613
1614     cd = getheader('Content-Disposition')
1615     if cd:
1616         m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
1617         if m:
1618             e = determine_ext(m.group('filename'), default_ext=None)
1619             if e:
1620                 return e
1621
1622     return getheader('Content-Type').split("/")[1]
1623
1624
1625 def age_restricted(content_limit, age_limit):
1626     """ Returns True iff the content should be blocked """
1627
1628     if age_limit is None:  # No limit set
1629         return False
1630     if content_limit is None:
1631         return False  # Content available for everyone
1632     return age_limit < content_limit
1633
1634
1635 def is_html(first_bytes):
1636     """ Detect whether a file contains HTML by examining its first bytes. """
1637
1638     BOMS = [
1639         (b'\xef\xbb\xbf', 'utf-8'),
1640         (b'\x00\x00\xfe\xff', 'utf-32-be'),
1641         (b'\xff\xfe\x00\x00', 'utf-32-le'),
1642         (b'\xff\xfe', 'utf-16-le'),
1643         (b'\xfe\xff', 'utf-16-be'),
1644     ]
1645     for bom, enc in BOMS:
1646         if first_bytes.startswith(bom):
1647             s = first_bytes[len(bom):].decode(enc, 'replace')
1648             break
1649     else:
1650         s = first_bytes.decode('utf-8', 'replace')
1651
1652     return re.match(r'^\s*<', s)
1653
1654
1655 def determine_protocol(info_dict):
1656     protocol = info_dict.get('protocol')
1657     if protocol is not None:
1658         return protocol
1659
1660     url = info_dict['url']
1661     if url.startswith('rtmp'):
1662         return 'rtmp'
1663     elif url.startswith('mms'):
1664         return 'mms'
1665     elif url.startswith('rtsp'):
1666         return 'rtsp'
1667
1668     ext = determine_ext(url)
1669     if ext == 'm3u8':
1670         return 'm3u8'
1671     elif ext == 'f4m':
1672         return 'f4m'
1673
1674     return compat_urllib_parse_urlparse(url).scheme
1675
1676
1677 def render_table(header_row, data):
1678     """ Render a list of rows, each as a list of values """
1679     table = [header_row] + data
1680     max_lens = [max(len(compat_str(v)) for v in col) for col in zip(*table)]
1681     format_str = ' '.join('%-' + compat_str(ml + 1) + 's' for ml in max_lens[:-1]) + '%s'
1682     return '\n'.join(format_str % tuple(row) for row in table)
1683
1684
1685 def _match_one(filter_part, dct):
1686     COMPARISON_OPERATORS = {
1687         '<': operator.lt,
1688         '<=': operator.le,
1689         '>': operator.gt,
1690         '>=': operator.ge,
1691         '=': operator.eq,
1692         '!=': operator.ne,
1693     }
1694     operator_rex = re.compile(r'''(?x)\s*
1695         (?P<key>[a-z_]+)
1696         \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
1697         (?:
1698             (?P<intval>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)|
1699             (?P<strval>(?![0-9.])[a-z0-9A-Z]*)
1700         )
1701         \s*$
1702         ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
1703     m = operator_rex.search(filter_part)
1704     if m:
1705         op = COMPARISON_OPERATORS[m.group('op')]
1706         if m.group('strval') is not None:
1707             if m.group('op') not in ('=', '!='):
1708                 raise ValueError(
1709                     'Operator %s does not support string values!' % m.group('op'))
1710             comparison_value = m.group('strval')
1711         else:
1712             try:
1713                 comparison_value = int(m.group('intval'))
1714             except ValueError:
1715                 comparison_value = parse_filesize(m.group('intval'))
1716                 if comparison_value is None:
1717                     comparison_value = parse_filesize(m.group('intval') + 'B')
1718                 if comparison_value is None:
1719                     raise ValueError(
1720                         'Invalid integer value %r in filter part %r' % (
1721                             m.group('intval'), filter_part))
1722         actual_value = dct.get(m.group('key'))
1723         if actual_value is None:
1724             return m.group('none_inclusive')
1725         return op(actual_value, comparison_value)
1726
1727     UNARY_OPERATORS = {
1728         '': lambda v: v is not None,
1729         '!': lambda v: v is None,
1730     }
1731     operator_rex = re.compile(r'''(?x)\s*
1732         (?P<op>%s)\s*(?P<key>[a-z_]+)
1733         \s*$
1734         ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
1735     m = operator_rex.search(filter_part)
1736     if m:
1737         op = UNARY_OPERATORS[m.group('op')]
1738         actual_value = dct.get(m.group('key'))
1739         return op(actual_value)
1740
1741     raise ValueError('Invalid filter part %r' % filter_part)
1742
1743
1744 def match_str(filter_str, dct):
1745     """ Filter a dictionary with a simple string syntax. Returns True (=passes filter) or false """
1746
1747     return all(
1748         _match_one(filter_part, dct) for filter_part in filter_str.split('&'))
1749
1750
1751 def match_filter_func(filter_str):
1752     def _match_func(info_dict):
1753         if match_str(filter_str, info_dict):
1754             return None
1755         else:
1756             video_title = info_dict.get('title', info_dict.get('id', 'video'))
1757             return '%s does not pass filter %s, skipping ..' % (video_title, filter_str)
1758     return _match_func