2 # -*- coding: utf-8 -*-
4 from __future__ import unicode_literals
31 import xml.etree.ElementTree
40 compat_socket_create_connection,
44 compat_urllib_parse_urlparse,
45 compat_urllib_request,
51 # This is not clearly defined otherwise
52 compiled_regex_type = type(re.compile(''))
55 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0 (Chrome)',
56 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
57 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
58 'Accept-Encoding': 'gzip, deflate',
59 'Accept-Language': 'en-us,en;q=0.5',
def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks.
    """
    try:
        pref = locale.getpreferredencoding()
        # Verify the reported codec is actually usable before trusting it.
        'TEST'.encode(pref)
    except Exception:
        pref = 'UTF-8'

    return pref
78 def write_json_file(obj, fn):
79 """ Encode obj as JSON and write it to fn, atomically if possible """
81 fn = encodeFilename(fn)
82 if sys.version_info < (3, 0) and sys.platform != 'win32':
83 encoding = get_filesystem_encoding()
84 # os.path.basename returns a bytes object, but NamedTemporaryFile
85 # will fail if the filename contains non ascii characters unless we
86 # use a unicode object
87 path_basename = lambda f: os.path.basename(fn).decode(encoding)
88 # the same for os.path.dirname
89 path_dirname = lambda f: os.path.dirname(fn).decode(encoding)
91 path_basename = os.path.basename
92 path_dirname = os.path.dirname
96 'prefix': path_basename(fn) + '.',
97 'dir': path_dirname(fn),
101 # In Python 2.x, json.dump expects a bytestream.
102 # In Python 3.x, it writes to a character stream
103 if sys.version_info < (3, 0):
111 tf = tempfile.NamedTemporaryFile(**args)
116 if sys.platform == 'win32':
117 # Need to remove existing file on Windows, else os.rename raises
118 # WindowsError or FileExistsError.
123 os.rename(tf.name, fn)
132 if sys.version_info >= (2, 7):
133 def find_xpath_attr(node, xpath, key, val):
134 """ Find the xpath xpath[@key=val] """
135 assert re.match(r'^[a-zA-Z-]+$', key)
136 assert re.match(r'^[a-zA-Z0-9@\s:._-]*$', val)
137 expr = xpath + "[@%s='%s']" % (key, val)
138 return node.find(expr)
140 def find_xpath_attr(node, xpath, key, val):
141 # Here comes the crazy part: In 2.6, if the xpath is a unicode,
142 # .//node does not match if a node is a direct child of . !
143 if isinstance(xpath, unicode):
144 xpath = xpath.encode('ascii')
146 for f in node.findall(xpath):
147 if f.attrib.get(key) == val:
151 # On python2.6 the xml.etree.ElementTree.Element methods don't support
152 # the namespace parameter
155 def xpath_with_ns(path, ns_map):
156 components = [c.split(':') for c in path.split('/')]
160 replaced.append(c[0])
163 replaced.append('{%s}%s' % (ns_map[ns], tag))
164 return '/'.join(replaced)
167 def xpath_text(node, xpath, name=None, fatal=False):
168 if sys.version_info < (2, 7): # Crazy 2.6
169 xpath = xpath.encode('ascii')
172 if n is None or n.text is None:
174 name = xpath if name is None else name
175 raise ExtractorError('Could not find XML element %s' % name)
def get_element_by_id(id, html):
    """Return the content of the tag with the specified ID in the passed HTML document"""
    # An id lookup is just an attribute lookup with attribute == 'id'.
    return get_element_by_attribute('id', id, html)
186 def get_element_by_attribute(attribute, value, html):
187 """Return the content of the tag with the specified attribute in the passed HTML document"""
189 m = re.search(r'''(?xs)
191 (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
193 (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
197 ''' % (re.escape(attribute), re.escape(value)), html)
201 res = m.group('content')
203 if res.startswith('"') or res.startswith("'"):
206 return unescapeHTML(res)
def clean_html(html):
    """Clean an HTML snippet into a readable string"""

    if html is None:  # Convenience for sanitizing descriptions etc.
        return html

    # Collapse real newlines; <br> and </p><p> boundaries become the
    # newlines of the cleaned text instead.
    html = html.replace('\n', ' ')
    html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
    html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
    # Strip any remaining tags wholesale.
    html = re.sub('<.*?>', '', html)
    # Replace html entities
    html = unescapeHTML(html)
    return html.strip()
226 def sanitize_open(filename, open_mode):
227 """Try to open the given filename, and slightly tweak it if this fails.
229 Attempts to open the given filename. If this fails, it tries to change
230 the filename slightly, step by step, until it's either able to open it
231 or it fails and raises a final exception, like the standard open()
234 It returns the tuple (stream, definitive_file_name).
238 if sys.platform == 'win32':
240 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
241 return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
242 stream = open(encodeFilename(filename), open_mode)
243 return (stream, filename)
244 except (IOError, OSError) as err:
245 if err.errno in (errno.EACCES,):
248 # In case of error, try to remove win32 forbidden chars
249 alt_filename = os.path.join(
250 re.sub('[/<>:"\\|\\\\?\\*]', '#', path_part)
251 for path_part in os.path.split(filename)
253 if alt_filename == filename:
256 # An exception here should be caught in the caller
257 stream = open(encodeFilename(filename), open_mode)
258 return (stream, alt_filename)
def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp"""
    # None is returned when the string cannot be parsed at all.
    timestamp = None
    timetuple = email.utils.parsedate_tz(timestr)
    if timetuple is not None:
        timestamp = email.utils.mktime_tz(timetuple)
    return timestamp
270 def sanitize_filename(s, restricted=False, is_id=False):
271 """Sanitizes a string so it could be used as part of a filename.
272 If restricted is set, use a stricter subset of allowed characters.
273 Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
275 def replace_insane(char):
276 if char == '?' or ord(char) < 32 or ord(char) == 127:
279 return '' if restricted else '\''
281 return '_-' if restricted else ' -'
282 elif char in '\\/|*<>':
284 if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
286 if restricted and ord(char) > 127:
290 result = ''.join(map(replace_insane, s))
292 while '__' in result:
293 result = result.replace('__', '_')
294 result = result.strip('_')
295 # Common case of "Foreign band name - English song title"
296 if restricted and result.startswith('-_'):
def orderedSet(iterable):
    """ Remove all duplicates from the input iterable """
    # Keeps first-seen order; O(n^2) membership test is fine for the
    # small sequences this is used on.
    seen = []
    for item in iterable:
        if item not in seen:
            seen.append(item)
    return seen
def _htmlentity_transform(entity):
    """Transforms an HTML entity to a character."""
    # Known non-numeric HTML entity
    if entity in compat_html_entities.name2codepoint:
        return compat_chr(compat_html_entities.name2codepoint[entity])

    # Numeric character reference: decimal (&#38;) or hexadecimal (&#x26;).
    # The hex alternative must allow the digits a-f: the previous pattern
    # r'#(x?[0-9]+)' stopped at the first non-decimal digit, so e.g.
    # '#x1f600' matched only '#x1' and decoded the wrong character.
    mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
    if mobj is not None:
        numstr = mobj.group(1)
        if numstr.startswith('x'):
            base = 16
            numstr = '0%s' % numstr
        else:
            base = 10
        return compat_chr(int(numstr, base))

    # Unknown entity in name, return its literal representation
    return ('&%s;' % entity)
335 assert type(s) == compat_str
338 r'&([^;]+);', lambda m: _htmlentity_transform(m.group(1)), s)
341 def encodeFilename(s, for_subprocess=False):
343 @param s The name of the file
346 assert type(s) == compat_str
348 # Python 3 has a Unicode API
349 if sys.version_info >= (3, 0):
352 if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
353 # Pass '' directly to use Unicode APIs on Windows 2000 and up
354 # (Detecting Windows NT 4 is tricky because 'major >= 4' would
355 # match Windows 9x series as well. Besides, NT 4 is obsolete.)
356 if not for_subprocess:
359 # For subprocess calls, encode with locale encoding
360 # Refer to http://stackoverflow.com/a/9951851/35070
361 encoding = preferredencoding()
363 encoding = sys.getfilesystemencoding()
366 return s.encode(encoding, 'ignore')
def encodeArgument(s):
    """Encode a subprocess argument the same way file names are encoded."""
    if isinstance(s, compat_str):
        return encodeFilename(s, True)
    # Legacy code that uses byte strings
    # Uncomment the following line after fixing all post processors
    # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
    return encodeFilename(s.decode('ascii'), True)
def decodeOption(optval):
    """Decode a command-line option value to compat_str (None passes through)."""
    if optval is None:
        return optval
    if isinstance(optval, bytes):
        optval = optval.decode(preferredencoding())

    assert isinstance(optval, compat_str)
    return optval
def formatSeconds(secs):
    """Render a duration in seconds as 'H:MM:SS', 'M:SS' or plain seconds."""
    if secs > 3600:
        return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
    elif secs > 60:
        return '%d:%02d' % (secs // 60, secs % 60)
    else:
        return '%d' % secs
397 def make_HTTPS_handler(params, **kwargs):
398 opts_no_check_certificate = params.get('nocheckcertificate', False)
399 if hasattr(ssl, 'create_default_context'): # Python >= 3.4 or 2.7.9
400 context = ssl.create_default_context(ssl.Purpose.CLIENT_AUTH)
401 if opts_no_check_certificate:
402 context.verify_mode = ssl.CERT_NONE
404 return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
407 # (create_default_context present but HTTPSHandler has no context=)
410 if sys.version_info < (3, 2):
413 class HTTPSConnectionV3(httplib.HTTPSConnection):
414 def __init__(self, *args, **kwargs):
415 httplib.HTTPSConnection.__init__(self, *args, **kwargs)
418 sock = socket.create_connection((self.host, self.port), self.timeout)
419 if getattr(self, '_tunnel_host', False):
423 self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_TLSv1)
425 self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_SSLv23)
427 return YoutubeDLHTTPSHandler(params, https_conn_class=HTTPSConnectionV3, **kwargs)
429 context = ssl.SSLContext(ssl.PROTOCOL_SSLv23)
430 context.verify_mode = (ssl.CERT_NONE
431 if opts_no_check_certificate
432 else ssl.CERT_REQUIRED)
433 context.set_default_verify_paths()
434 return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
437 class ExtractorError(Exception):
438 """Error during info extraction."""
440 def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
441 """ tb, if given, is the original traceback (so that it can be printed out).
442 If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
445 if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
447 if video_id is not None:
448 msg = video_id + ': ' + msg
450 msg += ' (caused by %r)' % cause
452 if ytdl_is_updateable():
453 update_cmd = 'type youtube-dl -U to update'
455 update_cmd = 'see https://yt-dl.org/update on how to update'
456 msg += '; please report this issue on https://yt-dl.org/bug .'
457 msg += ' Make sure you are using the latest version; %s.' % update_cmd
458 msg += ' Be sure to call youtube-dl with the --verbose flag and include its complete output.'
459 super(ExtractorError, self).__init__(msg)
462 self.exc_info = sys.exc_info() # preserve original exception
464 self.video_id = video_id
466 def format_traceback(self):
467 if self.traceback is None:
469 return ''.join(traceback.format_tb(self.traceback))
class UnsupportedError(ExtractorError):
    """Raised for URLs that no extractor knows how to handle."""

    def __init__(self, url):
        # expected=True: this is a normal condition, not a youtube-dl bug.
        super(UnsupportedError, self).__init__(
            'Unsupported URL: %s' % url, expected=True)
class RegexNotFoundError(ExtractorError):
    """Raised when an expected regular expression fails to match."""
    pass
class DownloadError(Exception):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """

    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        super(DownloadError, self).__init__(msg)
        # Preserved so callers can re-raise or inspect the root cause.
        self.exc_info = exc_info
class SameFileError(Exception):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
    pass
class PostProcessingError(Exception):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """

    def __init__(self, msg):
        # Message is kept on .msg for callers that display it.
        self.msg = msg
class MaxDownloadsReached(Exception):
    """Signals that the --max-downloads limit has been reached."""
    pass
class UnavailableVideoError(Exception):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """
    pass
class ContentTooShortError(Exception):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """

    def __init__(self, downloaded, expected):
        # Both sizes are byte counts.
        self.downloaded = downloaded
        self.expected = expected
548 def _create_http_connection(ydl_handler, http_class, is_https=False, *args, **kwargs):
549 hc = http_class(*args, **kwargs)
550 source_address = ydl_handler._params.get('source_address')
551 if source_address is not None:
552 sa = (source_address, 0)
553 if hasattr(hc, 'source_address'): # Python 2.7+
554 hc.source_address = sa
556 def _hc_connect(self, *args, **kwargs):
557 sock = compat_socket_create_connection(
558 (self.host, self.port), self.timeout, sa)
560 self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file)
563 hc.connect = functools.partial(_hc_connect, hc)
568 class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
569 """Handler for HTTP requests and responses.
571 This class, when installed with an OpenerDirector, automatically adds
572 the standard headers to every HTTP request and handles gzipped and
573 deflated responses from web servers. If compression is to be avoided in
574 a particular request, the original request in the program code only has
575 to include the HTTP header "Youtubedl-No-Compression", which will be
576 removed before making the real request.
578 Part of this code was copied from:
580 http://techknack.net/python-urllib2-handlers/
582 Andrew Rowls, the author of that code, agreed to release it to the
586 def __init__(self, params, *args, **kwargs):
587 compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
588 self._params = params
590 def http_open(self, req):
591 return self.do_open(functools.partial(
592 _create_http_connection, self, compat_http_client.HTTPConnection),
598 return zlib.decompress(data, -zlib.MAX_WBITS)
600 return zlib.decompress(data)
603 def addinfourl_wrapper(stream, headers, url, code):
604 if hasattr(compat_urllib_request.addinfourl, 'getcode'):
605 return compat_urllib_request.addinfourl(stream, headers, url, code)
606 ret = compat_urllib_request.addinfourl(stream, headers, url)
610 def http_request(self, req):
611 for h, v in std_headers.items():
612 if h not in req.headers:
614 if 'Youtubedl-no-compression' in req.headers:
615 if 'Accept-encoding' in req.headers:
616 del req.headers['Accept-encoding']
617 del req.headers['Youtubedl-no-compression']
618 if 'Youtubedl-user-agent' in req.headers:
619 if 'User-agent' in req.headers:
620 del req.headers['User-agent']
621 req.headers['User-agent'] = req.headers['Youtubedl-user-agent']
622 del req.headers['Youtubedl-user-agent']
624 if sys.version_info < (2, 7) and '#' in req.get_full_url():
625 # Python 2.6 is brain-dead when it comes to fragments
626 req._Request__original = req._Request__original.partition('#')[0]
627 req._Request__r_type = req._Request__r_type.partition('#')[0]
631 def http_response(self, req, resp):
634 if resp.headers.get('Content-encoding', '') == 'gzip':
635 content = resp.read()
636 gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
638 uncompressed = io.BytesIO(gz.read())
639 except IOError as original_ioerror:
640 # There may be junk add the end of the file
641 # See http://stackoverflow.com/q/4928560/35070 for details
642 for i in range(1, 1024):
644 gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
645 uncompressed = io.BytesIO(gz.read())
650 raise original_ioerror
651 resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
652 resp.msg = old_resp.msg
654 if resp.headers.get('Content-encoding', '') == 'deflate':
655 gz = io.BytesIO(self.deflate(resp.read()))
656 resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
657 resp.msg = old_resp.msg
660 https_request = http_request
661 https_response = http_response
664 class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
665 def __init__(self, params, https_conn_class=None, *args, **kwargs):
666 compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
667 self._https_conn_class = https_conn_class or compat_http_client.HTTPSConnection
668 self._params = params
670 def https_open(self, req):
671 return self.do_open(functools.partial(
672 _create_http_connection, self, self._https_conn_class, True),
676 def parse_iso8601(date_str, delimiter='T'):
677 """ Return a UNIX timestamp from the given date """
683 r'(\.[0-9]+)?(?:Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
686 timezone = datetime.timedelta()
688 date_str = date_str[:-len(m.group(0))]
689 if not m.group('sign'):
690 timezone = datetime.timedelta()
692 sign = 1 if m.group('sign') == '+' else -1
693 timezone = datetime.timedelta(
694 hours=sign * int(m.group('hours')),
695 minutes=sign * int(m.group('minutes')))
696 date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
697 dt = datetime.datetime.strptime(date_str, date_format) - timezone
698 return calendar.timegm(dt.timetuple())
701 def unified_strdate(date_str, day_first=True):
702 """Return a string with the date in the format YYYYMMDD"""
708 date_str = date_str.replace(',', ' ')
709 # %z (UTC offset) is only supported in python>=3.2
710 date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str)
711 # Remove AM/PM + timezone
712 date_str = re.sub(r'(?i)\s*(?:AM|PM)\s+[A-Z]+', '', date_str)
714 format_expressions = [
719 '%b %dst %Y %I:%M%p',
720 '%b %dnd %Y %I:%M%p',
721 '%b %dth %Y %I:%M%p',
726 '%Y-%m-%d %H:%M:%S.%f',
729 '%Y-%m-%dT%H:%M:%SZ',
730 '%Y-%m-%dT%H:%M:%S.%fZ',
731 '%Y-%m-%dT%H:%M:%S.%f0Z',
733 '%Y-%m-%dT%H:%M:%S.%f',
737 format_expressions.extend([
744 format_expressions.extend([
750 for expression in format_expressions:
752 upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
755 if upload_date is None:
756 timetuple = email.utils.parsedate_tz(date_str)
758 upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
def determine_ext(url, default_ext='unknown_video'):
    """Guess a file extension from a URL; fall back to default_ext."""
    if url is None:
        return default_ext
    # Drop the query string, then take everything after the last dot.
    guess = url.partition('?')[0].rpartition('.')[2]
    # Only purely alphanumeric tails look like real extensions.
    return guess if re.match(r'^[A-Za-z0-9]+$', guess) else default_ext
def subtitles_filename(filename, sub_lang, sub_format):
    """Derive the subtitle file name: <base>.<lang>.<format>."""
    base = filename.rsplit('.', 1)[0]
    return '.'.join((base, sub_lang, sub_format))
776 def date_from_str(date_str):
778 Return a datetime object from a string in the format YYYYMMDD or
779 (now|today)[+-][0-9](day|week|month|year)(s)?"""
780 today = datetime.date.today()
781 if date_str in ('now', 'today'):
783 if date_str == 'yesterday':
784 return today - datetime.timedelta(days=1)
785 match = re.match('(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
786 if match is not None:
787 sign = match.group('sign')
788 time = int(match.group('time'))
791 unit = match.group('unit')
792 # A bad aproximation?
800 delta = datetime.timedelta(**{unit: time})
802 return datetime.datetime.strptime(date_str, "%Y%m%d").date()
def hyphenate_date(date_str):
    """
    Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
    match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
    # Anything that is not exactly eight digits passes through unchanged.
    return '-'.join(match.groups()) if match is not None else date_str
815 class DateRange(object):
816 """Represents a time interval between two dates"""
818 def __init__(self, start=None, end=None):
819 """start and end must be strings in the format accepted by date"""
820 if start is not None:
821 self.start = date_from_str(start)
823 self.start = datetime.datetime.min.date()
825 self.end = date_from_str(end)
827 self.end = datetime.datetime.max.date()
828 if self.start > self.end:
829 raise ValueError('Date range: "%s" , the start date must be before the end date' % self)
833 """Returns a range that only contains the given day"""
836 def __contains__(self, date):
837 """Check if the date is in the range"""
838 if not isinstance(date, datetime.date):
839 date = date_from_str(date)
840 return self.start <= date <= self.end
843 return '%s - %s' % (self.start.isoformat(), self.end.isoformat())
847 """ Returns the platform name as a compat_str """
848 res = platform.platform()
849 if isinstance(res, bytes):
850 res = res.decode(preferredencoding())
852 assert isinstance(res, compat_str)
856 def _windows_write_string(s, out):
857 """ Returns True if the string was written using special methods,
858 False if it has yet to be written out."""
859 # Adapted from http://stackoverflow.com/a/3259271/35070
862 import ctypes.wintypes
870 fileno = out.fileno()
871 except AttributeError:
872 # If the output stream doesn't have a fileno, it's virtual
874 if fileno not in WIN_OUTPUT_IDS:
877 GetStdHandle = ctypes.WINFUNCTYPE(
878 ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
879 (b"GetStdHandle", ctypes.windll.kernel32))
880 h = GetStdHandle(WIN_OUTPUT_IDS[fileno])
882 WriteConsoleW = ctypes.WINFUNCTYPE(
883 ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
884 ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
885 ctypes.wintypes.LPVOID)((b"WriteConsoleW", ctypes.windll.kernel32))
886 written = ctypes.wintypes.DWORD(0)
888 GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)((b"GetFileType", ctypes.windll.kernel32))
889 FILE_TYPE_CHAR = 0x0002
890 FILE_TYPE_REMOTE = 0x8000
891 GetConsoleMode = ctypes.WINFUNCTYPE(
892 ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
893 ctypes.POINTER(ctypes.wintypes.DWORD))(
894 (b"GetConsoleMode", ctypes.windll.kernel32))
895 INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value
897 def not_a_console(handle):
898 if handle == INVALID_HANDLE_VALUE or handle is None:
900 return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR
901 or GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)
906 def next_nonbmp_pos(s):
908 return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
909 except StopIteration:
913 count = min(next_nonbmp_pos(s), 1024)
916 h, s, count if count else 2, ctypes.byref(written), None)
918 raise OSError('Failed to write string')
919 if not count: # We just wrote a non-BMP character
920 assert written.value == 2
923 assert written.value > 0
924 s = s[written.value:]
928 def write_string(s, out=None, encoding=None):
931 assert type(s) == compat_str
933 if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
934 if _windows_write_string(s, out):
937 if ('b' in getattr(out, 'mode', '') or
938 sys.version_info[0] < 3): # Python 2 lies about mode of sys.stderr
939 byt = s.encode(encoding or preferredencoding(), 'ignore')
941 elif hasattr(out, 'buffer'):
942 enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
943 byt = s.encode(enc, 'ignore')
944 out.buffer.write(byt)
def bytes_to_intlist(bs):
    """Turn a byte string into a list of integer byte values."""
    if not bs:
        return []
    if isinstance(bs[0], int):  # Python 3
        return list(bs)
    else:
        # Python 2 byte strings iterate as 1-char strings.
        return [ord(c) for c in bs]
def intlist_to_bytes(xs):
    """Pack a list of integer byte values back into a byte string."""
    if not xs:
        return b''
    return struct_pack('%dB' % len(xs), *xs)
965 # Cross-platform file locking
966 if sys.platform == 'win32':
967 import ctypes.wintypes
970 class OVERLAPPED(ctypes.Structure):
972 ('Internal', ctypes.wintypes.LPVOID),
973 ('InternalHigh', ctypes.wintypes.LPVOID),
974 ('Offset', ctypes.wintypes.DWORD),
975 ('OffsetHigh', ctypes.wintypes.DWORD),
976 ('hEvent', ctypes.wintypes.HANDLE),
979 kernel32 = ctypes.windll.kernel32
980 LockFileEx = kernel32.LockFileEx
981 LockFileEx.argtypes = [
982 ctypes.wintypes.HANDLE, # hFile
983 ctypes.wintypes.DWORD, # dwFlags
984 ctypes.wintypes.DWORD, # dwReserved
985 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
986 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
987 ctypes.POINTER(OVERLAPPED) # Overlapped
989 LockFileEx.restype = ctypes.wintypes.BOOL
990 UnlockFileEx = kernel32.UnlockFileEx
991 UnlockFileEx.argtypes = [
992 ctypes.wintypes.HANDLE, # hFile
993 ctypes.wintypes.DWORD, # dwReserved
994 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
995 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
996 ctypes.POINTER(OVERLAPPED) # Overlapped
998 UnlockFileEx.restype = ctypes.wintypes.BOOL
999 whole_low = 0xffffffff
1000 whole_high = 0x7fffffff
1002 def _lock_file(f, exclusive):
1003 overlapped = OVERLAPPED()
1004 overlapped.Offset = 0
1005 overlapped.OffsetHigh = 0
1006 overlapped.hEvent = 0
1007 f._lock_file_overlapped_p = ctypes.pointer(overlapped)
1008 handle = msvcrt.get_osfhandle(f.fileno())
1009 if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
1010 whole_low, whole_high, f._lock_file_overlapped_p):
1011 raise OSError('Locking file failed: %r' % ctypes.FormatError())
1013 def _unlock_file(f):
1014 assert f._lock_file_overlapped_p
1015 handle = msvcrt.get_osfhandle(f.fileno())
1016 if not UnlockFileEx(handle, 0,
1017 whole_low, whole_high, f._lock_file_overlapped_p):
1018 raise OSError('Unlocking file failed: %r' % ctypes.FormatError())
1023 def _lock_file(f, exclusive):
1024 fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)
1026 def _unlock_file(f):
1027 fcntl.flock(f, fcntl.LOCK_UN)
1030 class locked_file(object):
1031 def __init__(self, filename, mode, encoding=None):
1032 assert mode in ['r', 'a', 'w']
1033 self.f = io.open(filename, mode, encoding=encoding)
1036 def __enter__(self):
1037 exclusive = self.mode != 'r'
1039 _lock_file(self.f, exclusive)
1045 def __exit__(self, etype, value, traceback):
1047 _unlock_file(self.f)
1054 def write(self, *args):
1055 return self.f.write(*args)
1057 def read(self, *args):
1058 return self.f.read(*args)
def get_filesystem_encoding():
    """Return sys.getfilesystemencoding(), defaulting to 'utf-8' when unset."""
    enc = sys.getfilesystemencoding()
    if enc is None:
        return 'utf-8'
    return enc
1066 def shell_quote(args):
1068 encoding = get_filesystem_encoding()
1070 if isinstance(a, bytes):
1071 # We may get a filename encoded with 'encodeFilename'
1072 a = a.decode(encoding)
1073 quoted_args.append(pipes.quote(a))
1074 return ' '.join(quoted_args)
1077 def takewhile_inclusive(pred, seq):
1078 """ Like itertools.takewhile, but include the latest evaluated element
1079 (the first element so that Not pred(e)) """
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use. """

    # Data rides along JSON-encoded in the fragment, which servers ignore.
    payload = compat_urllib_parse.urlencode(
        {'__youtubedl_smuggle': json.dumps(data)})
    return '%s#%s' % (url, payload)
def unsmuggle_url(smug_url, default=None):
    """Inverse of smuggle_url(): return (url, data) or (url, default)."""
    if '#__youtubedl_smuggle' not in smug_url:
        return smug_url, default
    url, _, sdata = smug_url.rpartition('#')
    jsond = compat_parse_qs(sdata)['__youtubedl_smuggle'][0]
    data = json.loads(jsond)
    return url, data
1103 def format_bytes(bytes):
1106 if type(bytes) is str:
1107 bytes = float(bytes)
1111 exponent = int(math.log(bytes, 1024.0))
1112 suffix = ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB'][exponent]
1113 converted = float(bytes) / float(1024 ** exponent)
1114 return '%.2f%s' % (converted, suffix)
1117 def parse_filesize(s):
1121 # The lower-case forms are of course incorrect and inofficial,
1122 # but we support those too
1160 units_re = '|'.join(re.escape(u) for u in _UNIT_TABLE)
1162 r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)' % units_re, s)
1166 num_str = m.group('num').replace(',', '.')
1167 mult = _UNIT_TABLE[m.group('unit')]
1168 return int(float(num_str) * mult)
1171 def get_term_width():
1172 columns = compat_getenv('COLUMNS', None)
1177 sp = subprocess.Popen(
1179 stdout=subprocess.PIPE, stderr=subprocess.PIPE)
1180 out, err = sp.communicate()
1181 return int(out.split()[1])
def month_by_name(name):
    """ Return the number of a month by (locale-independently) English name """
    months = (
        'January', 'February', 'March', 'April', 'May', 'June',
        'July', 'August', 'September', 'October', 'November', 'December')
    try:
        return months.index(name) + 1
    except ValueError:
        # Unknown (e.g. localized) month names yield None.
        return None
def fix_xml_ampersands(xml_str):
    """Replace all the '&' by '&amp;' in XML"""
    # Negative lookahead keeps already-escaped entities and numeric
    # character references (decimal and hex) untouched.
    return re.sub(
        r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
        '&amp;',
        xml_str)
1207 def setproctitle(title):
1208 assert isinstance(title, compat_str)
1210 libc = ctypes.cdll.LoadLibrary("libc.so.6")
1213 title_bytes = title.encode('utf-8')
1214 buf = ctypes.create_string_buffer(len(title_bytes))
1215 buf.value = title_bytes
1217 libc.prctl(15, buf, 0, 0, 0)
1218 except AttributeError:
1219 return # Strange libc, just skip this
def remove_start(s, start):
    """Return s with the leading prefix start removed when present."""
    return s[len(start):] if s.startswith(start) else s
def remove_end(s, end):
    """Return s with the trailing suffix end removed when present.

    The explicit truthiness check guards the empty-suffix case: every
    string endswith(''), and s[:-len('')] is s[:-0] == '', which would
    wrongly wipe the whole string.
    """
    if end and s.endswith(end):
        return s[:-len(end)]
    return s
def url_basename(url):
    """Return the final path component of url (query and fragment excluded)."""
    parsed = compat_urlparse.urlparse(url)
    return parsed.path.strip('/').split('/')[-1]
class HEADRequest(compat_urllib_request.Request):
    """A urllib Request that issues an HTTP HEAD instead of a GET."""

    def get_method(self):
        return 'HEAD'
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    """int(v) * invscale // scale, or default when v is None.

    When get_attr is given, the value is first fetched from that
    attribute of v.
    """
    if get_attr and v is not None:
        v = getattr(v, get_attr, None)
    if v is None:
        return default
    return int(v) * invscale // scale
def str_or_none(v, default=None):
    """compat_str(v), or default when v is None."""
    if v is None:
        return default
    return compat_str(v)
def str_to_int(int_str):
    """ A more relaxed version of int_or_none """
    if int_str is None:
        return None
    # Drop separators and a leading '+' before the conversion.
    int_str = re.sub(r'[,\.\+]', '', int_str)
    return int(int_str)
def float_or_none(v, scale=1, invscale=1, default=None):
    """float(v) * invscale / scale, or default when v is None."""
    if v is None:
        return default
    return float(v) * invscale / scale
1269 def parse_duration(s):
1278 (?P<only_mins>[0-9.]+)\s*(?:mins?|minutes?)\s*|
1279 (?P<only_hours>[0-9.]+)\s*(?:hours?)|
1282 (?:(?P<hours>[0-9]+)\s*(?:[:h]|hours?)\s*)?
1283 (?P<mins>[0-9]+)\s*(?:[:m]|mins?|minutes?)\s*
1285 (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*(?:s|secs?|seconds?)?
1290 if m.group('only_mins'):
1291 return float_or_none(m.group('only_mins'), invscale=60)
1292 if m.group('only_hours'):
1293 return float_or_none(m.group('only_hours'), invscale=60 * 60)
1295 res += int(m.group('secs'))
1297 res += int(m.group('mins')) * 60
1298 if m.group('hours'):
1299 res += int(m.group('hours')) * 60 * 60
1301 res += float(m.group('ms'))
def prepend_extension(filename, ext):
    """Insert ext before the existing extension: a.mp4 -> a.<ext>.mp4."""
    base, tail = os.path.splitext(filename)
    return '%s.%s%s' % (base, ext, tail)
def check_executable(exe, args=[]):
    """ Checks if the given binary is installed somewhere in PATH, and returns its name.
    args can be a list of arguments for a short output (like -version) """
    try:
        # Launch once and discard output; failure to spawn means "absent".
        subprocess.Popen([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
    except OSError:
        return False
    return exe
1320 def get_exe_version(exe, args=['--version'],
1321 version_re=None, unrecognized='present'):
1322 """ Returns the version of the specified executable,
1323 or False if the executable is not present """
1325 out, _ = subprocess.Popen(
1327 stdout=subprocess.PIPE, stderr=subprocess.STDOUT).communicate()
1330 if isinstance(out, bytes): # Python 2.x
1331 out = out.decode('ascii', 'ignore')
1332 return detect_exe_version(out, version_re, unrecognized)
1335 def detect_exe_version(output, version_re=None, unrecognized='present'):
1336 assert isinstance(output, compat_str)
1337 if version_re is None:
1338 version_re = r'version\s+([-0-9._a-zA-Z]+)'
1339 m = re.search(version_re, output)
1346 class PagedList(object):
1348 # This is only useful for tests
1349 return len(self.getslice())
class OnDemandPagedList(PagedList):
    # Paged list whose pages are fetched on demand by calling
    # pagefunc(pagenum); each page holds at most `pagesize` entries.
    def __init__(self, pagefunc, pagesize):
        self._pagefunc = pagefunc
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        # Walk pages beginning with the one containing `start`.
        for pagenum in itertools.count(start // self._pagesize):
            # Absolute indices covered by this page: [firstid, nextfirstid).
            firstid = pagenum * self._pagesize
            nextfirstid = pagenum * self._pagesize + self._pagesize
            if start >= nextfirstid:
            page_results = list(self._pagefunc(pagenum))
            # In-page offsets of the requested slice (only on boundary pages).
                start % self._pagesize
                if firstid <= start < nextfirstid
                ((end - 1) % self._pagesize) + 1
                if (end is not None and firstid <= end <= nextfirstid)
            if startv != 0 or endv is not None:
                page_results = page_results[startv:endv]
            res.extend(page_results)

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            if len(page_results) + startv < self._pagesize:

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == nextfirstid:
class InAdvancePagedList(PagedList):
    # Paged list where the total page count is known up front, so the
    # exact page range to fetch can be computed before iterating.
    def __init__(self, pagefunc, pagecount, pagesize):
        self._pagefunc = pagefunc
        self._pagecount = pagecount
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        # First page containing `start`.
        start_page = start // self._pagesize
        # Last page to visit (exclusive), bounded by the known page count.
            self._pagecount if end is None else (end // self._pagesize + 1))
        # Leading items on the first page that precede `start`.
        skip_elems = start - start_page * self._pagesize
        # Remaining item budget, or None when the slice is open-ended.
        only_more = None if end is None else end - start
        for pagenum in range(start_page, end_page):
            page = list(self._pagefunc(pagenum))
                page = page[skip_elems:]
            if only_more is not None:
                if len(page) < only_more:
                    only_more -= len(page)
                    page = page[:only_more]
def uppercase_escape(s):
    # Decode literal \UXXXXXXXX escape sequences in `s` into the
    # characters they denote, leaving everything else untouched.
    unicode_escape = codecs.getdecoder('unicode_escape')
        r'\\U[0-9a-fA-F]{8}',
        lambda m: unicode_escape(m.group(0))[0],
def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986."""
    # Python 2's quote() needs a byte string; the `and` keeps the
    # `unicode` name from being evaluated on Python 3.
    if sys.version_info < (3, 0) and isinstance(s, unicode):
        s = s.encode('utf-8')
    safe_chars = b"%/;:@&=+$,!~*'()?#[]"
    return compat_urllib_parse.quote(s, safe_chars)
def escape_url(url):
    """Escape URL as suggested by RFC 3986"""
    url_parsed = compat_urllib_parse_urlparse(url)
    # Re-escape each component separately; the netloc is not
    # re-escaped here.
    return url_parsed._replace(
        path=escape_rfc3986(url_parsed.path),
        params=escape_rfc3986(url_parsed.params),
        query=escape_rfc3986(url_parsed.query),
        fragment=escape_rfc3986(url_parsed.fragment)
    # NOTE(review): appears to probe whether struct.pack accepts a text
    # (str) format string — the surrounding try/except lines are not shown.
    struct.pack('!I', 0)
    # In Python 2.6 (and some 2.7 versions), struct requires a bytes argument
    def struct_pack(spec, *args):
        # Encode a text format spec to bytes before delegating.
        if isinstance(spec, compat_str):
            spec = spec.encode('ascii')
        return struct.pack(spec, *args)

    def struct_unpack(spec, *args):
        # Same bytes-spec workaround for unpacking.
        if isinstance(spec, compat_str):
            spec = spec.encode('ascii')
        return struct.unpack(spec, *args)
    # Pythons whose struct accepts str specs: use the module directly.
    struct_pack = struct.pack
    struct_unpack = struct.unpack
def read_batch_urls(batch_fd):
    # Read URLs from a file-like object, one per line, normalizing
    # encoding and dropping BOMs and comment lines.
        if not isinstance(url, compat_str):
            url = url.decode('utf-8', 'replace')
        # UTF-8 BOM as it appears when the file was decoded as Latin-1.
        BOM_UTF8 = '\xef\xbb\xbf'
        if url.startswith(BOM_UTF8):
            url = url[len(BOM_UTF8):]
        # Lines starting with '#', ';' or ']' are treated as comments.
        if url.startswith(('#', ';', ']')):

    # Close the file descriptor even if iteration raises.
    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]
def urlencode_postdata(*args, **kargs):
    """URL-encode POST data and return it as ASCII bytes."""
    encoded = compat_urllib_parse.urlencode(*args, **kargs)
    return encoded.encode('ascii')
    # Element.iter exists on Python >= 2.7; fall back to findall below.
    etree_iter = xml.etree.ElementTree.Element.iter
except AttributeError:  # Python <=2.6
    etree_iter = lambda n: n.findall('.//*')
    # TreeBuilder variant that silently ignores DOCTYPE declarations.
    class TreeBuilder(xml.etree.ElementTree.TreeBuilder):
        def doctype(self, name, pubid, system):
            pass  # Ignore doctypes

    parser = xml.etree.ElementTree.XMLParser(target=TreeBuilder())
    # Pass the custom parser only on Python >= 2.7.
    kwargs = {'parser': parser} if sys.version_info >= (2, 7) else {}
    tree = xml.etree.ElementTree.XML(s.encode('utf-8'), **kwargs)
    # Fix up XML parser in Python 2.x
    if sys.version_info < (3, 0):
        for n in etree_iter(tree):
            if n.text is not None:
                if not isinstance(n.text, compat_str):
                    n.text = n.text.decode('utf-8')
def parse_age_limit(s):
    """Parse an age-limit string like '18' or '18+' into an int.

    Returns None when `s` is None; for strings that are not a bare
    one-or-two-digit age, falls back to the US_RATINGS lookup table.
    """
    if s is None:
        # Guard: re.match would raise TypeError on a None pattern target.
        return None
    m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
    return int(m.group('age')) if m else US_RATINGS.get(s, None)
def strip_jsonp(code):
    # Strip a JSONP wrapper `callback(...);` (plus optional trailing
    # `//` comments), leaving only the payload inside the parentheses.
        r'(?s)^[a-zA-Z0-9_]+\s*\(\s*(.*)\);?\s*?(?://[^\n]*)*$', r'\1', code)
def js_to_json(code):
    # Convert a JavaScript object literal into (approximately) valid JSON:
    # re-quote keys/strings and drop trailing commas.
        if v in ('true', 'false', 'null'):
        if v.startswith('"'):
        if v.startswith("'"):
        # Convert a single-quoted string: unescape \' and escape bare ".
        v = re.sub(r"\\\\|\\'|\"", lambda m: {
    # Tokenize: double-quoted strings, single-quoted strings, bare words.
    res = re.sub(r'''(?x)
        "(?:[^"\\]*(?:\\\\|\\")?)*"|
        '(?:[^'\\]*(?:\\\\|\\')?)*'|
        [a-zA-Z_][a-zA-Z_0-9]*
    # Remove a trailing comma before a closing bracket.
    res = re.sub(r',(\s*\])', lambda m: m.group(1), res)
def qualities(quality_ids):
    """ Get a numeric quality value out of a list of possible values """
    # The returned callable maps a quality id to its position in
    # `quality_ids` (higher index = better quality).
        return quality_ids.index(qid)
# Default output filename template: "<title>-<id>.<ext>".
DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
def limit_length(s, length):
    """ Add ellipses to overly long strings """
    # Truncate so the result (including the ellipsis) fits in `length`.
        return s[:length - len(ELLIPSES)] + ELLIPSES
def version_tuple(v):
    """Split a version string on '.' or '-' into a tuple of ints."""
    parts = re.split(r'[-.]', v)
    return tuple(map(int, parts))
def is_outdated_version(version, limit, assume_new=True):
    # Compare dotted version strings; `assume_new` decides the answer
    # when `version` is missing or unparseable.
        return not assume_new
        return version_tuple(version) < version_tuple(limit)
        return not assume_new
def ytdl_is_updateable():
    """ Returns if youtube-dl can be updated with -U """
    from zipimport import zipimporter
    # Updateable when running from a zipimport bundle or a frozen binary.
    loader = globals().get('__loader__')
    if isinstance(loader, zipimporter):
        return True
    return hasattr(sys, 'frozen')
def args_to_str(args):
    # Render a subprocess command list as a single shell-quoted string
    # (intended for short, human-readable display).
    quoted = [shlex_quote(part) for part in args]
    return ' '.join(quoted)
def urlhandle_detect_ext(url_handle):
    # Guess a file extension from the response's Content-Type header.
        getheader = lambda h: url_handle.headers[h]
    except AttributeError:  # Python < 3
        getheader = url_handle.info().getheader

    # E.g. "video/mp4" -> "mp4".
    return getheader('Content-Type').split("/")[1]
def age_restricted(content_limit, age_limit):
    """ Returns True iff the content should be blocked """
    # Defect fix: the `age_limit is None` branch had no body, so an
    # unset limit fell through instead of allowing the content.
    if age_limit is None:  # No limit set
        return False
    if content_limit is None:
        return False  # Content available for everyone
    return age_limit < content_limit