youtube_dl/utils.py

   1 #!/usr/bin/env python
   2 # -*- coding: utf-8 -*-
   3
   4 from __future__ import unicode_literals
   5
   6 import calendar
   7 import codecs
   8 import contextlib
   9 import ctypes
  10 import datetime
  11 import email.utils
  12 import errno
  13 import functools
  14 import gzip
  15 import itertools
  16 import io
  17 import json
  18 import locale
  19 import math
  20 import os
  21 import pipes
  22 import platform
  23 import re
  24 import ssl
  25 import socket
  26 import struct
  27 import subprocess
  28 import sys
  29 import tempfile
  30 import traceback
  31 import xml.etree.ElementTree
  32 import zlib
  33
  34 from .compat import (
  35     compat_chr,
  36     compat_getenv,
  37     compat_html_entities,
  38     compat_http_client,
  39     compat_parse_qs,
  40     compat_socket_create_connection,
  41     compat_str,
  42     compat_urllib_error,
  43     compat_urllib_parse,
  44     compat_urllib_parse_urlparse,
  45     compat_urllib_request,
  46     compat_urlparse,
  47     shlex_quote,
  48 )
  49
  50
# The type of compiled regular expressions is not clearly defined otherwise,
# so derive it from an actual compiled pattern.
compiled_regex_type = type(re.compile(''))

# Default HTTP request headers. YoutubeDLHandler.http_request adds each of
# these to every outgoing request that does not already carry that header.
std_headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0 (Chrome)',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
}
  61
  62
  63 def preferredencoding():
  64     """Get preferred encoding.
  65
  66     Returns the best encoding scheme for the system, based on
  67     locale.getpreferredencoding() and some further tweaks.
  68     """
  69     try:
  70         pref = locale.getpreferredencoding()
  71         'TEST'.encode(pref)
  72     except:
  73         pref = 'UTF-8'
  74
  75     return pref
  76
  77
  78 def write_json_file(obj, fn):
  79     """ Encode obj as JSON and write it to fn, atomically if possible """
  80
  81     fn = encodeFilename(fn)
  82     if sys.version_info < (3, 0) and sys.platform != 'win32':
  83         encoding = get_filesystem_encoding()
  84         # os.path.basename returns a bytes object, but NamedTemporaryFile
  85         # will fail if the filename contains non ascii characters unless we
  86         # use a unicode object
  87         path_basename = lambda f: os.path.basename(fn).decode(encoding)
  88         # the same for os.path.dirname
  89         path_dirname = lambda f: os.path.dirname(fn).decode(encoding)
  90     else:
  91         path_basename = os.path.basename
  92         path_dirname = os.path.dirname
  93
  94     args = {
  95         'suffix': '.tmp',
  96         'prefix': path_basename(fn) + '.',
  97         'dir': path_dirname(fn),
  98         'delete': False,
  99     }
 100
 101     # In Python 2.x, json.dump expects a bytestream.
 102     # In Python 3.x, it writes to a character stream
 103     if sys.version_info < (3, 0):
 104         args['mode'] = 'wb'
 105     else:
 106         args.update({
 107             'mode': 'w',
 108             'encoding': 'utf-8',
 109         })
 110
 111     tf = tempfile.NamedTemporaryFile(**args)
 112
 113     try:
 114         with tf:
 115             json.dump(obj, tf)
 116         if sys.platform == 'win32':
 117             # Need to remove existing file on Windows, else os.rename raises
 118             # WindowsError or FileExistsError.
 119             try:
 120                 os.unlink(fn)
 121             except OSError:
 122                 pass
 123         os.rename(tf.name, fn)
 124     except:
 125         try:
 126             os.remove(tf.name)
 127         except OSError:
 128             pass
 129         raise
 130
 131
 132 if sys.version_info >= (2, 7):
 133     def find_xpath_attr(node, xpath, key, val):
 134         """ Find the xpath xpath[@key=val] """
 135         assert re.match(r'^[a-zA-Z-]+$', key)
 136         assert re.match(r'^[a-zA-Z0-9@\s:._-]*$', val)
 137         expr = xpath + "[@%s='%s']" % (key, val)
 138         return node.find(expr)
 139 else:
 140     def find_xpath_attr(node, xpath, key, val):
 141         # Here comes the crazy part: In 2.6, if the xpath is a unicode,
 142         # .//node does not match if a node is a direct child of . !
 143         if isinstance(xpath, unicode):
 144             xpath = xpath.encode('ascii')
 145
 146         for f in node.findall(xpath):
 147             if f.attrib.get(key) == val:
 148                 return f
 149         return None
 150
 151 # On python2.6 the xml.etree.ElementTree.Element methods don't support
 152 # the namespace parameter
 153
 154
 155 def xpath_with_ns(path, ns_map):
 156     components = [c.split(':') for c in path.split('/')]
 157     replaced = []
 158     for c in components:
 159         if len(c) == 1:
 160             replaced.append(c[0])
 161         else:
 162             ns, tag = c
 163             replaced.append('{%s}%s' % (ns_map[ns], tag))
 164     return '/'.join(replaced)
 165
 166
 167 def xpath_text(node, xpath, name=None, fatal=False):
 168     if sys.version_info < (2, 7):  # Crazy 2.6
 169         xpath = xpath.encode('ascii')
 170
 171     n = node.find(xpath)
 172     if n is None or n.text is None:
 173         if fatal:
 174             name = xpath if name is None else name
 175             raise ExtractorError('Could not find XML element %s' % name)
 176         else:
 177             return None
 178     return n.text
 179
 180
 181 def get_element_by_id(id, html):
 182     """Return the content of the tag with the specified ID in the passed HTML document"""
 183     return get_element_by_attribute("id", id, html)
 184
 185
 186 def get_element_by_attribute(attribute, value, html):
 187     """Return the content of the tag with the specified attribute in the passed HTML document"""
 188
 189     m = re.search(r'''(?xs)
 190         <([a-zA-Z0-9:._-]+)
 191          (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
 192          \s+%s=['"]?%s['"]?
 193          (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
 194         \s*>
 195         (?P<content>.*?)
 196         </\1>
 197     ''' % (re.escape(attribute), re.escape(value)), html)
 198
 199     if not m:
 200         return None
 201     res = m.group('content')
 202
 203     if res.startswith('"') or res.startswith("'"):
 204         res = res[1:-1]
 205
 206     return unescapeHTML(res)
 207
 208
 209 def clean_html(html):
 210     """Clean an HTML snippet into a readable string"""
 211
 212     if html is None:  # Convenience for sanitizing descriptions etc.
 213         return html
 214
 215     # Newline vs <br />
 216     html = html.replace('\n', ' ')
 217     html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
 218     html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
 219     # Strip html tags
 220     html = re.sub('<.*?>', '', html)
 221     # Replace html entities
 222     html = unescapeHTML(html)
 223     return html.strip()
 224
 225
 226 def sanitize_open(filename, open_mode):
 227     """Try to open the given filename, and slightly tweak it if this fails.
 228
 229     Attempts to open the given filename. If this fails, it tries to change
 230     the filename slightly, step by step, until it's either able to open it
 231     or it fails and raises a final exception, like the standard open()
 232     function.
 233
 234     It returns the tuple (stream, definitive_file_name).
 235     """
 236     try:
 237         if filename == '-':
 238             if sys.platform == 'win32':
 239                 import msvcrt
 240                 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
 241             return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
 242         stream = open(encodeFilename(filename), open_mode)
 243         return (stream, filename)
 244     except (IOError, OSError) as err:
 245         if err.errno in (errno.EACCES,):
 246             raise
 247
 248         # In case of error, try to remove win32 forbidden chars
 249         alt_filename = os.path.join(
 250             re.sub('[/<>:"\\|\\\\?\\*]', '#', path_part)
 251             for path_part in os.path.split(filename)
 252         )
 253         if alt_filename == filename:
 254             raise
 255         else:
 256             # An exception here should be caught in the caller
 257             stream = open(encodeFilename(filename), open_mode)
 258             return (stream, alt_filename)
 259
 260
 261 def timeconvert(timestr):
 262     """Convert RFC 2822 defined time string into system timestamp"""
 263     timestamp = None
 264     timetuple = email.utils.parsedate_tz(timestr)
 265     if timetuple is not None:
 266         timestamp = email.utils.mktime_tz(timetuple)
 267     return timestamp
 268
 269
 270 def sanitize_filename(s, restricted=False, is_id=False):
 271     """Sanitizes a string so it could be used as part of a filename.
 272     If restricted is set, use a stricter subset of allowed characters.
 273     Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
 274     """
 275     def replace_insane(char):
 276         if char == '?' or ord(char) < 32 or ord(char) == 127:
 277             return ''
 278         elif char == '"':
 279             return '' if restricted else '\''
 280         elif char == ':':
 281             return '_-' if restricted else ' -'
 282         elif char in '\\/|*<>':
 283             return '_'
 284         if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
 285             return '_'
 286         if restricted and ord(char) > 127:
 287             return '_'
 288         return char
 289
 290     # Handle timestamps
 291     s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)
 292     result = ''.join(map(replace_insane, s))
 293     if not is_id:
 294         while '__' in result:
 295             result = result.replace('__', '_')
 296         result = result.strip('_')
 297         # Common case of "Foreign band name - English song title"
 298         if restricted and result.startswith('-_'):
 299             result = result[2:]
 300         if not result:
 301             result = '_'
 302     return result
 303
 304
 305 def orderedSet(iterable):
 306     """ Remove all duplicates from the input iterable """
 307     res = []
 308     for el in iterable:
 309         if el not in res:
 310             res.append(el)
 311     return res
 312
 313
 314 def _htmlentity_transform(entity):
 315     """Transforms an HTML entity to a character."""
 316     # Known non-numeric HTML entity
 317     if entity in compat_html_entities.name2codepoint:
 318         return compat_chr(compat_html_entities.name2codepoint[entity])
 319
 320     mobj = re.match(r'#(x?[0-9]+)', entity)
 321     if mobj is not None:
 322         numstr = mobj.group(1)
 323         if numstr.startswith('x'):
 324             base = 16
 325             numstr = '0%s' % numstr
 326         else:
 327             base = 10
 328         return compat_chr(int(numstr, base))
 329
 330     # Unknown entity in name, return its literal representation
 331     return ('&%s;' % entity)
 332
 333
 334 def unescapeHTML(s):
 335     if s is None:
 336         return None
 337     assert type(s) == compat_str
 338
 339     return re.sub(
 340         r'&([^;]+);', lambda m: _htmlentity_transform(m.group(1)), s)
 341
 342
 343 def encodeFilename(s, for_subprocess=False):
 344     """
 345     @param s The name of the file
 346     """
 347
 348     assert type(s) == compat_str
 349
 350     # Python 3 has a Unicode API
 351     if sys.version_info >= (3, 0):
 352         return s
 353
 354     if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
 355         # Pass '' directly to use Unicode APIs on Windows 2000 and up
 356         # (Detecting Windows NT 4 is tricky because 'major >= 4' would
 357         # match Windows 9x series as well. Besides, NT 4 is obsolete.)
 358         if not for_subprocess:
 359             return s
 360         else:
 361             # For subprocess calls, encode with locale encoding
 362             # Refer to http://stackoverflow.com/a/9951851/35070
 363             encoding = preferredencoding()
 364     else:
 365         encoding = sys.getfilesystemencoding()
 366     if encoding is None:
 367         encoding = 'utf-8'
 368     return s.encode(encoding, 'ignore')
 369
 370
 371 def encodeArgument(s):
 372     if not isinstance(s, compat_str):
 373         # Legacy code that uses byte strings
 374         # Uncomment the following line after fixing all post processors
 375         # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
 376         s = s.decode('ascii')
 377     return encodeFilename(s, True)
 378
 379
 380 def decodeOption(optval):
 381     if optval is None:
 382         return optval
 383     if isinstance(optval, bytes):
 384         optval = optval.decode(preferredencoding())
 385
 386     assert isinstance(optval, compat_str)
 387     return optval
 388
 389
 390 def formatSeconds(secs):
 391     if secs > 3600:
 392         return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
 393     elif secs > 60:
 394         return '%d:%02d' % (secs // 60, secs % 60)
 395     else:
 396         return '%d' % secs
 397
 398
 399 def make_HTTPS_handler(params, **kwargs):
 400     opts_no_check_certificate = params.get('nocheckcertificate', False)
 401     if hasattr(ssl, 'create_default_context'):  # Python >= 3.4 or 2.7.9
 402         context = ssl.create_default_context(ssl.Purpose.CLIENT_AUTH)
 403         if opts_no_check_certificate:
 404             context.verify_mode = ssl.CERT_NONE
 405         try:
 406             return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
 407         except TypeError:
 408             # Python 2.7.8
 409             # (create_default_context present but HTTPSHandler has no context=)
 410             pass
 411
 412     if sys.version_info < (3, 2):
 413         import httplib
 414
 415         class HTTPSConnectionV3(httplib.HTTPSConnection):
 416             def __init__(self, *args, **kwargs):
 417                 httplib.HTTPSConnection.__init__(self, *args, **kwargs)
 418
 419             def connect(self):
 420                 sock = socket.create_connection((self.host, self.port), self.timeout)
 421                 if getattr(self, '_tunnel_host', False):
 422                     self.sock = sock
 423                     self._tunnel()
 424                 try:
 425                     self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_TLSv1)
 426                 except ssl.SSLError:
 427                     self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_SSLv23)
 428
 429         return YoutubeDLHTTPSHandler(params, https_conn_class=HTTPSConnectionV3, **kwargs)
 430     else:  # Python < 3.4
 431         context = ssl.SSLContext(ssl.PROTOCOL_SSLv23)
 432         context.verify_mode = (ssl.CERT_NONE
 433                                if opts_no_check_certificate
 434                                else ssl.CERT_REQUIRED)
 435         context.set_default_verify_paths()
 436         return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
 437
 438
 439 class ExtractorError(Exception):
 440     """Error during info extraction."""
 441
 442     def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
 443         """ tb, if given, is the original traceback (so that it can be printed out).
 444         If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
 445         """
 446
 447         if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
 448             expected = True
 449         if video_id is not None:
 450             msg = video_id + ': ' + msg
 451         if cause:
 452             msg += ' (caused by %r)' % cause
 453         if not expected:
 454             if ytdl_is_updateable():
 455                 update_cmd = 'type  youtube-dl -U  to update'
 456             else:
 457                 update_cmd = 'see  https://yt-dl.org/update  on how to update'
 458             msg += '; please report this issue on https://yt-dl.org/bug .'
 459             msg += ' Make sure you are using the latest version; %s.' % update_cmd
 460             msg += ' Be sure to call youtube-dl with the --verbose flag and include its complete output.'
 461         super(ExtractorError, self).__init__(msg)
 462
 463         self.traceback = tb
 464         self.exc_info = sys.exc_info()  # preserve original exception
 465         self.cause = cause
 466         self.video_id = video_id
 467
 468     def format_traceback(self):
 469         if self.traceback is None:
 470             return None
 471         return ''.join(traceback.format_tb(self.traceback))
 472
 473
 474 class UnsupportedError(ExtractorError):
 475     def __init__(self, url):
 476         super(UnsupportedError, self).__init__(
 477             'Unsupported URL: %s' % url, expected=True)
 478         self.url = url
 479
 480
 481 class RegexNotFoundError(ExtractorError):
 482     """Error when a regex didn't match"""
 483     pass
 484
 485
 486 class DownloadError(Exception):
 487     """Download Error exception.
 488
 489     This exception may be thrown by FileDownloader objects if they are not
 490     configured to continue on errors. They will contain the appropriate
 491     error message.
 492     """
 493
 494     def __init__(self, msg, exc_info=None):
 495         """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
 496         super(DownloadError, self).__init__(msg)
 497         self.exc_info = exc_info
 498
 499
 500 class SameFileError(Exception):
 501     """Same File exception.
 502
 503     This exception will be thrown by FileDownloader objects if they detect
 504     multiple files would have to be downloaded to the same file on disk.
 505     """
 506     pass
 507
 508
 509 class PostProcessingError(Exception):
 510     """Post Processing exception.
 511
 512     This exception may be raised by PostProcessor's .run() method to
 513     indicate an error in the postprocessing task.
 514     """
 515
 516     def __init__(self, msg):
 517         self.msg = msg
 518
 519
 520 class MaxDownloadsReached(Exception):
 521     """ --max-downloads limit has been reached. """
 522     pass
 523
 524
 525 class UnavailableVideoError(Exception):
 526     """Unavailable Format exception.
 527
 528     This exception will be thrown when a video is requested
 529     in a format that is not available for that video.
 530     """
 531     pass
 532
 533
 534 class ContentTooShortError(Exception):
 535     """Content Too Short exception.
 536
 537     This exception may be raised by FileDownloader objects when a file they
 538     download is too small for what the server announced first, indicating
 539     the connection was probably interrupted.
 540     """
 541     # Both in bytes
 542     downloaded = None
 543     expected = None
 544
 545     def __init__(self, downloaded, expected):
 546         self.downloaded = downloaded
 547         self.expected = expected
 548
 549
 550 def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
 551     hc = http_class(*args, **kwargs)
 552     source_address = ydl_handler._params.get('source_address')
 553     if source_address is not None:
 554         sa = (source_address, 0)
 555         if hasattr(hc, 'source_address'):  # Python 2.7+
 556             hc.source_address = sa
 557         else:  # Python 2.6
 558             def _hc_connect(self, *args, **kwargs):
 559                 sock = compat_socket_create_connection(
 560                     (self.host, self.port), self.timeout, sa)
 561                 if is_https:
 562                     self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file)
 563                 else:
 564                     self.sock = sock
 565             hc.connect = functools.partial(_hc_connect, hc)
 566
 567     return hc
 568
 569
class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-No-Compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    def __init__(self, params, *args, **kwargs):
        """Store params; the remaining arguments are handed to HTTPHandler."""
        compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
        # Read by _create_http_connection (the 'source_address' entry)
        self._params = params

    def http_open(self, req):
        """Open req over a connection built by _create_http_connection."""
        return self.do_open(functools.partial(
            _create_http_connection, self, compat_http_client.HTTPConnection, False),
            req)

    @staticmethod
    def deflate(data):
        """Decompress deflate data, with or without a zlib header."""
        try:
            # Negative window bits: raw deflate stream without zlib header
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            # Otherwise retry as a regular zlib stream
            return zlib.decompress(data)

    @staticmethod
    def addinfourl_wrapper(stream, headers, url, code):
        """Build an addinfourl response that carries the status code."""
        # Implementations that provide getcode() take the code as an argument
        if hasattr(compat_urllib_request.addinfourl, 'getcode'):
            return compat_urllib_request.addinfourl(stream, headers, url, code)
        # Otherwise set it as an attribute afterwards
        ret = compat_urllib_request.addinfourl(stream, headers, url)
        ret.code = code
        return ret

    def http_request(self, req):
        """Add the standard headers to req and apply the Youtubedl-* pseudo headers."""
        for h, v in std_headers.items():
            # Headers already present on the request take precedence
            if h not in req.headers:
                req.add_header(h, v)
        # Pseudo header: do not request a compressed response
        if 'Youtubedl-no-compression' in req.headers:
            if 'Accept-encoding' in req.headers:
                del req.headers['Accept-encoding']
            del req.headers['Youtubedl-no-compression']
        # Pseudo header: replace the User-agent for this request
        if 'Youtubedl-user-agent' in req.headers:
            if 'User-agent' in req.headers:
                del req.headers['User-agent']
            req.headers['User-agent'] = req.headers['Youtubedl-user-agent']
            del req.headers['Youtubedl-user-agent']

        if sys.version_info < (2, 7) and '#' in req.get_full_url():
            # Python 2.6 is brain-dead when it comes to fragments
            # Cut the fragment off the request's private (name-mangled) fields
            req._Request__original = req._Request__original.partition('#')[0]
            req._Request__r_type = req._Request__r_type.partition('#')[0]

        return req

    def http_response(self, req, resp):
        """Return resp with a gzip or deflate Content-encoding decompressed."""
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
            try:
                uncompressed = io.BytesIO(gz.read())
            except IOError as original_ioerror:
                # There may be junk at the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                # Retry with 1..1023 trailing bytes cut off
                for i in range(1, 1024):
                    try:
                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
                        uncompressed = io.BytesIO(gz.read())
                    except IOError:
                        continue
                    break
                else:
                    # No truncation helped: report the first error
                    raise original_ioerror
            # Re-wrap the decompressed data with the original headers, URL and code
            resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = io.BytesIO(self.deflate(resp.read()))
            resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        return resp

    # HTTPS traffic gets the same request/response processing
    https_request = http_request
    https_response = http_response
 664
 665
 666 class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
 667     def __init__(self, params, https_conn_class=None, *args, **kwargs):
 668         compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
 669         self._https_conn_class = https_conn_class or compat_http_client.HTTPSConnection
 670         self._params = params
 671
 672     def https_open(self, req):
 673         return self.do_open(functools.partial(
 674             _create_http_connection, self, self._https_conn_class, True),
 675             req)
 676
 677
 678 def parse_iso8601(date_str, delimiter='T'):
 679     """ Return a UNIX timestamp from the given date """
 680
 681     if date_str is None:
 682         return None
 683
 684     m = re.search(
 685         r'(\.[0-9]+)?(?:Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
 686         date_str)
 687     if not m:
 688         timezone = datetime.timedelta()
 689     else:
 690         date_str = date_str[:-len(m.group(0))]
 691         if not m.group('sign'):
 692             timezone = datetime.timedelta()
 693         else:
 694             sign = 1 if m.group('sign') == '+' else -1
 695             timezone = datetime.timedelta(
 696                 hours=sign * int(m.group('hours')),
 697                 minutes=sign * int(m.group('minutes')))
 698     date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
 699     dt = datetime.datetime.strptime(date_str, date_format) - timezone
 700     return calendar.timegm(dt.timetuple())
 701
 702
 703 def unified_strdate(date_str, day_first=True):
 704     """Return a string with the date in the format YYYYMMDD"""
 705
 706     if date_str is None:
 707         return None
 708     upload_date = None
 709     # Replace commas
 710     date_str = date_str.replace(',', ' ')
 711     # %z (UTC offset) is only supported in python>=3.2
 712     date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str)
 713     # Remove AM/PM + timezone
 714     date_str = re.sub(r'(?i)\s*(?:AM|PM)\s+[A-Z]+', '', date_str)
 715
 716     format_expressions = [
 717         '%d %B %Y',
 718         '%d %b %Y',
 719         '%B %d %Y',
 720         '%b %d %Y',
 721         '%b %dst %Y %I:%M%p',
 722         '%b %dnd %Y %I:%M%p',
 723         '%b %dth %Y %I:%M%p',
 724         '%Y-%m-%d',
 725         '%Y/%m/%d',
 726         '%Y/%m/%d %H:%M:%S',
 727         '%Y-%m-%d %H:%M:%S',
 728         '%Y-%m-%d %H:%M:%S.%f',
 729         '%d.%m.%Y %H:%M',
 730         '%d.%m.%Y %H.%M',
 731         '%Y-%m-%dT%H:%M:%SZ',
 732         '%Y-%m-%dT%H:%M:%S.%fZ',
 733         '%Y-%m-%dT%H:%M:%S.%f0Z',
 734         '%Y-%m-%dT%H:%M:%S',
 735         '%Y-%m-%dT%H:%M:%S.%f',
 736         '%Y-%m-%dT%H:%M',
 737     ]
 738     if day_first:
 739         format_expressions.extend([
 740             '%d.%m.%Y',
 741             '%d/%m/%Y',
 742             '%d/%m/%y',
 743             '%d/%m/%Y %H:%M:%S',
 744         ])
 745     else:
 746         format_expressions.extend([
 747             '%m.%d.%Y',
 748             '%m/%d/%Y',
 749             '%m/%d/%y',
 750             '%m/%d/%Y %H:%M:%S',
 751         ])
 752     for expression in format_expressions:
 753         try:
 754             upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
 755         except ValueError:
 756             pass
 757     if upload_date is None:
 758         timetuple = email.utils.parsedate_tz(date_str)
 759         if timetuple:
 760             upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
 761     return upload_date
 762
 763
 764 def determine_ext(url, default_ext='unknown_video'):
 765     if url is None:
 766         return default_ext
 767     guess = url.partition('?')[0].rpartition('.')[2]
 768     if re.match(r'^[A-Za-z0-9]+$', guess):
 769         return guess
 770     else:
 771         return default_ext
 772
 773
 774 def subtitles_filename(filename, sub_lang, sub_format):
 775     return filename.rsplit('.', 1)[0] + '.' + sub_lang + '.' + sub_format
 776
 777
 778 def date_from_str(date_str):
 779     """
 780     Return a datetime object from a string in the format YYYYMMDD or
 781     (now|today)[+-][0-9](day|week|month|year)(s)?"""
 782     today = datetime.date.today()
 783     if date_str in ('now', 'today'):
 784         return today
 785     if date_str == 'yesterday':
 786         return today - datetime.timedelta(days=1)
 787     match = re.match('(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
 788     if match is not None:
 789         sign = match.group('sign')
 790         time = int(match.group('time'))
 791         if sign == '-':
 792             time = -time
 793         unit = match.group('unit')
 794         # A bad aproximation?
 795         if unit == 'month':
 796             unit = 'day'
 797             time *= 30
 798         elif unit == 'year':
 799             unit = 'day'
 800             time *= 365
 801         unit += 's'
 802         delta = datetime.timedelta(**{unit: time})
 803         return today + delta
 804     return datetime.datetime.strptime(date_str, "%Y%m%d").date()
 805
 806
 807 def hyphenate_date(date_str):
 808     """
 809     Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
 810     match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
 811     if match is not None:
 812         return '-'.join(match.groups())
 813     else:
 814         return date_str
 815
 816
 817 class DateRange(object):
 818     """Represents a time interval between two dates"""
 819
 820     def __init__(self, start=None, end=None):
 821         """start and end must be strings in the format accepted by date"""
 822         if start is not None:
 823             self.start = date_from_str(start)
 824         else:
 825             self.start = datetime.datetime.min.date()
 826         if end is not None:
 827             self.end = date_from_str(end)
 828         else:
 829             self.end = datetime.datetime.max.date()
 830         if self.start > self.end:
 831             raise ValueError('Date range: "%s" , the start date must be before the end date' % self)
 832
 833     @classmethod
 834     def day(cls, day):
 835         """Returns a range that only contains the given day"""
 836         return cls(day, day)
 837
 838     def __contains__(self, date):
 839         """Check if the date is in the range"""
 840         if not isinstance(date, datetime.date):
 841             date = date_from_str(date)
 842         return self.start <= date <= self.end
 843
 844     def __str__(self):
 845         return '%s - %s' % (self.start.isoformat(), self.end.isoformat())
 846
 847
 848 def platform_name():
 849     """ Returns the platform name as a compat_str """
 850     res = platform.platform()
 851     if isinstance(res, bytes):
 852         res = res.decode(preferredencoding())
 853
 854     assert isinstance(res, compat_str)
 855     return res
 856
 857
def _windows_write_string(s, out):
    """ Returns True if the string was written using special methods,
    False if it has yet to be written out.

    The special method is the Win32 WriteConsoleW call, used only when out
    has file descriptor 1 or 2 and the matching standard handle is a real
    console.  Raises OSError when WriteConsoleW reports a failure."""
    # Adapted from http://stackoverflow.com/a/3259271/35070

    import ctypes
    import ctypes.wintypes

    # File descriptor -> argument for GetStdHandle
    # (-11 is STD_OUTPUT_HANDLE, -12 is STD_ERROR_HANDLE in the Win32 API)
    WIN_OUTPUT_IDS = {
        1: -11,
        2: -12,
    }

    try:
        fileno = out.fileno()
    except AttributeError:
        # If the output stream doesn't have a fileno, it's virtual
        return False
    if fileno not in WIN_OUTPUT_IDS:
        return False

    # Build typed foreign-function wrappers for the kernel32 entry points
    GetStdHandle = ctypes.WINFUNCTYPE(
        ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
        (b"GetStdHandle", ctypes.windll.kernel32))
    h = GetStdHandle(WIN_OUTPUT_IDS[fileno])

    WriteConsoleW = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
        ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
        ctypes.wintypes.LPVOID)((b"WriteConsoleW", ctypes.windll.kernel32))
    # Receives the number of characters written by each WriteConsoleW call
    written = ctypes.wintypes.DWORD(0)

    GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)((b"GetFileType", ctypes.windll.kernel32))
    FILE_TYPE_CHAR = 0x0002
    FILE_TYPE_REMOTE = 0x8000
    GetConsoleMode = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
        ctypes.POINTER(ctypes.wintypes.DWORD))(
        (b"GetConsoleMode", ctypes.windll.kernel32))
    INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value

    def not_a_console(handle):
        """ True when handle is invalid, is not a character device,
        or GetConsoleMode fails on it """
        if handle == INVALID_HANDLE_VALUE or handle is None:
            return True
        return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR
                or GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)

    if not_a_console(h):
        return False

    def next_nonbmp_pos(s):
        """ Index of the first character above U+FFFF, or len(s) if none """
        try:
            return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
        except StopIteration:
            return len(s)

    # Write in chunks of at most 1024 characters, each chunk ending before
    # the next non-BMP character; such a character is written on its own.
    while s:
        count = min(next_nonbmp_pos(s), 1024)

        # count == 0 means s starts with a non-BMP character, which is
        # passed to WriteConsoleW with a length of 2
        ret = WriteConsoleW(
            h, s, count if count else 2, ctypes.byref(written), None)
        if ret == 0:
            raise OSError('Failed to write string')
        if not count:  # We just wrote a non-BMP character
            assert written.value == 2
            s = s[1:]
        else:
            assert written.value > 0
            # Continue after whatever part of the chunk was actually written
            s = s[written.value:]
    return True
 928
 929
 930 def write_string(s, out=None, encoding=None):
 931     if out is None:
 932         out = sys.stderr
 933     assert type(s) == compat_str
 934
 935     if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
 936         if _windows_write_string(s, out):
 937             return
 938
 939     if ('b' in getattr(out, 'mode', '') or
 940             sys.version_info[0] < 3):  # Python 2 lies about mode of sys.stderr
 941         byt = s.encode(encoding or preferredencoding(), 'ignore')
 942         out.write(byt)
 943     elif hasattr(out, 'buffer'):
 944         enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
 945         byt = s.encode(enc, 'ignore')
 946         out.buffer.write(byt)
 947     else:
 948         out.write(s)
 949     out.flush()
 950
 951
 952 def bytes_to_intlist(bs):
 953     if not bs:
 954         return []
 955     if isinstance(bs[0], int):  # Python 3
 956         return list(bs)
 957     else:
 958         return [ord(c) for c in bs]
 959
 960
 961 def intlist_to_bytes(xs):
 962     if not xs:
 963         return b''
 964     return struct_pack('%dB' % len(xs), *xs)
 965
 966
# Cross-platform file locking
# Both branches define the same pair of helpers:
#   _lock_file(f, exclusive)  -- lock the open file f (exclusive or shared)
#   _unlock_file(f)           -- release that lock
if sys.platform == 'win32':
    import ctypes.wintypes
    import msvcrt

    # ctypes mirror of the structure passed as the last argument of
    # LockFileEx / UnlockFileEx
    class OVERLAPPED(ctypes.Structure):
        _fields_ = [
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),
        ]

    kernel32 = ctypes.windll.kernel32
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    # Low and high halves of the byte count handed to (Un)LockFileEx
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive):
        """Lock f through LockFileEx, starting at offset 0; raises OSError on failure."""
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        # Kept on the file object so that _unlock_file can pass the same structure
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)
        handle = msvcrt.get_osfhandle(f.fileno())
        # Flag 0x2 is LOCKFILE_EXCLUSIVE_LOCK in the Win32 API; 0x0 asks for a shared lock
        if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
                          whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Locking file failed: %r' % ctypes.FormatError())

    def _unlock_file(f):
        """Release the lock taken by _lock_file on f; raises OSError on failure."""
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        if not UnlockFileEx(handle, 0,
                            whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())

else:
    import fcntl

    def _lock_file(f, exclusive):
        """Take an exclusive (LOCK_EX) or shared (LOCK_SH) flock on f."""
        fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)

    def _unlock_file(f):
        """Drop the flock held on f."""
        fcntl.flock(f, fcntl.LOCK_UN)
1030
1031
class locked_file(object):
    """File wrapper that holds a lock on the file for the duration of a with-block.

    The file is opened in the constructor; entering the context takes the
    lock (shared for mode 'r', exclusive for 'a' and 'w'), leaving it
    releases the lock and closes the file.
    """

    def __init__(self, filename, mode, encoding=None):
        assert mode in ['r', 'a', 'w']
        self.f = io.open(filename, mode, encoding=encoding)
        self.mode = mode

    def __enter__(self):
        exclusive = self.mode != 'r'
        try:
            _lock_file(self.f, exclusive)
        except (IOError, OSError):
            # The Windows implementation of _lock_file raises OSError, which
            # is a different class from IOError on Python 2; catch both so
            # the freshly opened file is never leaked when locking fails.
            self.f.close()
            raise
        return self

    def __exit__(self, etype, value, traceback):
        try:
            _unlock_file(self.f)
        finally:
            # Close the file even if unlocking raised
            self.f.close()

    def __iter__(self):
        return iter(self.f)

    def write(self, *args):
        """Forward to the underlying file's write()."""
        return self.f.write(*args)

    def read(self, *args):
        """Forward to the underlying file's read()."""
        return self.f.read(*args)
1061
1062
def get_filesystem_encoding():
    """ Return sys.getfilesystemencoding(), falling back to 'utf-8' when it is None """
    fs_encoding = sys.getfilesystemencoding()
    if fs_encoding is None:
        return 'utf-8'
    return fs_encoding
1066
1067
def shell_quote(args):
    """ Return args as a single string in which every argument is shell-quoted.

    Byte-string arguments (e.g. file names produced by 'encodeFilename')
    are decoded with the file system encoding first.
    """
    encoding = get_filesystem_encoding()
    quoted_args = []
    for a in args:
        if isinstance(a, bytes):
            # We may get a filename encoded with 'encodeFilename'
            a = a.decode(encoding)
        # shlex_quote (already imported from .compat and used by
        # args_to_str) replaces the undocumented pipes.quote; the pipes
        # module is deprecated and no longer exists in Python 3.13.
        quoted_args.append(shlex_quote(a))
    return ' '.join(quoted_args)
1077
1078
def takewhile_inclusive(pred, seq):
    """ Yield the elements of seq up to and including the first one for
        which pred is false (itertools.takewhile would drop that one) """
    for item in seq:
        yield item
        if pred(item):
            continue
        break
1086
1087
def smuggle_url(url, data):
    """ Attach data (JSON-encoded) to url as a fragment, for internal use. """
    payload = {'__youtubedl_smuggle': json.dumps(data)}
    fragment = compat_urllib_parse.urlencode(payload)
    return url + '#' + fragment
1094
1095
def unsmuggle_url(smug_url, default=None):
    """ Split a URL built by smuggle_url into (url, data).

    URLs without smuggled data are returned unchanged, paired with default.
    """
    if '#__youtubedl_smuggle' in smug_url:
        url, _, sdata = smug_url.rpartition('#')
        payload = compat_parse_qs(sdata)['__youtubedl_smuggle'][0]
        return url, json.loads(payload)
    return smug_url, default
1103
1104
def format_bytes(bytes):
    """ Format a byte count as a string with a binary unit suffix, e.g. '1.50KiB'.

    bytes may be a number, a numeric string or None; None gives 'N/A'.
    """
    if bytes is None:
        return 'N/A'
    if isinstance(bytes, str):
        bytes = float(bytes)
    suffixes = ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB']
    if bytes == 0.0:
        exponent = 0
    else:
        # Keep the exponent inside the suffix table: values below 1 would
        # otherwise index it from the end, and values of 1024 ** 9 or more
        # would run past it.
        exponent = int(math.log(bytes, 1024.0))
        exponent = max(0, min(exponent, len(suffixes) - 1))
    suffix = suffixes[exponent]
    converted = float(bytes) / float(1024 ** exponent)
    return '%.2f%s' % (converted, suffix)
1117
1118
def parse_filesize(s):
    """ Parse a size such as '1.5 MiB' or '5 GB' into a number of bytes.

    Returns None when s is None or does not start with a number followed
    by a known unit.  A comma is accepted as the decimal separator.
    """
    if s is None:
        return None

    # For every prefix the table holds four spellings: 'XiB' and 'xB' are
    # binary (powers of 1024), 'XB' and 'Xb' are decimal (powers of 1000).
    # The lower-case forms are of course incorrect and unofficial,
    # but we support those too
    _UNIT_TABLE = {'B': 1, 'b': 1}
    for power, prefix in enumerate('KMGTPEZY', 1):
        binary = 1024 ** power
        decimal = 1000 ** power
        _UNIT_TABLE[prefix + 'iB'] = binary
        _UNIT_TABLE[prefix + 'B'] = decimal
        _UNIT_TABLE[prefix.lower() + 'B'] = binary
        _UNIT_TABLE[prefix + 'b'] = decimal

    units_re = '|'.join(re.escape(u) for u in _UNIT_TABLE)
    m = re.match(
        r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)' % units_re, s)
    if not m:
        return None

    amount = float(m.group('num').replace(',', '.'))
    return int(amount * _UNIT_TABLE[m.group('unit')])
1171
1172
def get_term_width():
    """ Return the terminal width in columns, or None if it cannot be determined.

    The COLUMNS environment variable takes precedence; otherwise the
    output of `stty size` is consulted.
    """
    columns = compat_getenv('COLUMNS', None)
    if columns:
        return int(columns)

    try:
        sp = subprocess.Popen(
            ['stty', 'size'],
            stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        out, _ = sp.communicate()
        # `stty size` prints "<rows> <columns>"
        return int(out.split()[1])
    except Exception:
        # Best effort only (stty missing, no tty, unexpected output, ...),
        # but do not swallow KeyboardInterrupt / SystemExit like a bare
        # except would.
        pass
    return None
1187
1188
def month_by_name(name):
    """ Map a full English month name to its number (1-12), independently of
    the locale; unknown names give None """

    ENGLISH_NAMES = [
        'January', 'February', 'March', 'April', 'May', 'June',
        'July', 'August', 'September', 'October', 'November', 'December']
    for number, month in enumerate(ENGLISH_NAMES, 1):
        if month == name:
            return number
    return None
1199
1200
def fix_xml_ampersands(xml_str):
    """Escape as '&amp;' every '&' that does not already start a predefined
    or numeric entity"""
    bare_ampersand = r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)'
    return re.sub(bare_ampersand, '&amp;', xml_str)
1207
1208
def setproctitle(title):
    """ Set the process name to title through libc's prctl, where available.

    Does nothing when libc.so.6 cannot be loaded or offers no prctl.
    """
    assert isinstance(title, compat_str)
    try:
        libc = ctypes.cdll.LoadLibrary("libc.so.6")
    except OSError:
        return
    # Initialising the buffer from the bytes object makes it one byte
    # larger than the title, so it is always NUL-terminated.  A buffer of
    # exactly len(title) bytes has no room for the terminator, and prctl
    # could read past its end.
    buf = ctypes.create_string_buffer(title.encode('utf-8'))
    try:
        libc.prctl(15, buf, 0, 0, 0)  # 15 is PR_SET_NAME
    except AttributeError:
        return  # Strange libc, just skip this
1222
1223
def remove_start(s, start):
    """ Return s without the prefix start; s is returned as is when it does not begin with it """
    return s[len(start):] if s.startswith(start) else s
1228
1229
def remove_end(s, end):
    """ Return s without the suffix end; s is returned as is when it does not finish with it """
    if s.endswith(end):
        # Slice by absolute position: s[:-len(end)] would be s[:0], i.e.
        # an empty string, for an empty suffix.
        return s[:len(s) - len(end)]
    return s
1234
1235
def url_basename(url):
    """ Return the last segment of the path of url, ignoring surrounding slashes """
    parsed = compat_urlparse.urlparse(url)
    segments = parsed.path.strip('/').split('/')
    return segments[-1]
1239
1240
class HEADRequest(compat_urllib_request.Request):
    """ Request whose HTTP method is reported as HEAD """

    def get_method(self):
        """ Return the HTTP verb to use for this request """
        return "HEAD"
1244
1245
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    """ Convert v to an int, multiplied by invscale and floor-divided by scale.

    None and the empty string give default.  When get_attr is set, the
    attribute of that name is read from v first (None if it is missing).
    """
    if get_attr and v is not None:
        v = getattr(v, get_attr, None)
    if v is None or v == '':
        return default
    return int(v) * invscale // scale
1253
1254
def str_or_none(v, default=None):
    """ Convert v to a compat_str; None gives default """
    if v is None:
        return default
    return compat_str(v)
1257
1258
def str_to_int(int_str):
    """ Like int_or_none, but tolerant of ',', '.' and '+' characters,
    which are removed before the conversion """
    if int_str is None:
        return None
    digits = re.sub(r'[,\.\+]', '', int_str)
    return int(digits)
1265
1266
def float_or_none(v, scale=1, invscale=1, default=None):
    """ Convert v to a float, multiplied by invscale and divided by scale; None gives default """
    if v is None:
        return default
    return float(v) * invscale / scale
1269
1270
def parse_duration(s):
    """ Parse a textual duration and return it as a number of seconds.

    Understood forms (case-insensitive, optional leading 'T'): a number of
    minutes ('3 min'), a number of hours ('1.5 hours'), or seconds
    optionally preceded by minutes and hours, separated by ':' or unit
    names ('1:02:03', '2h 5m 10.5s').
    Returns None when s is not a string or does not match.
    """
    string_type = basestring if sys.version_info < (3, 0) else compat_str
    if not isinstance(s, string_type):
        return None

    m = re.match(
        r'''(?ix)T?
        (?:
            (?P<only_mins>[0-9.]+)\s*(?:mins?|minutes?)\s*|
            (?P<only_hours>[0-9.]+)\s*(?:hours?)|

            (?:
                (?:(?P<hours>[0-9]+)\s*(?:[:h]|hours?)\s*)?
                (?P<mins>[0-9]+)\s*(?:[:m]|mins?|minutes?)\s*
            )?
            (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*(?:s|secs?|seconds?)?
        )$''', s.strip())
    if not m:
        return None

    # The stand-alone "N minutes" / "N hours" forms may be fractional
    if m.group('only_mins'):
        return float_or_none(m.group('only_mins'), invscale=60)
    if m.group('only_hours'):
        return float_or_none(m.group('only_hours'), invscale=60 * 60)

    total = 0
    for group, unit_seconds in (('secs', 1), ('mins', 60), ('hours', 60 * 60)):
        digits = m.group(group)
        if digits:
            total += int(digits) * unit_seconds
    # The 'ms' group is the fractional part of the seconds, dot included
    if m.group('ms'):
        total += float(m.group('ms'))
    return total
1305
1306
def prepend_extension(filename, ext):
    """ Insert ext in front of the existing extension: ('a.mp4', 'temp') -> 'a.temp.mp4' """
    parts = os.path.splitext(filename)
    return '{0}.{1}{2}'.format(parts[0], ext, parts[1])
1310
1311
def check_executable(exe, args=[]):
    """ Try to run exe (with args, e.g. ['-version'], to keep its output
    short) and return exe if it could be started, False otherwise. """
    try:
        proc = subprocess.Popen(
            [exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        proc.communicate()
    except OSError:
        return False
    else:
        return exe
1320
1321
def get_exe_version(exe, args=['--version'],
                    version_re=None, unrecognized='present'):
    """ Run exe with args and extract its version from the combined
    stdout/stderr output via detect_exe_version.
    Returns False if the executable could not be started. """
    try:
        proc = subprocess.Popen(
            [exe] + args,
            stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
        out = proc.communicate()[0]
    except OSError:
        return False
    if isinstance(out, bytes):
        # Non-ASCII bytes are irrelevant for version detection; drop them
        out = out.decode('ascii', 'ignore')
    return detect_exe_version(out, version_re, unrecognized)
1335
1336
def detect_exe_version(output, version_re=None, unrecognized='present'):
    """ Return the first group of version_re found in output, or
    unrecognized when the pattern does not occur.
    Without version_re, the word following 'version' is looked for. """
    assert isinstance(output, compat_str)
    pattern = r'version\s+([-0-9._a-zA-Z]+)' if version_re is None else version_re
    found = re.search(pattern, output)
    if not found:
        return unrecognized
    return found.group(1)
1346
1347
class PagedList(object):
    """Base class of lists whose items are fetched page by page.

    Subclasses implement getslice(start=0, end=None), which returns the
    items with indices in [start, end) as a list.
    """

    def __len__(self):
        # This is only useful for tests
        return len(self.getslice())


class OnDemandPagedList(PagedList):
    """Paged list of unknown length: pages are requested until one comes back short."""

    def __init__(self, pagefunc, pagesize):
        """pagefunc(pagenum) returns an iterable with the items of the
        (0-based) page; every page but the last holds pagesize items."""
        self._pagefunc = pagefunc
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        res = []
        # An empty or inverted slice selects nothing.  Without this guard
        # the page arithmetic below returns items for such slices.
        if end is not None and end <= start:
            return res
        for pagenum in itertools.count(start // self._pagesize):
            firstid = pagenum * self._pagesize
            nextfirstid = pagenum * self._pagesize + self._pagesize
            if start >= nextfirstid:
                continue

            page_results = list(self._pagefunc(pagenum))

            # Offset of the first wanted item inside this page
            startv = (
                start % self._pagesize
                if firstid <= start < nextfirstid
                else 0)

            # Offset just past the last wanted item, if the slice ends here
            endv = (
                ((end - 1) % self._pagesize) + 1
                if (end is not None and firstid <= end <= nextfirstid)
                else None)

            if startv != 0 or endv is not None:
                page_results = page_results[startv:endv]
            res.extend(page_results)

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            if len(page_results) + startv < self._pagesize:
                break

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == nextfirstid:
                break
        return res


class InAdvancePagedList(PagedList):
    """Paged list whose number of pages is known beforehand."""

    def __init__(self, pagefunc, pagecount, pagesize):
        """pagefunc(pagenum) returns an iterable with the items of the
        (0-based) page; there are pagecount pages of pagesize items."""
        self._pagefunc = pagefunc
        self._pagecount = pagecount
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        res = []
        # An empty or inverted slice selects nothing (a negative item
        # budget below would otherwise slice pages from their end).
        if end is not None and end <= start:
            return res
        start_page = start // self._pagesize
        # Never ask for pages past the known page count, however large
        # the requested end is
        end_page = (
            self._pagecount if end is None
            else min(self._pagecount, end // self._pagesize + 1))
        skip_elems = start - start_page * self._pagesize
        # Number of items still to collect (None: no limit)
        only_more = None if end is None else end - start
        for pagenum in range(start_page, end_page):
            page = list(self._pagefunc(pagenum))
            if skip_elems:
                # Only the first page starts in the middle
                page = page[skip_elems:]
                skip_elems = None
            if only_more is not None:
                if len(page) < only_more:
                    only_more -= len(page)
                else:
                    page = page[:only_more]
                    res.extend(page)
                    break
            res.extend(page)
        return res
1424
1425
def uppercase_escape(s):
    """ Replace every literal \\UXXXXXXXX escape sequence in s by the character it denotes """
    decode = codecs.getdecoder('unicode_escape')

    def expand(m):
        # The decoder returns a (text, length consumed) pair
        return decode(m.group(0))[0]

    return re.sub(r'\\U[0-9a-fA-F]{8}', expand, s)
1432
1433
def escape_rfc3986(s):
    """Percent-encode the characters of s that are not ASCII, as suggested by RFC 3986"""
    # On Python 2 the text is encoded to UTF-8 before being quoted
    py2_text = sys.version_info < (3, 0) and isinstance(s, unicode)
    if py2_text:
        s = s.encode('utf-8')
    return compat_urllib_parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")
1439
1440
def escape_url(url):
    """Escape the path, params, query and fragment of url as suggested by RFC 3986"""
    parts = compat_urllib_parse_urlparse(url)
    path = escape_rfc3986(parts.path)
    params = escape_rfc3986(parts.params)
    query = escape_rfc3986(parts.query)
    fragment = escape_rfc3986(parts.fragment)
    escaped = parts._replace(
        path=path, params=params, query=query, fragment=fragment)
    return escaped.geturl()
1450
# struct_pack / struct_unpack: struct.pack / struct.unpack that accept a
# text format specification on every supported Python version
try:
    struct.pack('!I', 0)
except TypeError:
    # In Python 2.6 (and some 2.7 versions), struct requires a bytes argument
    def _ascii_spec(spec):
        return spec.encode('ascii') if isinstance(spec, compat_str) else spec

    def struct_pack(spec, *args):
        return struct.pack(_ascii_spec(spec), *args)

    def struct_unpack(spec, *args):
        return struct.unpack(_ascii_spec(spec), *args)
else:
    struct_pack, struct_unpack = struct.pack, struct.unpack
1467
1468
def read_batch_urls(batch_fd):
    """ Read the URLs of a batch file, one per line, and close batch_fd.

    Lines may be text or UTF-8 bytes.  Blank lines and lines starting
    with '#', ';' or ']' are skipped; a leading byte order mark is removed.
    """
    def fixup(url):
        if not isinstance(url, compat_str):
            url = url.decode('utf-8', 'replace')
        # A UTF-8 byte order mark shows up as '\ufeff' when the line was
        # decoded as UTF-8 (including by the decode() just above) and as
        # '\xef\xbb\xbf' when it was read through a single-byte codec.
        # Neither is whitespace, so strip() below would keep them.
        for bom in ('\xef\xbb\xbf', '\ufeff'):
            if url.startswith(bom):
                url = url[len(bom):]
        url = url.strip()
        if url.startswith(('#', ';', ']')):
            return False
        return url

    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]
1483
1484
def urlencode_postdata(*args, **kargs):
    """ URL-encode the arguments like urlencode and return the result as ASCII bytes """
    encoded = compat_urllib_parse.urlencode(*args, **kargs)
    return encoded.encode('ascii')
1487
1488
# etree_iter(node): iterate over an element tree
if hasattr(xml.etree.ElementTree.Element, 'iter'):
    etree_iter = xml.etree.ElementTree.Element.iter
else:  # Python <=2.6
    def etree_iter(n):
        return n.findall('.//*')
1493
1494
def parse_xml(s):
    """ Parse the XML document in the text s and return its root element.

    Doctype declarations are ignored.  On Python 2, byte-string element
    texts are decoded from UTF-8.
    """
    class TreeBuilder(xml.etree.ElementTree.TreeBuilder):
        def doctype(self, name, pubid, system):
            pass  # Ignore doctypes

    parser = xml.etree.ElementTree.XMLParser(target=TreeBuilder())
    data = s.encode('utf-8')
    if sys.version_info >= (2, 7):
        tree = xml.etree.ElementTree.XML(data, parser=parser)
    else:
        tree = xml.etree.ElementTree.XML(data)

    if sys.version_info >= (3, 0):
        return tree
    # Fix up XML parser in Python 2.x
    for n in etree_iter(tree):
        if n.text is not None and not isinstance(n.text, compat_str):
            n.text = n.text.decode('utf-8')
    return tree
1510
1511
# Age limit associated with each US rating label
US_RATINGS = {
    'G': 0, 'PG': 10, 'PG-13': 13,
    'R': 16, 'NC': 18,
}


def parse_age_limit(s):
    """ Turn an age specification into an int: either a one- or two-digit
    age with an optional trailing '+', or a key of US_RATINGS.
    Returns None for None and for anything unrecognized. """
    if s is None:
        return None
    numeric = re.match(r'^(?P<age>\d{1,2})\+?$', s)
    if numeric:
        return int(numeric.group('age'))
    return US_RATINGS.get(s, None)
1526
1527
def strip_jsonp(code):
    """ Remove a JSONP wrapper 'callback(...);' (and trailing // comments)
    from code, leaving the argument; other input is returned unchanged """
    jsonp_call = r'(?s)^[a-zA-Z0-9_]+\s*\(\s*(.*)\);?\s*?(?://[^\n]*)*$'
    return re.sub(jsonp_call, r'\1', code)
1531
1532
def js_to_json(code):
    """ Convert a JavaScript object literal to JSON text: single-quoted
    strings and bare identifiers become double-quoted strings, and a
    comma right before a closing ']' is dropped """
    def fix_kv(m):
        token = m.group(0)
        # JSON keywords and double-quoted strings are already valid
        if token in ('true', 'false', 'null') or token.startswith('"'):
            return token
        if token.startswith("'"):
            # Re-escape the content of a single-quoted string for use
            # between double quotes
            replacements = {
                '\\\\': '\\\\',
                "\\'": "'",
                '"': '\\"',
            }
            token = re.sub(
                r"\\\\|\\'|\"",
                lambda esc: replacements[esc.group(0)],
                token[1:-1])
        return '"%s"' % token

    quoted = re.sub(r'''(?x)
        "(?:[^"\\]*(?:\\\\|\\")?)*"|
        '(?:[^'\\]*(?:\\\\|\\')?)*'|
        [a-zA-Z_][a-zA-Z_0-9]*
        ''', fix_kv, code)
    return re.sub(r',(\s*\])', lambda m: m.group(1), quoted)
1556
1557
def qualities(quality_ids):
    """ Build a function ranking a quality id by its position in
    quality_ids; ids that are not listed rank -1 """
    def q(qid):
        if qid in quality_ids:
            return quality_ids.index(qid)
        return -1
    return q
1566
1567
# Default output template: a %-style format string with the named fields
# 'title', 'id' and 'ext'
DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
1569
1570
def limit_length(s, length):
    """ Shorten s to at most length characters, marking the cut with a
    trailing '...'; None is passed through """
    if s is None:
        return None
    ELLIPSES = '...'
    if len(s) <= length:
        return s
    kept = length - len(ELLIPSES)
    return s[:kept] + ELLIPSES
1579
1580
def version_tuple(v):
    """ Split a version string on '.' and '-' into a tuple of ints """
    components = re.split(r'[-.]', v)
    return tuple(int(component) for component in components)
1583
1584
def is_outdated_version(version, limit, assume_new=True):
    """ Tell whether version is older than limit.

    When version is empty or one of the two cannot be parsed, the answer
    is decided by assume_new: unknown versions count as new by default.
    """
    unknown = not assume_new
    if not version:
        return unknown
    try:
        outdated = version_tuple(version) < version_tuple(limit)
    except ValueError:
        return unknown
    return outdated
1592
1593
def ytdl_is_updateable():
    """ Returns if youtube-dl can be updated with -U, i.e. whether this
    module was loaded by a zipimporter or sys has a 'frozen' attribute """
    from zipimport import zipimporter

    loader = globals().get('__loader__')
    return isinstance(loader, zipimporter) or hasattr(sys, 'frozen')
1599
1600
def args_to_str(args):
    """ Return a short, shell-quoted string representation of a subprocess command """
    quoted = [shlex_quote(arg) for arg in args]
    return ' '.join(quoted)
1604
1605
def urlhandle_detect_ext(url_handle):
    """ Guess a file extension from the Content-Type header of an open URL
    handle: the subtype of the media type, e.g. 'mp4' for 'video/mp4' """
    try:
        url_handle.headers
        getheader = lambda h: url_handle.headers[h]
    except AttributeError:  # Python < 3
        getheader = url_handle.info().getheader

    # Drop media type parameters such as '; charset=UTF-8', which would
    # otherwise end up in the extension
    mime_type = getheader('Content-Type').split(';')[0].strip()
    return mime_type.split("/")[1]
1614
1615
def age_restricted(content_limit, age_limit):
    """ Returns True iff the content should be blocked, i.e. both limits
    are set and age_limit is below the content's limit """

    # No limit set by the user, or content available for everyone
    if age_limit is None or content_limit is None:
        return False
    return age_limit < content_limit