youtube_dl/utils.py

   1 #!/usr/bin/env python
   2 # -*- coding: utf-8 -*-
   3
   4 from __future__ import unicode_literals
   5
   6 import calendar
   7 import codecs
   8 import contextlib
   9 import ctypes
  10 import datetime
  11 import email.utils
  12 import errno
  13 import functools
  14 import gzip
  15 import itertools
  16 import io
  17 import json
  18 import locale
  19 import math
  20 import operator
  21 import os
  22 import pipes
  23 import platform
  24 import re
  25 import ssl
  26 import socket
  27 import struct
  28 import subprocess
  29 import sys
  30 import tempfile
  31 import traceback
  32 import xml.etree.ElementTree
  33 import zlib
  34
  35 from .compat import (
  36     compat_basestring,
  37     compat_chr,
  38     compat_html_entities,
  39     compat_http_client,
  40     compat_parse_qs,
  41     compat_socket_create_connection,
  42     compat_str,
  43     compat_urllib_error,
  44     compat_urllib_parse,
  45     compat_urllib_parse_urlparse,
  46     compat_urllib_request,
  47     compat_urlparse,
  48     shlex_quote,
  49 )
  50
  51
  52 # This is not clearly defined otherwise
  53 compiled_regex_type = type(re.compile(''))
  54
  55 std_headers = {
  56     'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20150101 Firefox/20.0 (Chrome)',
  57     'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
  58     'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
  59     'Accept-Encoding': 'gzip, deflate',
  60     'Accept-Language': 'en-us,en;q=0.5',
  61 }
  62
  63
  64 ENGLISH_MONTH_NAMES = [
  65     'January', 'February', 'March', 'April', 'May', 'June',
  66     'July', 'August', 'September', 'October', 'November', 'December']
  67
  68
  69 def preferredencoding():
  70     """Get preferred encoding.
  71
  72     Returns the best encoding scheme for the system, based on
  73     locale.getpreferredencoding() and some further tweaks.
  74     """
  75     try:
  76         pref = locale.getpreferredencoding()
  77         'TEST'.encode(pref)
  78     except:
  79         pref = 'UTF-8'
  80
  81     return pref
  82
  83
  84 def write_json_file(obj, fn):
  85     """ Encode obj as JSON and write it to fn, atomically if possible """
  86
  87     fn = encodeFilename(fn)
  88     if sys.version_info < (3, 0) and sys.platform != 'win32':
  89         encoding = get_filesystem_encoding()
  90         # os.path.basename returns a bytes object, but NamedTemporaryFile
  91         # will fail if the filename contains non ascii characters unless we
  92         # use a unicode object
  93         path_basename = lambda f: os.path.basename(fn).decode(encoding)
  94         # the same for os.path.dirname
  95         path_dirname = lambda f: os.path.dirname(fn).decode(encoding)
  96     else:
  97         path_basename = os.path.basename
  98         path_dirname = os.path.dirname
  99
 100     args = {
 101         'suffix': '.tmp',
 102         'prefix': path_basename(fn) + '.',
 103         'dir': path_dirname(fn),
 104         'delete': False,
 105     }
 106
 107     # In Python 2.x, json.dump expects a bytestream.
 108     # In Python 3.x, it writes to a character stream
 109     if sys.version_info < (3, 0):
 110         args['mode'] = 'wb'
 111     else:
 112         args.update({
 113             'mode': 'w',
 114             'encoding': 'utf-8',
 115         })
 116
 117     tf = tempfile.NamedTemporaryFile(**args)
 118
 119     try:
 120         with tf:
 121             json.dump(obj, tf)
 122         if sys.platform == 'win32':
 123             # Need to remove existing file on Windows, else os.rename raises
 124             # WindowsError or FileExistsError.
 125             try:
 126                 os.unlink(fn)
 127             except OSError:
 128                 pass
 129         os.rename(tf.name, fn)
 130     except:
 131         try:
 132             os.remove(tf.name)
 133         except OSError:
 134             pass
 135         raise
 136
 137
 138 if sys.version_info >= (2, 7):
 139     def find_xpath_attr(node, xpath, key, val):
 140         """ Find the xpath xpath[@key=val] """
 141         assert re.match(r'^[a-zA-Z-]+$', key)
 142         assert re.match(r'^[a-zA-Z0-9@\s:._-]*$', val)
 143         expr = xpath + "[@%s='%s']" % (key, val)
 144         return node.find(expr)
 145 else:
 146     def find_xpath_attr(node, xpath, key, val):
 147         # Here comes the crazy part: In 2.6, if the xpath is a unicode,
 148         # .//node does not match if a node is a direct child of . !
 149         if isinstance(xpath, compat_str):
 150             xpath = xpath.encode('ascii')
 151
 152         for f in node.findall(xpath):
 153             if f.attrib.get(key) == val:
 154                 return f
 155         return None
 156
 157 # On python2.6 the xml.etree.ElementTree.Element methods don't support
 158 # the namespace parameter
 159
 160
 161 def xpath_with_ns(path, ns_map):
 162     components = [c.split(':') for c in path.split('/')]
 163     replaced = []
 164     for c in components:
 165         if len(c) == 1:
 166             replaced.append(c[0])
 167         else:
 168             ns, tag = c
 169             replaced.append('{%s}%s' % (ns_map[ns], tag))
 170     return '/'.join(replaced)
 171
 172
 173 def xpath_text(node, xpath, name=None, fatal=False):
 174     if sys.version_info < (2, 7):  # Crazy 2.6
 175         xpath = xpath.encode('ascii')
 176
 177     n = node.find(xpath)
 178     if n is None or n.text is None:
 179         if fatal:
 180             name = xpath if name is None else name
 181             raise ExtractorError('Could not find XML element %s' % name)
 182         else:
 183             return None
 184     return n.text
 185
 186
 187 def get_element_by_id(id, html):
 188     """Return the content of the tag with the specified ID in the passed HTML document"""
 189     return get_element_by_attribute("id", id, html)
 190
 191
 192 def get_element_by_attribute(attribute, value, html):
 193     """Return the content of the tag with the specified attribute in the passed HTML document"""
 194
 195     m = re.search(r'''(?xs)
 196         <([a-zA-Z0-9:._-]+)
 197          (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
 198          \s+%s=['"]?%s['"]?
 199          (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
 200         \s*>
 201         (?P<content>.*?)
 202         </\1>
 203     ''' % (re.escape(attribute), re.escape(value)), html)
 204
 205     if not m:
 206         return None
 207     res = m.group('content')
 208
 209     if res.startswith('"') or res.startswith("'"):
 210         res = res[1:-1]
 211
 212     return unescapeHTML(res)
 213
 214
 215 def clean_html(html):
 216     """Clean an HTML snippet into a readable string"""
 217
 218     if html is None:  # Convenience for sanitizing descriptions etc.
 219         return html
 220
 221     # Newline vs <br />
 222     html = html.replace('\n', ' ')
 223     html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
 224     html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
 225     # Strip html tags
 226     html = re.sub('<.*?>', '', html)
 227     # Replace html entities
 228     html = unescapeHTML(html)
 229     return html.strip()
 230
 231
 232 def sanitize_open(filename, open_mode):
 233     """Try to open the given filename, and slightly tweak it if this fails.
 234
 235     Attempts to open the given filename. If this fails, it tries to change
 236     the filename slightly, step by step, until it's either able to open it
 237     or it fails and raises a final exception, like the standard open()
 238     function.
 239
 240     It returns the tuple (stream, definitive_file_name).
 241     """
 242     try:
 243         if filename == '-':
 244             if sys.platform == 'win32':
 245                 import msvcrt
 246                 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
 247             return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
 248         stream = open(encodeFilename(filename), open_mode)
 249         return (stream, filename)
 250     except (IOError, OSError) as err:
 251         if err.errno in (errno.EACCES,):
 252             raise
 253
 254         # In case of error, try to remove win32 forbidden chars
 255         alt_filename = os.path.join(
 256             re.sub('[/<>:"\\|\\\\?\\*]', '#', path_part)
 257             for path_part in os.path.split(filename)
 258         )
 259         if alt_filename == filename:
 260             raise
 261         else:
 262             # An exception here should be caught in the caller
 263             stream = open(encodeFilename(filename), open_mode)
 264             return (stream, alt_filename)
 265
 266
 267 def timeconvert(timestr):
 268     """Convert RFC 2822 defined time string into system timestamp"""
 269     timestamp = None
 270     timetuple = email.utils.parsedate_tz(timestr)
 271     if timetuple is not None:
 272         timestamp = email.utils.mktime_tz(timetuple)
 273     return timestamp
 274
 275
 276 def sanitize_filename(s, restricted=False, is_id=False):
 277     """Sanitizes a string so it could be used as part of a filename.
 278     If restricted is set, use a stricter subset of allowed characters.
 279     Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
 280     """
 281     def replace_insane(char):
 282         if char == '?' or ord(char) < 32 or ord(char) == 127:
 283             return ''
 284         elif char == '"':
 285             return '' if restricted else '\''
 286         elif char == ':':
 287             return '_-' if restricted else ' -'
 288         elif char in '\\/|*<>':
 289             return '_'
 290         if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
 291             return '_'
 292         if restricted and ord(char) > 127:
 293             return '_'
 294         return char
 295
 296     # Handle timestamps
 297     s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)
 298     result = ''.join(map(replace_insane, s))
 299     if not is_id:
 300         while '__' in result:
 301             result = result.replace('__', '_')
 302         result = result.strip('_')
 303         # Common case of "Foreign band name - English song title"
 304         if restricted and result.startswith('-_'):
 305             result = result[2:]
 306         if result.startswith('-'):
 307             result = '_' + result[len('-'):]
 308         result = result.lstrip('.')
 309         if not result:
 310             result = '_'
 311     return result
 312
 313
 314 def orderedSet(iterable):
 315     """ Remove all duplicates from the input iterable """
 316     res = []
 317     for el in iterable:
 318         if el not in res:
 319             res.append(el)
 320     return res
 321
 322
 323 def _htmlentity_transform(entity):
 324     """Transforms an HTML entity to a character."""
 325     # Known non-numeric HTML entity
 326     if entity in compat_html_entities.name2codepoint:
 327         return compat_chr(compat_html_entities.name2codepoint[entity])
 328
 329     mobj = re.match(r'#(x?[0-9]+)', entity)
 330     if mobj is not None:
 331         numstr = mobj.group(1)
 332         if numstr.startswith('x'):
 333             base = 16
 334             numstr = '0%s' % numstr
 335         else:
 336             base = 10
 337         return compat_chr(int(numstr, base))
 338
 339     # Unknown entity in name, return its literal representation
 340     return ('&%s;' % entity)
 341
 342
 343 def unescapeHTML(s):
 344     if s is None:
 345         return None
 346     assert type(s) == compat_str
 347
 348     return re.sub(
 349         r'&([^;]+);', lambda m: _htmlentity_transform(m.group(1)), s)
 350
 351
 352 def encodeFilename(s, for_subprocess=False):
 353     """
 354     @param s The name of the file
 355     """
 356
 357     assert type(s) == compat_str
 358
 359     # Python 3 has a Unicode API
 360     if sys.version_info >= (3, 0):
 361         return s
 362
 363     if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
 364         # Pass '' directly to use Unicode APIs on Windows 2000 and up
 365         # (Detecting Windows NT 4 is tricky because 'major >= 4' would
 366         # match Windows 9x series as well. Besides, NT 4 is obsolete.)
 367         if not for_subprocess:
 368             return s
 369         else:
 370             # For subprocess calls, encode with locale encoding
 371             # Refer to http://stackoverflow.com/a/9951851/35070
 372             encoding = preferredencoding()
 373     else:
 374         encoding = sys.getfilesystemencoding()
 375     if encoding is None:
 376         encoding = 'utf-8'
 377     return s.encode(encoding, 'ignore')
 378
 379
 380 def encodeArgument(s):
 381     if not isinstance(s, compat_str):
 382         # Legacy code that uses byte strings
 383         # Uncomment the following line after fixing all post processors
 384         # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
 385         s = s.decode('ascii')
 386     return encodeFilename(s, True)
 387
 388
 389 def decodeOption(optval):
 390     if optval is None:
 391         return optval
 392     if isinstance(optval, bytes):
 393         optval = optval.decode(preferredencoding())
 394
 395     assert isinstance(optval, compat_str)
 396     return optval
 397
 398
 399 def formatSeconds(secs):
 400     if secs > 3600:
 401         return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
 402     elif secs > 60:
 403         return '%d:%02d' % (secs // 60, secs % 60)
 404     else:
 405         return '%d' % secs
 406
 407
 408 def make_HTTPS_handler(params, **kwargs):
 409     opts_no_check_certificate = params.get('nocheckcertificate', False)
 410     if hasattr(ssl, 'create_default_context'):  # Python >= 3.4 or 2.7.9
 411         context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH)
 412         if opts_no_check_certificate:
 413             context.check_hostname = False
 414             context.verify_mode = ssl.CERT_NONE
 415         try:
 416             return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
 417         except TypeError:
 418             # Python 2.7.8
 419             # (create_default_context present but HTTPSHandler has no context=)
 420             pass
 421
 422     if sys.version_info < (3, 2):
 423         return YoutubeDLHTTPSHandler(params, **kwargs)
 424     else:  # Python < 3.4
 425         context = ssl.SSLContext(ssl.PROTOCOL_TLSv1)
 426         context.verify_mode = (ssl.CERT_NONE
 427                                if opts_no_check_certificate
 428                                else ssl.CERT_REQUIRED)
 429         context.set_default_verify_paths()
 430         return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
 431
 432
 433 class ExtractorError(Exception):
 434     """Error during info extraction."""
 435
 436     def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
 437         """ tb, if given, is the original traceback (so that it can be printed out).
 438         If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
 439         """
 440
 441         if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
 442             expected = True
 443         if video_id is not None:
 444             msg = video_id + ': ' + msg
 445         if cause:
 446             msg += ' (caused by %r)' % cause
 447         if not expected:
 448             if ytdl_is_updateable():
 449                 update_cmd = 'type  youtube-dl -U  to update'
 450             else:
 451                 update_cmd = 'see  https://yt-dl.org/update  on how to update'
 452             msg += '; please report this issue on https://yt-dl.org/bug .'
 453             msg += ' Make sure you are using the latest version; %s.' % update_cmd
 454             msg += ' Be sure to call youtube-dl with the --verbose flag and include its complete output.'
 455         super(ExtractorError, self).__init__(msg)
 456
 457         self.traceback = tb
 458         self.exc_info = sys.exc_info()  # preserve original exception
 459         self.cause = cause
 460         self.video_id = video_id
 461
 462     def format_traceback(self):
 463         if self.traceback is None:
 464             return None
 465         return ''.join(traceback.format_tb(self.traceback))
 466
 467
 468 class UnsupportedError(ExtractorError):
 469     def __init__(self, url):
 470         super(UnsupportedError, self).__init__(
 471             'Unsupported URL: %s' % url, expected=True)
 472         self.url = url
 473
 474
 475 class RegexNotFoundError(ExtractorError):
 476     """Error when a regex didn't match"""
 477     pass
 478
 479
 480 class DownloadError(Exception):
 481     """Download Error exception.
 482
 483     This exception may be thrown by FileDownloader objects if they are not
 484     configured to continue on errors. They will contain the appropriate
 485     error message.
 486     """
 487
 488     def __init__(self, msg, exc_info=None):
 489         """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
 490         super(DownloadError, self).__init__(msg)
 491         self.exc_info = exc_info
 492
 493
 494 class SameFileError(Exception):
 495     """Same File exception.
 496
 497     This exception will be thrown by FileDownloader objects if they detect
 498     multiple files would have to be downloaded to the same file on disk.
 499     """
 500     pass
 501
 502
 503 class PostProcessingError(Exception):
 504     """Post Processing exception.
 505
 506     This exception may be raised by PostProcessor's .run() method to
 507     indicate an error in the postprocessing task.
 508     """
 509
 510     def __init__(self, msg):
 511         self.msg = msg
 512
 513
 514 class MaxDownloadsReached(Exception):
 515     """ --max-downloads limit has been reached. """
 516     pass
 517
 518
 519 class UnavailableVideoError(Exception):
 520     """Unavailable Format exception.
 521
 522     This exception will be thrown when a video is requested
 523     in a format that is not available for that video.
 524     """
 525     pass
 526
 527
 528 class ContentTooShortError(Exception):
 529     """Content Too Short exception.
 530
 531     This exception may be raised by FileDownloader objects when a file they
 532     download is too small for what the server announced first, indicating
 533     the connection was probably interrupted.
 534     """
 535     # Both in bytes
 536     downloaded = None
 537     expected = None
 538
 539     def __init__(self, downloaded, expected):
 540         self.downloaded = downloaded
 541         self.expected = expected
 542
 543
 544 def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
 545     hc = http_class(*args, **kwargs)
 546     source_address = ydl_handler._params.get('source_address')
 547     if source_address is not None:
 548         sa = (source_address, 0)
 549         if hasattr(hc, 'source_address'):  # Python 2.7+
 550             hc.source_address = sa
 551         else:  # Python 2.6
 552             def _hc_connect(self, *args, **kwargs):
 553                 sock = compat_socket_create_connection(
 554                     (self.host, self.port), self.timeout, sa)
 555                 if is_https:
 556                     self.sock = ssl.wrap_socket(
 557                         sock, self.key_file, self.cert_file,
 558                         ssl_version=ssl.PROTOCOL_TLSv1)
 559                 else:
 560                     self.sock = sock
 561             hc.connect = functools.partial(_hc_connect, hc)
 562
 563     return hc
 564
 565
 566 class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
 567     """Handler for HTTP requests and responses.
 568
 569     This class, when installed with an OpenerDirector, automatically adds
 570     the standard headers to every HTTP request and handles gzipped and
 571     deflated responses from web servers. If compression is to be avoided in
 572     a particular request, the original request in the program code only has
 573     to include the HTTP header "Youtubedl-No-Compression", which will be
 574     removed before making the real request.
 575
 576     Part of this code was copied from:
 577
 578     http://techknack.net/python-urllib2-handlers/
 579
 580     Andrew Rowls, the author of that code, agreed to release it to the
 581     public domain.
 582     """
 583
 584     def __init__(self, params, *args, **kwargs):
 585         compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
 586         self._params = params
 587
 588     def http_open(self, req):
 589         return self.do_open(functools.partial(
 590             _create_http_connection, self, compat_http_client.HTTPConnection, False),
 591             req)
 592
 593     @staticmethod
 594     def deflate(data):
 595         try:
 596             return zlib.decompress(data, -zlib.MAX_WBITS)
 597         except zlib.error:
 598             return zlib.decompress(data)
 599
 600     @staticmethod
 601     def addinfourl_wrapper(stream, headers, url, code):
 602         if hasattr(compat_urllib_request.addinfourl, 'getcode'):
 603             return compat_urllib_request.addinfourl(stream, headers, url, code)
 604         ret = compat_urllib_request.addinfourl(stream, headers, url)
 605         ret.code = code
 606         return ret
 607
 608     def http_request(self, req):
 609         for h, v in std_headers.items():
 610             # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
 611             # The dict keys are capitalized because of this bug by urllib
 612             if h.capitalize() not in req.headers:
 613                 req.add_header(h, v)
 614         if 'Youtubedl-no-compression' in req.headers:
 615             if 'Accept-encoding' in req.headers:
 616                 del req.headers['Accept-encoding']
 617             del req.headers['Youtubedl-no-compression']
 618
 619         if sys.version_info < (2, 7) and '#' in req.get_full_url():
 620             # Python 2.6 is brain-dead when it comes to fragments
 621             req._Request__original = req._Request__original.partition('#')[0]
 622             req._Request__r_type = req._Request__r_type.partition('#')[0]
 623
 624         return req
 625
 626     def http_response(self, req, resp):
 627         old_resp = resp
 628         # gzip
 629         if resp.headers.get('Content-encoding', '') == 'gzip':
 630             content = resp.read()
 631             gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
 632             try:
 633                 uncompressed = io.BytesIO(gz.read())
 634             except IOError as original_ioerror:
 635                 # There may be junk add the end of the file
 636                 # See http://stackoverflow.com/q/4928560/35070 for details
 637                 for i in range(1, 1024):
 638                     try:
 639                         gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
 640                         uncompressed = io.BytesIO(gz.read())
 641                     except IOError:
 642                         continue
 643                     break
 644                 else:
 645                     raise original_ioerror
 646             resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
 647             resp.msg = old_resp.msg
 648         # deflate
 649         if resp.headers.get('Content-encoding', '') == 'deflate':
 650             gz = io.BytesIO(self.deflate(resp.read()))
 651             resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
 652             resp.msg = old_resp.msg
 653         return resp
 654
 655     https_request = http_request
 656     https_response = http_response
 657
 658
 659 class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
 660     def __init__(self, params, https_conn_class=None, *args, **kwargs):
 661         compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
 662         self._https_conn_class = https_conn_class or compat_http_client.HTTPSConnection
 663         self._params = params
 664
 665     def https_open(self, req):
 666         kwargs = {}
 667         if hasattr(self, '_context'):  # python > 2.6
 668             kwargs['context'] = self._context
 669         if hasattr(self, '_check_hostname'):  # python 3.x
 670             kwargs['check_hostname'] = self._check_hostname
 671         return self.do_open(functools.partial(
 672             _create_http_connection, self, self._https_conn_class, True),
 673             req, **kwargs)
 674
 675
 676 def parse_iso8601(date_str, delimiter='T', timezone=None):
 677     """ Return a UNIX timestamp from the given date """
 678
 679     if date_str is None:
 680         return None
 681
 682     if timezone is None:
 683         m = re.search(
 684             r'(\.[0-9]+)?(?:Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
 685             date_str)
 686         if not m:
 687             timezone = datetime.timedelta()
 688         else:
 689             date_str = date_str[:-len(m.group(0))]
 690             if not m.group('sign'):
 691                 timezone = datetime.timedelta()
 692             else:
 693                 sign = 1 if m.group('sign') == '+' else -1
 694                 timezone = datetime.timedelta(
 695                     hours=sign * int(m.group('hours')),
 696                     minutes=sign * int(m.group('minutes')))
 697     date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
 698     dt = datetime.datetime.strptime(date_str, date_format) - timezone
 699     return calendar.timegm(dt.timetuple())
 700
 701
 702 def unified_strdate(date_str, day_first=True):
 703     """Return a string with the date in the format YYYYMMDD"""
 704
 705     if date_str is None:
 706         return None
 707     upload_date = None
 708     # Replace commas
 709     date_str = date_str.replace(',', ' ')
 710     # %z (UTC offset) is only supported in python>=3.2
 711     date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str)
 712     # Remove AM/PM + timezone
 713     date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
 714
 715     format_expressions = [
 716         '%d %B %Y',
 717         '%d %b %Y',
 718         '%B %d %Y',
 719         '%b %d %Y',
 720         '%b %dst %Y %I:%M%p',
 721         '%b %dnd %Y %I:%M%p',
 722         '%b %dth %Y %I:%M%p',
 723         '%Y %m %d',
 724         '%Y-%m-%d',
 725         '%Y/%m/%d',
 726         '%Y/%m/%d %H:%M:%S',
 727         '%Y-%m-%d %H:%M:%S',
 728         '%Y-%m-%d %H:%M:%S.%f',
 729         '%d.%m.%Y %H:%M',
 730         '%d.%m.%Y %H.%M',
 731         '%Y-%m-%dT%H:%M:%SZ',
 732         '%Y-%m-%dT%H:%M:%S.%fZ',
 733         '%Y-%m-%dT%H:%M:%S.%f0Z',
 734         '%Y-%m-%dT%H:%M:%S',
 735         '%Y-%m-%dT%H:%M:%S.%f',
 736         '%Y-%m-%dT%H:%M',
 737     ]
 738     if day_first:
 739         format_expressions.extend([
 740             '%d.%m.%Y',
 741             '%d/%m/%Y',
 742             '%d/%m/%y',
 743             '%d/%m/%Y %H:%M:%S',
 744         ])
 745     else:
 746         format_expressions.extend([
 747             '%m.%d.%Y',
 748             '%m/%d/%Y',
 749             '%m/%d/%y',
 750             '%m/%d/%Y %H:%M:%S',
 751         ])
 752     for expression in format_expressions:
 753         try:
 754             upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
 755         except ValueError:
 756             pass
 757     if upload_date is None:
 758         timetuple = email.utils.parsedate_tz(date_str)
 759         if timetuple:
 760             upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
 761     return upload_date
 762
 763
 764 def determine_ext(url, default_ext='unknown_video'):
 765     if url is None:
 766         return default_ext
 767     guess = url.partition('?')[0].rpartition('.')[2]
 768     if re.match(r'^[A-Za-z0-9]+$', guess):
 769         return guess
 770     else:
 771         return default_ext
 772
 773
 774 def subtitles_filename(filename, sub_lang, sub_format):
 775     return filename.rsplit('.', 1)[0] + '.' + sub_lang + '.' + sub_format
 776
 777
 778 def date_from_str(date_str):
 779     """
 780     Return a datetime object from a string in the format YYYYMMDD or
 781     (now|today)[+-][0-9](day|week|month|year)(s)?"""
 782     today = datetime.date.today()
 783     if date_str in ('now', 'today'):
 784         return today
 785     if date_str == 'yesterday':
 786         return today - datetime.timedelta(days=1)
 787     match = re.match('(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
 788     if match is not None:
 789         sign = match.group('sign')
 790         time = int(match.group('time'))
 791         if sign == '-':
 792             time = -time
 793         unit = match.group('unit')
 794         # A bad aproximation?
 795         if unit == 'month':
 796             unit = 'day'
 797             time *= 30
 798         elif unit == 'year':
 799             unit = 'day'
 800             time *= 365
 801         unit += 's'
 802         delta = datetime.timedelta(**{unit: time})
 803         return today + delta
 804     return datetime.datetime.strptime(date_str, "%Y%m%d").date()
 805
 806
 807 def hyphenate_date(date_str):
 808     """
 809     Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
 810     match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
 811     if match is not None:
 812         return '-'.join(match.groups())
 813     else:
 814         return date_str
 815
 816
 817 class DateRange(object):
 818     """Represents a time interval between two dates"""
 819
 820     def __init__(self, start=None, end=None):
 821         """start and end must be strings in the format accepted by date"""
 822         if start is not None:
 823             self.start = date_from_str(start)
 824         else:
 825             self.start = datetime.datetime.min.date()
 826         if end is not None:
 827             self.end = date_from_str(end)
 828         else:
 829             self.end = datetime.datetime.max.date()
 830         if self.start > self.end:
 831             raise ValueError('Date range: "%s" , the start date must be before the end date' % self)
 832
 833     @classmethod
 834     def day(cls, day):
 835         """Returns a range that only contains the given day"""
 836         return cls(day, day)
 837
 838     def __contains__(self, date):
 839         """Check if the date is in the range"""
 840         if not isinstance(date, datetime.date):
 841             date = date_from_str(date)
 842         return self.start <= date <= self.end
 843
 844     def __str__(self):
 845         return '%s - %s' % (self.start.isoformat(), self.end.isoformat())
 846
 847
 848 def platform_name():
 849     """ Returns the platform name as a compat_str """
 850     res = platform.platform()
 851     if isinstance(res, bytes):
 852         res = res.decode(preferredencoding())
 853
 854     assert isinstance(res, compat_str)
 855     return res
 856
 857
 858 def _windows_write_string(s, out):
 859     """ Returns True if the string was written using special methods,
 860     False if it has yet to be written out."""
 861     # Adapted from http://stackoverflow.com/a/3259271/35070
 862
 863     import ctypes
 864     import ctypes.wintypes
 865
 866     WIN_OUTPUT_IDS = {
 867         1: -11,
 868         2: -12,
 869     }
 870
 871     try:
 872         fileno = out.fileno()
 873     except AttributeError:
 874         # If the output stream doesn't have a fileno, it's virtual
 875         return False
 876     except io.UnsupportedOperation:
 877         # Some strange Windows pseudo files?
 878         return False
 879     if fileno not in WIN_OUTPUT_IDS:
 880         return False
 881
 882     GetStdHandle = ctypes.WINFUNCTYPE(
 883         ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
 884         (b"GetStdHandle", ctypes.windll.kernel32))
 885     h = GetStdHandle(WIN_OUTPUT_IDS[fileno])
 886
 887     WriteConsoleW = ctypes.WINFUNCTYPE(
 888         ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
 889         ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
 890         ctypes.wintypes.LPVOID)((b"WriteConsoleW", ctypes.windll.kernel32))
 891     written = ctypes.wintypes.DWORD(0)
 892
 893     GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)((b"GetFileType", ctypes.windll.kernel32))
 894     FILE_TYPE_CHAR = 0x0002
 895     FILE_TYPE_REMOTE = 0x8000
 896     GetConsoleMode = ctypes.WINFUNCTYPE(
 897         ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
 898         ctypes.POINTER(ctypes.wintypes.DWORD))(
 899         (b"GetConsoleMode", ctypes.windll.kernel32))
 900     INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value
 901
 902     def not_a_console(handle):
 903         if handle == INVALID_HANDLE_VALUE or handle is None:
 904             return True
 905         return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR or
 906                 GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)
 907
 908     if not_a_console(h):
 909         return False
 910
 911     def next_nonbmp_pos(s):
 912         try:
 913             return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
 914         except StopIteration:
 915             return len(s)
 916
 917     while s:
 918         count = min(next_nonbmp_pos(s), 1024)
 919
 920         ret = WriteConsoleW(
 921             h, s, count if count else 2, ctypes.byref(written), None)
 922         if ret == 0:
 923             raise OSError('Failed to write string')
 924         if not count:  # We just wrote a non-BMP character
 925             assert written.value == 2
 926             s = s[1:]
 927         else:
 928             assert written.value > 0
 929             s = s[written.value:]
 930     return True
 931
 932
 933 def write_string(s, out=None, encoding=None):
 934     if out is None:
 935         out = sys.stderr
 936     assert type(s) == compat_str
 937
 938     if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
 939         if _windows_write_string(s, out):
 940             return
 941
 942     if ('b' in getattr(out, 'mode', '') or
 943             sys.version_info[0] < 3):  # Python 2 lies about mode of sys.stderr
 944         byt = s.encode(encoding or preferredencoding(), 'ignore')
 945         out.write(byt)
 946     elif hasattr(out, 'buffer'):
 947         enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
 948         byt = s.encode(enc, 'ignore')
 949         out.buffer.write(byt)
 950     else:
 951         out.write(s)
 952     out.flush()
 953
 954
 955 def bytes_to_intlist(bs):
 956     if not bs:
 957         return []
 958     if isinstance(bs[0], int):  # Python 3
 959         return list(bs)
 960     else:
 961         return [ord(c) for c in bs]
 962
 963
 964 def intlist_to_bytes(xs):
 965     if not xs:
 966         return b''
 967     return struct_pack('%dB' % len(xs), *xs)
 968
 969
 970 # Cross-platform file locking
 971 if sys.platform == 'win32':
 972     import ctypes.wintypes
 973     import msvcrt
 974
 975     class OVERLAPPED(ctypes.Structure):
 976         _fields_ = [
 977             ('Internal', ctypes.wintypes.LPVOID),
 978             ('InternalHigh', ctypes.wintypes.LPVOID),
 979             ('Offset', ctypes.wintypes.DWORD),
 980             ('OffsetHigh', ctypes.wintypes.DWORD),
 981             ('hEvent', ctypes.wintypes.HANDLE),
 982         ]
 983
 984     kernel32 = ctypes.windll.kernel32
 985     LockFileEx = kernel32.LockFileEx
 986     LockFileEx.argtypes = [
 987         ctypes.wintypes.HANDLE,     # hFile
 988         ctypes.wintypes.DWORD,      # dwFlags
 989         ctypes.wintypes.DWORD,      # dwReserved
 990         ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
 991         ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
 992         ctypes.POINTER(OVERLAPPED)  # Overlapped
 993     ]
 994     LockFileEx.restype = ctypes.wintypes.BOOL
 995     UnlockFileEx = kernel32.UnlockFileEx
 996     UnlockFileEx.argtypes = [
 997         ctypes.wintypes.HANDLE,     # hFile
 998         ctypes.wintypes.DWORD,      # dwReserved
 999         ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
1000         ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
1001         ctypes.POINTER(OVERLAPPED)  # Overlapped
1002     ]
1003     UnlockFileEx.restype = ctypes.wintypes.BOOL
1004     whole_low = 0xffffffff
1005     whole_high = 0x7fffffff
1006
1007     def _lock_file(f, exclusive):
1008         overlapped = OVERLAPPED()
1009         overlapped.Offset = 0
1010         overlapped.OffsetHigh = 0
1011         overlapped.hEvent = 0
1012         f._lock_file_overlapped_p = ctypes.pointer(overlapped)
1013         handle = msvcrt.get_osfhandle(f.fileno())
1014         if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
1015                           whole_low, whole_high, f._lock_file_overlapped_p):
1016             raise OSError('Locking file failed: %r' % ctypes.FormatError())
1017
1018     def _unlock_file(f):
1019         assert f._lock_file_overlapped_p
1020         handle = msvcrt.get_osfhandle(f.fileno())
1021         if not UnlockFileEx(handle, 0,
1022                             whole_low, whole_high, f._lock_file_overlapped_p):
1023             raise OSError('Unlocking file failed: %r' % ctypes.FormatError())
1024
1025 else:
1026     import fcntl
1027
1028     def _lock_file(f, exclusive):
1029         fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)
1030
1031     def _unlock_file(f):
1032         fcntl.flock(f, fcntl.LOCK_UN)
1033
1034
1035 class locked_file(object):
1036     def __init__(self, filename, mode, encoding=None):
1037         assert mode in ['r', 'a', 'w']
1038         self.f = io.open(filename, mode, encoding=encoding)
1039         self.mode = mode
1040
1041     def __enter__(self):
1042         exclusive = self.mode != 'r'
1043         try:
1044             _lock_file(self.f, exclusive)
1045         except IOError:
1046             self.f.close()
1047             raise
1048         return self
1049
1050     def __exit__(self, etype, value, traceback):
1051         try:
1052             _unlock_file(self.f)
1053         finally:
1054             self.f.close()
1055
1056     def __iter__(self):
1057         return iter(self.f)
1058
1059     def write(self, *args):
1060         return self.f.write(*args)
1061
1062     def read(self, *args):
1063         return self.f.read(*args)
1064
1065
1066 def get_filesystem_encoding():
1067     encoding = sys.getfilesystemencoding()
1068     return encoding if encoding is not None else 'utf-8'
1069
1070
1071 def shell_quote(args):
1072     quoted_args = []
1073     encoding = get_filesystem_encoding()
1074     for a in args:
1075         if isinstance(a, bytes):
1076             # We may get a filename encoded with 'encodeFilename'
1077             a = a.decode(encoding)
1078         quoted_args.append(pipes.quote(a))
1079     return ' '.join(quoted_args)
1080
1081
1082 def takewhile_inclusive(pred, seq):
1083     """ Like itertools.takewhile, but include the latest evaluated element
1084         (the first element so that Not pred(e)) """
1085     for e in seq:
1086         yield e
1087         if not pred(e):
1088             return
1089
1090
1091 def smuggle_url(url, data):
1092     """ Pass additional data in a URL for internal use. """
1093
1094     sdata = compat_urllib_parse.urlencode(
1095         {'__youtubedl_smuggle': json.dumps(data)})
1096     return url + '#' + sdata
1097
1098
1099 def unsmuggle_url(smug_url, default=None):
1100     if '#__youtubedl_smuggle' not in smug_url:
1101         return smug_url, default
1102     url, _, sdata = smug_url.rpartition('#')
1103     jsond = compat_parse_qs(sdata)['__youtubedl_smuggle'][0]
1104     data = json.loads(jsond)
1105     return url, data
1106
1107
1108 def format_bytes(bytes):
1109     if bytes is None:
1110         return 'N/A'
1111     if type(bytes) is str:
1112         bytes = float(bytes)
1113     if bytes == 0.0:
1114         exponent = 0
1115     else:
1116         exponent = int(math.log(bytes, 1024.0))
1117     suffix = ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB'][exponent]
1118     converted = float(bytes) / float(1024 ** exponent)
1119     return '%.2f%s' % (converted, suffix)
1120
1121
1122 def parse_filesize(s):
1123     if s is None:
1124         return None
1125
1126     # The lower-case forms are of course incorrect and inofficial,
1127     # but we support those too
1128     _UNIT_TABLE = {
1129         'B': 1,
1130         'b': 1,
1131         'KiB': 1024,
1132         'KB': 1000,
1133         'kB': 1024,
1134         'Kb': 1000,
1135         'MiB': 1024 ** 2,
1136         'MB': 1000 ** 2,
1137         'mB': 1024 ** 2,
1138         'Mb': 1000 ** 2,
1139         'GiB': 1024 ** 3,
1140         'GB': 1000 ** 3,
1141         'gB': 1024 ** 3,
1142         'Gb': 1000 ** 3,
1143         'TiB': 1024 ** 4,
1144         'TB': 1000 ** 4,
1145         'tB': 1024 ** 4,
1146         'Tb': 1000 ** 4,
1147         'PiB': 1024 ** 5,
1148         'PB': 1000 ** 5,
1149         'pB': 1024 ** 5,
1150         'Pb': 1000 ** 5,
1151         'EiB': 1024 ** 6,
1152         'EB': 1000 ** 6,
1153         'eB': 1024 ** 6,
1154         'Eb': 1000 ** 6,
1155         'ZiB': 1024 ** 7,
1156         'ZB': 1000 ** 7,
1157         'zB': 1024 ** 7,
1158         'Zb': 1000 ** 7,
1159         'YiB': 1024 ** 8,
1160         'YB': 1000 ** 8,
1161         'yB': 1024 ** 8,
1162         'Yb': 1000 ** 8,
1163     }
1164
1165     units_re = '|'.join(re.escape(u) for u in _UNIT_TABLE)
1166     m = re.match(
1167         r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)' % units_re, s)
1168     if not m:
1169         return None
1170
1171     num_str = m.group('num').replace(',', '.')
1172     mult = _UNIT_TABLE[m.group('unit')]
1173     return int(float(num_str) * mult)
1174
1175
1176 def month_by_name(name):
1177     """ Return the number of a month by (locale-independently) English name """
1178
1179     try:
1180         return ENGLISH_MONTH_NAMES.index(name) + 1
1181     except ValueError:
1182         return None
1183
1184
1185 def month_by_abbreviation(abbrev):
1186     """ Return the number of a month by (locale-independently) English
1187         abbreviations """
1188
1189     try:
1190         return [s[:3] for s in ENGLISH_MONTH_NAMES].index(abbrev) + 1
1191     except ValueError:
1192         return None
1193
1194
1195 def fix_xml_ampersands(xml_str):
1196     """Replace all the '&' by '&amp;' in XML"""
1197     return re.sub(
1198         r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
1199         '&amp;',
1200         xml_str)
1201
1202
1203 def setproctitle(title):
1204     assert isinstance(title, compat_str)
1205     try:
1206         libc = ctypes.cdll.LoadLibrary("libc.so.6")
1207     except OSError:
1208         return
1209     title_bytes = title.encode('utf-8')
1210     buf = ctypes.create_string_buffer(len(title_bytes))
1211     buf.value = title_bytes
1212     try:
1213         libc.prctl(15, buf, 0, 0, 0)
1214     except AttributeError:
1215         return  # Strange libc, just skip this
1216
1217
1218 def remove_start(s, start):
1219     if s.startswith(start):
1220         return s[len(start):]
1221     return s
1222
1223
1224 def remove_end(s, end):
1225     if s.endswith(end):
1226         return s[:-len(end)]
1227     return s
1228
1229
1230 def url_basename(url):
1231     path = compat_urlparse.urlparse(url).path
1232     return path.strip('/').split('/')[-1]
1233
1234
1235 class HEADRequest(compat_urllib_request.Request):
1236     def get_method(self):
1237         return "HEAD"
1238
1239
1240 def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
1241     if get_attr:
1242         if v is not None:
1243             v = getattr(v, get_attr, None)
1244     if v == '':
1245         v = None
1246     return default if v is None else (int(v) * invscale // scale)
1247
1248
1249 def str_or_none(v, default=None):
1250     return default if v is None else compat_str(v)
1251
1252
1253 def str_to_int(int_str):
1254     """ A more relaxed version of int_or_none """
1255     if int_str is None:
1256         return None
1257     int_str = re.sub(r'[,\.\+]', '', int_str)
1258     return int(int_str)
1259
1260
1261 def float_or_none(v, scale=1, invscale=1, default=None):
1262     return default if v is None else (float(v) * invscale / scale)
1263
1264
1265 def parse_duration(s):
1266     if not isinstance(s, compat_basestring):
1267         return None
1268
1269     s = s.strip()
1270
1271     m = re.match(
1272         r'''(?ix)(?:P?T)?
1273         (?:
1274             (?P<only_mins>[0-9.]+)\s*(?:mins?|minutes?)\s*|
1275             (?P<only_hours>[0-9.]+)\s*(?:hours?)|
1276
1277             \s*(?P<hours_reversed>[0-9]+)\s*(?:[:h]|hours?)\s*(?P<mins_reversed>[0-9]+)\s*(?:[:m]|mins?|minutes?)\s*|
1278             (?:
1279                 (?:
1280                     (?:(?P<days>[0-9]+)\s*(?:[:d]|days?)\s*)?
1281                     (?P<hours>[0-9]+)\s*(?:[:h]|hours?)\s*
1282                 )?
1283                 (?P<mins>[0-9]+)\s*(?:[:m]|mins?|minutes?)\s*
1284             )?
1285             (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*(?:s|secs?|seconds?)?
1286         )$''', s)
1287     if not m:
1288         return None
1289     res = 0
1290     if m.group('only_mins'):
1291         return float_or_none(m.group('only_mins'), invscale=60)
1292     if m.group('only_hours'):
1293         return float_or_none(m.group('only_hours'), invscale=60 * 60)
1294     if m.group('secs'):
1295         res += int(m.group('secs'))
1296     if m.group('mins_reversed'):
1297         res += int(m.group('mins_reversed')) * 60
1298     if m.group('mins'):
1299         res += int(m.group('mins')) * 60
1300     if m.group('hours'):
1301         res += int(m.group('hours')) * 60 * 60
1302     if m.group('hours_reversed'):
1303         res += int(m.group('hours_reversed')) * 60 * 60
1304     if m.group('days'):
1305         res += int(m.group('days')) * 24 * 60 * 60
1306     if m.group('ms'):
1307         res += float(m.group('ms'))
1308     return res
1309
1310
1311 def prepend_extension(filename, ext):
1312     name, real_ext = os.path.splitext(filename)
1313     return '{0}.{1}{2}'.format(name, ext, real_ext)
1314
1315
1316 def check_executable(exe, args=[]):
1317     """ Checks if the given binary is installed somewhere in PATH, and returns its name.
1318     args can be a list of arguments for a short output (like -version) """
1319     try:
1320         subprocess.Popen([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
1321     except OSError:
1322         return False
1323     return exe
1324
1325
1326 def get_exe_version(exe, args=['--version'],
1327                     version_re=None, unrecognized='present'):
1328     """ Returns the version of the specified executable,
1329     or False if the executable is not present """
1330     try:
1331         out, _ = subprocess.Popen(
1332             [exe] + args,
1333             stdout=subprocess.PIPE, stderr=subprocess.STDOUT).communicate()
1334     except OSError:
1335         return False
1336     if isinstance(out, bytes):  # Python 2.x
1337         out = out.decode('ascii', 'ignore')
1338     return detect_exe_version(out, version_re, unrecognized)
1339
1340
1341 def detect_exe_version(output, version_re=None, unrecognized='present'):
1342     assert isinstance(output, compat_str)
1343     if version_re is None:
1344         version_re = r'version\s+([-0-9._a-zA-Z]+)'
1345     m = re.search(version_re, output)
1346     if m:
1347         return m.group(1)
1348     else:
1349         return unrecognized
1350
1351
1352 class PagedList(object):
1353     def __len__(self):
1354         # This is only useful for tests
1355         return len(self.getslice())
1356
1357
1358 class OnDemandPagedList(PagedList):
1359     def __init__(self, pagefunc, pagesize):
1360         self._pagefunc = pagefunc
1361         self._pagesize = pagesize
1362
1363     def getslice(self, start=0, end=None):
1364         res = []
1365         for pagenum in itertools.count(start // self._pagesize):
1366             firstid = pagenum * self._pagesize
1367             nextfirstid = pagenum * self._pagesize + self._pagesize
1368             if start >= nextfirstid:
1369                 continue
1370
1371             page_results = list(self._pagefunc(pagenum))
1372
1373             startv = (
1374                 start % self._pagesize
1375                 if firstid <= start < nextfirstid
1376                 else 0)
1377
1378             endv = (
1379                 ((end - 1) % self._pagesize) + 1
1380                 if (end is not None and firstid <= end <= nextfirstid)
1381                 else None)
1382
1383             if startv != 0 or endv is not None:
1384                 page_results = page_results[startv:endv]
1385             res.extend(page_results)
1386
1387             # A little optimization - if current page is not "full", ie. does
1388             # not contain page_size videos then we can assume that this page
1389             # is the last one - there are no more ids on further pages -
1390             # i.e. no need to query again.
1391             if len(page_results) + startv < self._pagesize:
1392                 break
1393
1394             # If we got the whole page, but the next page is not interesting,
1395             # break out early as well
1396             if end == nextfirstid:
1397                 break
1398         return res
1399
1400
1401 class InAdvancePagedList(PagedList):
1402     def __init__(self, pagefunc, pagecount, pagesize):
1403         self._pagefunc = pagefunc
1404         self._pagecount = pagecount
1405         self._pagesize = pagesize
1406
1407     def getslice(self, start=0, end=None):
1408         res = []
1409         start_page = start // self._pagesize
1410         end_page = (
1411             self._pagecount if end is None else (end // self._pagesize + 1))
1412         skip_elems = start - start_page * self._pagesize
1413         only_more = None if end is None else end - start
1414         for pagenum in range(start_page, end_page):
1415             page = list(self._pagefunc(pagenum))
1416             if skip_elems:
1417                 page = page[skip_elems:]
1418                 skip_elems = None
1419             if only_more is not None:
1420                 if len(page) < only_more:
1421                     only_more -= len(page)
1422                 else:
1423                     page = page[:only_more]
1424                     res.extend(page)
1425                     break
1426             res.extend(page)
1427         return res
1428
1429
1430 def uppercase_escape(s):
1431     unicode_escape = codecs.getdecoder('unicode_escape')
1432     return re.sub(
1433         r'\\U[0-9a-fA-F]{8}',
1434         lambda m: unicode_escape(m.group(0))[0],
1435         s)
1436
1437
1438 def escape_rfc3986(s):
1439     """Escape non-ASCII characters as suggested by RFC 3986"""
1440     if sys.version_info < (3, 0) and isinstance(s, compat_str):
1441         s = s.encode('utf-8')
1442     return compat_urllib_parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")
1443
1444
1445 def escape_url(url):
1446     """Escape URL as suggested by RFC 3986"""
1447     url_parsed = compat_urllib_parse_urlparse(url)
1448     return url_parsed._replace(
1449         path=escape_rfc3986(url_parsed.path),
1450         params=escape_rfc3986(url_parsed.params),
1451         query=escape_rfc3986(url_parsed.query),
1452         fragment=escape_rfc3986(url_parsed.fragment)
1453     ).geturl()
1454
1455 try:
1456     struct.pack('!I', 0)
1457 except TypeError:
1458     # In Python 2.6 (and some 2.7 versions), struct requires a bytes argument
1459     def struct_pack(spec, *args):
1460         if isinstance(spec, compat_str):
1461             spec = spec.encode('ascii')
1462         return struct.pack(spec, *args)
1463
1464     def struct_unpack(spec, *args):
1465         if isinstance(spec, compat_str):
1466             spec = spec.encode('ascii')
1467         return struct.unpack(spec, *args)
1468 else:
1469     struct_pack = struct.pack
1470     struct_unpack = struct.unpack
1471
1472
1473 def read_batch_urls(batch_fd):
1474     def fixup(url):
1475         if not isinstance(url, compat_str):
1476             url = url.decode('utf-8', 'replace')
1477         BOM_UTF8 = '\xef\xbb\xbf'
1478         if url.startswith(BOM_UTF8):
1479             url = url[len(BOM_UTF8):]
1480         url = url.strip()
1481         if url.startswith(('#', ';', ']')):
1482             return False
1483         return url
1484
1485     with contextlib.closing(batch_fd) as fd:
1486         return [url for url in map(fixup, fd) if url]
1487
1488
1489 def urlencode_postdata(*args, **kargs):
1490     return compat_urllib_parse.urlencode(*args, **kargs).encode('ascii')
1491
1492
1493 try:
1494     etree_iter = xml.etree.ElementTree.Element.iter
1495 except AttributeError:  # Python <=2.6
1496     etree_iter = lambda n: n.findall('.//*')
1497
1498
1499 def parse_xml(s):
1500     class TreeBuilder(xml.etree.ElementTree.TreeBuilder):
1501         def doctype(self, name, pubid, system):
1502             pass  # Ignore doctypes
1503
1504     parser = xml.etree.ElementTree.XMLParser(target=TreeBuilder())
1505     kwargs = {'parser': parser} if sys.version_info >= (2, 7) else {}
1506     tree = xml.etree.ElementTree.XML(s.encode('utf-8'), **kwargs)
1507     # Fix up XML parser in Python 2.x
1508     if sys.version_info < (3, 0):
1509         for n in etree_iter(tree):
1510             if n.text is not None:
1511                 if not isinstance(n.text, compat_str):
1512                     n.text = n.text.decode('utf-8')
1513     return tree
1514
1515
1516 US_RATINGS = {
1517     'G': 0,
1518     'PG': 10,
1519     'PG-13': 13,
1520     'R': 16,
1521     'NC': 18,
1522 }
1523
1524
1525 def parse_age_limit(s):
1526     if s is None:
1527         return None
1528     m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
1529     return int(m.group('age')) if m else US_RATINGS.get(s, None)
1530
1531
1532 def strip_jsonp(code):
1533     return re.sub(
1534         r'(?s)^[a-zA-Z0-9_]+\s*\(\s*(.*)\);?\s*?(?://[^\n]*)*$', r'\1', code)
1535
1536
1537 def js_to_json(code):
1538     def fix_kv(m):
1539         v = m.group(0)
1540         if v in ('true', 'false', 'null'):
1541             return v
1542         if v.startswith('"'):
1543             return v
1544         if v.startswith("'"):
1545             v = v[1:-1]
1546             v = re.sub(r"\\\\|\\'|\"", lambda m: {
1547                 '\\\\': '\\\\',
1548                 "\\'": "'",
1549                 '"': '\\"',
1550             }[m.group(0)], v)
1551         return '"%s"' % v
1552
1553     res = re.sub(r'''(?x)
1554         "(?:[^"\\]*(?:\\\\|\\['"nu]))*[^"\\]*"|
1555         '(?:[^'\\]*(?:\\\\|\\['"nu]))*[^'\\]*'|
1556         [a-zA-Z_][.a-zA-Z_0-9]*
1557         ''', fix_kv, code)
1558     res = re.sub(r',(\s*\])', lambda m: m.group(1), res)
1559     return res
1560
1561
1562 def qualities(quality_ids):
1563     """ Get a numeric quality value out of a list of possible values """
1564     def q(qid):
1565         try:
1566             return quality_ids.index(qid)
1567         except ValueError:
1568             return -1
1569     return q
1570
1571
1572 DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
1573
1574
1575 def limit_length(s, length):
1576     """ Add ellipses to overly long strings """
1577     if s is None:
1578         return None
1579     ELLIPSES = '...'
1580     if len(s) > length:
1581         return s[:length - len(ELLIPSES)] + ELLIPSES
1582     return s
1583
1584
1585 def version_tuple(v):
1586     return tuple(int(e) for e in re.split(r'[-.]', v))
1587
1588
1589 def is_outdated_version(version, limit, assume_new=True):
1590     if not version:
1591         return not assume_new
1592     try:
1593         return version_tuple(version) < version_tuple(limit)
1594     except ValueError:
1595         return not assume_new
1596
1597
1598 def ytdl_is_updateable():
1599     """ Returns if youtube-dl can be updated with -U """
1600     from zipimport import zipimporter
1601
1602     return isinstance(globals().get('__loader__'), zipimporter) or hasattr(sys, 'frozen')
1603
1604
1605 def args_to_str(args):
1606     # Get a short string representation for a subprocess command
1607     return ' '.join(shlex_quote(a) for a in args)
1608
1609
1610 def mimetype2ext(mt):
1611     _, _, res = mt.rpartition('/')
1612
1613     return {
1614         'x-ms-wmv': 'wmv',
1615         'x-mp4-fragmented': 'mp4',
1616     }.get(res, res)
1617
1618
1619 def urlhandle_detect_ext(url_handle):
1620     try:
1621         url_handle.headers
1622         getheader = lambda h: url_handle.headers[h]
1623     except AttributeError:  # Python < 3
1624         getheader = url_handle.info().getheader
1625
1626     cd = getheader('Content-Disposition')
1627     if cd:
1628         m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
1629         if m:
1630             e = determine_ext(m.group('filename'), default_ext=None)
1631             if e:
1632                 return e
1633
1634     return mimetype2ext(getheader('Content-Type'))
1635
1636
1637 def age_restricted(content_limit, age_limit):
1638     """ Returns True iff the content should be blocked """
1639
1640     if age_limit is None:  # No limit set
1641         return False
1642     if content_limit is None:
1643         return False  # Content available for everyone
1644     return age_limit < content_limit
1645
1646
1647 def is_html(first_bytes):
1648     """ Detect whether a file contains HTML by examining its first bytes. """
1649
1650     BOMS = [
1651         (b'\xef\xbb\xbf', 'utf-8'),
1652         (b'\x00\x00\xfe\xff', 'utf-32-be'),
1653         (b'\xff\xfe\x00\x00', 'utf-32-le'),
1654         (b'\xff\xfe', 'utf-16-le'),
1655         (b'\xfe\xff', 'utf-16-be'),
1656     ]
1657     for bom, enc in BOMS:
1658         if first_bytes.startswith(bom):
1659             s = first_bytes[len(bom):].decode(enc, 'replace')
1660             break
1661     else:
1662         s = first_bytes.decode('utf-8', 'replace')
1663
1664     return re.match(r'^\s*<', s)
1665
1666
1667 def determine_protocol(info_dict):
1668     protocol = info_dict.get('protocol')
1669     if protocol is not None:
1670         return protocol
1671
1672     url = info_dict['url']
1673     if url.startswith('rtmp'):
1674         return 'rtmp'
1675     elif url.startswith('mms'):
1676         return 'mms'
1677     elif url.startswith('rtsp'):
1678         return 'rtsp'
1679
1680     ext = determine_ext(url)
1681     if ext == 'm3u8':
1682         return 'm3u8'
1683     elif ext == 'f4m':
1684         return 'f4m'
1685
1686     return compat_urllib_parse_urlparse(url).scheme
1687
1688
1689 def render_table(header_row, data):
1690     """ Render a list of rows, each as a list of values """
1691     table = [header_row] + data
1692     max_lens = [max(len(compat_str(v)) for v in col) for col in zip(*table)]
1693     format_str = ' '.join('%-' + compat_str(ml + 1) + 's' for ml in max_lens[:-1]) + '%s'
1694     return '\n'.join(format_str % tuple(row) for row in table)
1695
1696
1697 def _match_one(filter_part, dct):
1698     COMPARISON_OPERATORS = {
1699         '<': operator.lt,
1700         '<=': operator.le,
1701         '>': operator.gt,
1702         '>=': operator.ge,
1703         '=': operator.eq,
1704         '!=': operator.ne,
1705     }
1706     operator_rex = re.compile(r'''(?x)\s*
1707         (?P<key>[a-z_]+)
1708         \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
1709         (?:
1710             (?P<intval>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)|
1711             (?P<strval>(?![0-9.])[a-z0-9A-Z]*)
1712         )
1713         \s*$
1714         ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
1715     m = operator_rex.search(filter_part)
1716     if m:
1717         op = COMPARISON_OPERATORS[m.group('op')]
1718         if m.group('strval') is not None:
1719             if m.group('op') not in ('=', '!='):
1720                 raise ValueError(
1721                     'Operator %s does not support string values!' % m.group('op'))
1722             comparison_value = m.group('strval')
1723         else:
1724             try:
1725                 comparison_value = int(m.group('intval'))
1726             except ValueError:
1727                 comparison_value = parse_filesize(m.group('intval'))
1728                 if comparison_value is None:
1729                     comparison_value = parse_filesize(m.group('intval') + 'B')
1730                 if comparison_value is None:
1731                     raise ValueError(
1732                         'Invalid integer value %r in filter part %r' % (
1733                             m.group('intval'), filter_part))
1734         actual_value = dct.get(m.group('key'))
1735         if actual_value is None:
1736             return m.group('none_inclusive')
1737         return op(actual_value, comparison_value)
1738
1739     UNARY_OPERATORS = {
1740         '': lambda v: v is not None,
1741         '!': lambda v: v is None,
1742     }
1743     operator_rex = re.compile(r'''(?x)\s*
1744         (?P<op>%s)\s*(?P<key>[a-z_]+)
1745         \s*$
1746         ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
1747     m = operator_rex.search(filter_part)
1748     if m:
1749         op = UNARY_OPERATORS[m.group('op')]
1750         actual_value = dct.get(m.group('key'))
1751         return op(actual_value)
1752
1753     raise ValueError('Invalid filter part %r' % filter_part)
1754
1755
1756 def match_str(filter_str, dct):
1757     """ Filter a dictionary with a simple string syntax. Returns True (=passes filter) or false """
1758
1759     return all(
1760         _match_one(filter_part, dct) for filter_part in filter_str.split('&'))
1761
1762
1763 def match_filter_func(filter_str):
1764     def _match_func(info_dict):
1765         if match_str(filter_str, info_dict):
1766             return None
1767         else:
1768             video_title = info_dict.get('title', info_dict.get('id', 'video'))
1769             return '%s does not pass filter %s, skipping ..' % (video_title, filter_str)
1770     return _match_func
1771
1772
1773 class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
1774     def __init__(self, proxies=None):
1775         # Set default handlers
1776         for type in ('http', 'https'):
1777             setattr(self, '%s_open' % type,
1778                     lambda r, proxy='__noproxy__', type=type, meth=self.proxy_open:
1779                         meth(r, proxy, type))
1780         return compat_urllib_request.ProxyHandler.__init__(self, proxies)
1781
1782     def proxy_open(self, req, proxy, type):
1783         req_proxy = req.headers.get('Ytdl-request-proxy')
1784         if req_proxy is not None:
1785             proxy = req_proxy
1786             del req.headers['Ytdl-request-proxy']
1787
1788         if proxy == '__noproxy__':
1789             return None  # No Proxy
1790         return compat_urllib_request.ProxyHandler.proxy_open(
1791             self, req, proxy, type)
1792
1793
1794 def url_sanitize_consecutive_slashes(url):
1795     """Sanitize URLs with consecutive slashes
1796
1797     For example, transform both
1798         http://hostname/foo//bar/filename.html
1799     and
1800         http://hostname//foo/bar/filename.html
1801     into
1802         http://hostname/foo/bar/filename.html
1803     """
1804     parsed_url = list(compat_urlparse.urlparse(url))
1805     parsed_url[2] = re.sub(r'/{2,}', '/', parsed_url[2])
1806     return compat_urlparse.urlunparse(parsed_url)