youtube_dl/utils.py

   1 #!/usr/bin/env python
   2 # -*- coding: utf-8 -*-
   3
   4 from __future__ import unicode_literals
   5
   6 import calendar
   7 import codecs
   8 import contextlib
   9 import ctypes
  10 import datetime
  11 import email.utils
  12 import errno
  13 import gzip
  14 import itertools
  15 import io
  16 import json
  17 import locale
  18 import math
  19 import os
  20 import pipes
  21 import platform
  22 import re
  23 import ssl
  24 import socket
  25 import struct
  26 import subprocess
  27 import sys
  28 import tempfile
  29 import traceback
  30 import xml.etree.ElementTree
  31 import zlib
  32
  33 from .compat import (
  34     compat_chr,
  35     compat_getenv,
  36     compat_html_entities,
  37     compat_parse_qs,
  38     compat_str,
  39     compat_urllib_error,
  40     compat_urllib_parse,
  41     compat_urllib_parse_urlparse,
  42     compat_urllib_request,
  43     compat_urlparse,
  44     shlex_quote,
  45 )
  46
  47
# Type of compiled regular expression objects. It is taken from an actual
# compiled pattern because it is not clearly defined otherwise.
compiled_regex_type = type(re.compile(''))

# Default HTTP request headers. YoutubeDLHandler.http_request adds each of
# these to an outgoing request that does not carry the header yet.
std_headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0 (Chrome)',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
}
  58
  59
  60 def preferredencoding():
  61     """Get preferred encoding.
  62
  63     Returns the best encoding scheme for the system, based on
  64     locale.getpreferredencoding() and some further tweaks.
  65     """
  66     try:
  67         pref = locale.getpreferredencoding()
  68         'TEST'.encode(pref)
  69     except:
  70         pref = 'UTF-8'
  71
  72     return pref
  73
  74
  75 def write_json_file(obj, fn):
  76     """ Encode obj as JSON and write it to fn, atomically if possible """
  77
  78     fn = encodeFilename(fn)
  79     if sys.version_info < (3, 0) and sys.platform != 'win32':
  80         encoding = get_filesystem_encoding()
  81         # os.path.basename returns a bytes object, but NamedTemporaryFile
  82         # will fail if the filename contains non ascii characters unless we
  83         # use a unicode object
  84         path_basename = lambda f: os.path.basename(fn).decode(encoding)
  85         # the same for os.path.dirname
  86         path_dirname = lambda f: os.path.dirname(fn).decode(encoding)
  87     else:
  88         path_basename = os.path.basename
  89         path_dirname = os.path.dirname
  90
  91     args = {
  92         'suffix': '.tmp',
  93         'prefix': path_basename(fn) + '.',
  94         'dir': path_dirname(fn),
  95         'delete': False,
  96     }
  97
  98     # In Python 2.x, json.dump expects a bytestream.
  99     # In Python 3.x, it writes to a character stream
 100     if sys.version_info < (3, 0):
 101         args['mode'] = 'wb'
 102     else:
 103         args.update({
 104             'mode': 'w',
 105             'encoding': 'utf-8',
 106         })
 107
 108     tf = tempfile.NamedTemporaryFile(**args)
 109
 110     try:
 111         with tf:
 112             json.dump(obj, tf)
 113         if sys.platform == 'win32':
 114             # Need to remove existing file on Windows, else os.rename raises
 115             # WindowsError or FileExistsError.
 116             try:
 117                 os.unlink(fn)
 118             except OSError:
 119                 pass
 120         os.rename(tf.name, fn)
 121     except:
 122         try:
 123             os.remove(tf.name)
 124         except OSError:
 125             pass
 126         raise
 127
 128
 129 if sys.version_info >= (2, 7):
 130     def find_xpath_attr(node, xpath, key, val):
 131         """ Find the xpath xpath[@key=val] """
 132         assert re.match(r'^[a-zA-Z-]+$', key)
 133         assert re.match(r'^[a-zA-Z0-9@\s:._-]*$', val)
 134         expr = xpath + "[@%s='%s']" % (key, val)
 135         return node.find(expr)
 136 else:
 137     def find_xpath_attr(node, xpath, key, val):
 138         # Here comes the crazy part: In 2.6, if the xpath is a unicode,
 139         # .//node does not match if a node is a direct child of . !
 140         if isinstance(xpath, unicode):
 141             xpath = xpath.encode('ascii')
 142
 143         for f in node.findall(xpath):
 144             if f.attrib.get(key) == val:
 145                 return f
 146         return None
 147
 148 # On python2.6 the xml.etree.ElementTree.Element methods don't support
 149 # the namespace parameter
 150
 151
 152 def xpath_with_ns(path, ns_map):
 153     components = [c.split(':') for c in path.split('/')]
 154     replaced = []
 155     for c in components:
 156         if len(c) == 1:
 157             replaced.append(c[0])
 158         else:
 159             ns, tag = c
 160             replaced.append('{%s}%s' % (ns_map[ns], tag))
 161     return '/'.join(replaced)
 162
 163
 164 def xpath_text(node, xpath, name=None, fatal=False):
 165     if sys.version_info < (2, 7):  # Crazy 2.6
 166         xpath = xpath.encode('ascii')
 167
 168     n = node.find(xpath)
 169     if n is None or n.text is None:
 170         if fatal:
 171             name = xpath if name is None else name
 172             raise ExtractorError('Could not find XML element %s' % name)
 173         else:
 174             return None
 175     return n.text
 176
 177
 178 def get_element_by_id(id, html):
 179     """Return the content of the tag with the specified ID in the passed HTML document"""
 180     return get_element_by_attribute("id", id, html)
 181
 182
 183 def get_element_by_attribute(attribute, value, html):
 184     """Return the content of the tag with the specified attribute in the passed HTML document"""
 185
 186     m = re.search(r'''(?xs)
 187         <([a-zA-Z0-9:._-]+)
 188          (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
 189          \s+%s=['"]?%s['"]?
 190          (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
 191         \s*>
 192         (?P<content>.*?)
 193         </\1>
 194     ''' % (re.escape(attribute), re.escape(value)), html)
 195
 196     if not m:
 197         return None
 198     res = m.group('content')
 199
 200     if res.startswith('"') or res.startswith("'"):
 201         res = res[1:-1]
 202
 203     return unescapeHTML(res)
 204
 205
 206 def clean_html(html):
 207     """Clean an HTML snippet into a readable string"""
 208     # Newline vs <br />
 209     html = html.replace('\n', ' ')
 210     html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
 211     html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
 212     # Strip html tags
 213     html = re.sub('<.*?>', '', html)
 214     # Replace html entities
 215     html = unescapeHTML(html)
 216     return html.strip()
 217
 218
 219 def sanitize_open(filename, open_mode):
 220     """Try to open the given filename, and slightly tweak it if this fails.
 221
 222     Attempts to open the given filename. If this fails, it tries to change
 223     the filename slightly, step by step, until it's either able to open it
 224     or it fails and raises a final exception, like the standard open()
 225     function.
 226
 227     It returns the tuple (stream, definitive_file_name).
 228     """
 229     try:
 230         if filename == '-':
 231             if sys.platform == 'win32':
 232                 import msvcrt
 233                 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
 234             return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
 235         stream = open(encodeFilename(filename), open_mode)
 236         return (stream, filename)
 237     except (IOError, OSError) as err:
 238         if err.errno in (errno.EACCES,):
 239             raise
 240
 241         # In case of error, try to remove win32 forbidden chars
 242         alt_filename = os.path.join(
 243             re.sub('[/<>:"\\|\\\\?\\*]', '#', path_part)
 244             for path_part in os.path.split(filename)
 245         )
 246         if alt_filename == filename:
 247             raise
 248         else:
 249             # An exception here should be caught in the caller
 250             stream = open(encodeFilename(filename), open_mode)
 251             return (stream, alt_filename)
 252
 253
 254 def timeconvert(timestr):
 255     """Convert RFC 2822 defined time string into system timestamp"""
 256     timestamp = None
 257     timetuple = email.utils.parsedate_tz(timestr)
 258     if timetuple is not None:
 259         timestamp = email.utils.mktime_tz(timetuple)
 260     return timestamp
 261
 262
 263 def sanitize_filename(s, restricted=False, is_id=False):
 264     """Sanitizes a string so it could be used as part of a filename.
 265     If restricted is set, use a stricter subset of allowed characters.
 266     Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
 267     """
 268     def replace_insane(char):
 269         if char == '?' or ord(char) < 32 or ord(char) == 127:
 270             return ''
 271         elif char == '"':
 272             return '' if restricted else '\''
 273         elif char == ':':
 274             return '_-' if restricted else ' -'
 275         elif char in '\\/|*<>':
 276             return '_'
 277         if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
 278             return '_'
 279         if restricted and ord(char) > 127:
 280             return '_'
 281         return char
 282
 283     result = ''.join(map(replace_insane, s))
 284     if not is_id:
 285         while '__' in result:
 286             result = result.replace('__', '_')
 287         result = result.strip('_')
 288         # Common case of "Foreign band name - English song title"
 289         if restricted and result.startswith('-_'):
 290             result = result[2:]
 291         if not result:
 292             result = '_'
 293     return result
 294
 295
 296 def orderedSet(iterable):
 297     """ Remove all duplicates from the input iterable """
 298     res = []
 299     for el in iterable:
 300         if el not in res:
 301             res.append(el)
 302     return res
 303
 304
 305 def _htmlentity_transform(entity):
 306     """Transforms an HTML entity to a character."""
 307     # Known non-numeric HTML entity
 308     if entity in compat_html_entities.name2codepoint:
 309         return compat_chr(compat_html_entities.name2codepoint[entity])
 310
 311     mobj = re.match(r'#(x?[0-9]+)', entity)
 312     if mobj is not None:
 313         numstr = mobj.group(1)
 314         if numstr.startswith('x'):
 315             base = 16
 316             numstr = '0%s' % numstr
 317         else:
 318             base = 10
 319         return compat_chr(int(numstr, base))
 320
 321     # Unknown entity in name, return its literal representation
 322     return ('&%s;' % entity)
 323
 324
 325 def unescapeHTML(s):
 326     if s is None:
 327         return None
 328     assert type(s) == compat_str
 329
 330     return re.sub(
 331         r'&([^;]+);', lambda m: _htmlentity_transform(m.group(1)), s)
 332
 333
 334 def encodeFilename(s, for_subprocess=False):
 335     """
 336     @param s The name of the file
 337     """
 338
 339     assert type(s) == compat_str
 340
 341     # Python 3 has a Unicode API
 342     if sys.version_info >= (3, 0):
 343         return s
 344
 345     if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
 346         # Pass '' directly to use Unicode APIs on Windows 2000 and up
 347         # (Detecting Windows NT 4 is tricky because 'major >= 4' would
 348         # match Windows 9x series as well. Besides, NT 4 is obsolete.)
 349         if not for_subprocess:
 350             return s
 351         else:
 352             # For subprocess calls, encode with locale encoding
 353             # Refer to http://stackoverflow.com/a/9951851/35070
 354             encoding = preferredencoding()
 355     else:
 356         encoding = sys.getfilesystemencoding()
 357     if encoding is None:
 358         encoding = 'utf-8'
 359     return s.encode(encoding, 'ignore')
 360
 361
 362 def encodeArgument(s):
 363     if not isinstance(s, compat_str):
 364         # Legacy code that uses byte strings
 365         # Uncomment the following line after fixing all post processors
 366         # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
 367         s = s.decode('ascii')
 368     return encodeFilename(s, True)
 369
 370
 371 def decodeOption(optval):
 372     if optval is None:
 373         return optval
 374     if isinstance(optval, bytes):
 375         optval = optval.decode(preferredencoding())
 376
 377     assert isinstance(optval, compat_str)
 378     return optval
 379
 380
 381 def formatSeconds(secs):
 382     if secs > 3600:
 383         return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
 384     elif secs > 60:
 385         return '%d:%02d' % (secs // 60, secs % 60)
 386     else:
 387         return '%d' % secs
 388
 389
 390 def make_HTTPS_handler(opts_no_check_certificate, **kwargs):
 391     if hasattr(ssl, 'create_default_context'):  # Python >= 3.4 or 2.7.9
 392         context = ssl.create_default_context(ssl.Purpose.CLIENT_AUTH)
 393         if opts_no_check_certificate:
 394             context.verify_mode = ssl.CERT_NONE
 395         try:
 396             return compat_urllib_request.HTTPSHandler(context=context, **kwargs)
 397         except TypeError:
 398             # Python 2.7.8
 399             # (create_default_context present but HTTPSHandler has no context=)
 400             pass
 401
 402     if sys.version_info < (3, 2):
 403         import httplib
 404
 405         class HTTPSConnectionV3(httplib.HTTPSConnection):
 406             def __init__(self, *args, **kwargs):
 407                 httplib.HTTPSConnection.__init__(self, *args, **kwargs)
 408
 409             def connect(self):
 410                 sock = socket.create_connection((self.host, self.port), self.timeout)
 411                 if getattr(self, '_tunnel_host', False):
 412                     self.sock = sock
 413                     self._tunnel()
 414                 try:
 415                     self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_TLSv1)
 416                 except ssl.SSLError:
 417                     self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_SSLv23)
 418
 419         class HTTPSHandlerV3(compat_urllib_request.HTTPSHandler):
 420             def https_open(self, req):
 421                 return self.do_open(HTTPSConnectionV3, req)
 422         return HTTPSHandlerV3(**kwargs)
 423     else:  # Python < 3.4
 424         context = ssl.SSLContext(ssl.PROTOCOL_SSLv23)
 425         context.verify_mode = (ssl.CERT_NONE
 426                                if opts_no_check_certificate
 427                                else ssl.CERT_REQUIRED)
 428         context.set_default_verify_paths()
 429         return compat_urllib_request.HTTPSHandler(context=context, **kwargs)
 430
 431
 432 class ExtractorError(Exception):
 433     """Error during info extraction."""
 434
 435     def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
 436         """ tb, if given, is the original traceback (so that it can be printed out).
 437         If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
 438         """
 439
 440         if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
 441             expected = True
 442         if video_id is not None:
 443             msg = video_id + ': ' + msg
 444         if cause:
 445             msg += ' (caused by %r)' % cause
 446         if not expected:
 447             if ytdl_is_updateable():
 448                 update_cmd = 'type  youtube-dl -U  to update'
 449             else:
 450                 update_cmd = 'see  https://yt-dl.org/update  on how to update'
 451             msg += '; please report this issue on https://yt-dl.org/bug .'
 452             msg += ' Make sure you are using the latest version; %s.' % update_cmd
 453             msg += ' Be sure to call youtube-dl with the --verbose flag and include its complete output.'
 454         super(ExtractorError, self).__init__(msg)
 455
 456         self.traceback = tb
 457         self.exc_info = sys.exc_info()  # preserve original exception
 458         self.cause = cause
 459         self.video_id = video_id
 460
 461     def format_traceback(self):
 462         if self.traceback is None:
 463             return None
 464         return ''.join(traceback.format_tb(self.traceback))
 465
 466
 467 class UnsupportedError(ExtractorError):
 468     def __init__(self, url):
 469         super(UnsupportedError, self).__init__(
 470             'Unsupported URL: %s' % url, expected=True)
 471         self.url = url
 472
 473
 474 class RegexNotFoundError(ExtractorError):
 475     """Error when a regex didn't match"""
 476     pass
 477
 478
 479 class DownloadError(Exception):
 480     """Download Error exception.
 481
 482     This exception may be thrown by FileDownloader objects if they are not
 483     configured to continue on errors. They will contain the appropriate
 484     error message.
 485     """
 486
 487     def __init__(self, msg, exc_info=None):
 488         """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
 489         super(DownloadError, self).__init__(msg)
 490         self.exc_info = exc_info
 491
 492
 493 class SameFileError(Exception):
 494     """Same File exception.
 495
 496     This exception will be thrown by FileDownloader objects if they detect
 497     multiple files would have to be downloaded to the same file on disk.
 498     """
 499     pass
 500
 501
 502 class PostProcessingError(Exception):
 503     """Post Processing exception.
 504
 505     This exception may be raised by PostProcessor's .run() method to
 506     indicate an error in the postprocessing task.
 507     """
 508
 509     def __init__(self, msg):
 510         self.msg = msg
 511
 512
 513 class MaxDownloadsReached(Exception):
 514     """ --max-downloads limit has been reached. """
 515     pass
 516
 517
 518 class UnavailableVideoError(Exception):
 519     """Unavailable Format exception.
 520
 521     This exception will be thrown when a video is requested
 522     in a format that is not available for that video.
 523     """
 524     pass
 525
 526
 527 class ContentTooShortError(Exception):
 528     """Content Too Short exception.
 529
 530     This exception may be raised by FileDownloader objects when a file they
 531     download is too small for what the server announced first, indicating
 532     the connection was probably interrupted.
 533     """
 534     # Both in bytes
 535     downloaded = None
 536     expected = None
 537
 538     def __init__(self, downloaded, expected):
 539         self.downloaded = downloaded
 540         self.expected = expected
 541
 542
 543 class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
 544     """Handler for HTTP requests and responses.
 545
 546     This class, when installed with an OpenerDirector, automatically adds
 547     the standard headers to every HTTP request and handles gzipped and
 548     deflated responses from web servers. If compression is to be avoided in
 549     a particular request, the original request in the program code only has
 550     to include the HTTP header "Youtubedl-No-Compression", which will be
 551     removed before making the real request.
 552
 553     Part of this code was copied from:
 554
 555     http://techknack.net/python-urllib2-handlers/
 556
 557     Andrew Rowls, the author of that code, agreed to release it to the
 558     public domain.
 559     """
 560
 561     @staticmethod
 562     def deflate(data):
 563         try:
 564             return zlib.decompress(data, -zlib.MAX_WBITS)
 565         except zlib.error:
 566             return zlib.decompress(data)
 567
 568     @staticmethod
 569     def addinfourl_wrapper(stream, headers, url, code):
 570         if hasattr(compat_urllib_request.addinfourl, 'getcode'):
 571             return compat_urllib_request.addinfourl(stream, headers, url, code)
 572         ret = compat_urllib_request.addinfourl(stream, headers, url)
 573         ret.code = code
 574         return ret
 575
 576     def http_request(self, req):
 577         for h, v in std_headers.items():
 578             if h not in req.headers:
 579                 req.add_header(h, v)
 580         if 'Youtubedl-no-compression' in req.headers:
 581             if 'Accept-encoding' in req.headers:
 582                 del req.headers['Accept-encoding']
 583             del req.headers['Youtubedl-no-compression']
 584         if 'Youtubedl-user-agent' in req.headers:
 585             if 'User-agent' in req.headers:
 586                 del req.headers['User-agent']
 587             req.headers['User-agent'] = req.headers['Youtubedl-user-agent']
 588             del req.headers['Youtubedl-user-agent']
 589
 590         if sys.version_info < (2, 7) and '#' in req.get_full_url():
 591             # Python 2.6 is brain-dead when it comes to fragments
 592             req._Request__original = req._Request__original.partition('#')[0]
 593             req._Request__r_type = req._Request__r_type.partition('#')[0]
 594
 595         return req
 596
 597     def http_response(self, req, resp):
 598         old_resp = resp
 599         # gzip
 600         if resp.headers.get('Content-encoding', '') == 'gzip':
 601             content = resp.read()
 602             gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
 603             try:
 604                 uncompressed = io.BytesIO(gz.read())
 605             except IOError as original_ioerror:
 606                 # There may be junk add the end of the file
 607                 # See http://stackoverflow.com/q/4928560/35070 for details
 608                 for i in range(1, 1024):
 609                     try:
 610                         gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
 611                         uncompressed = io.BytesIO(gz.read())
 612                     except IOError:
 613                         continue
 614                     break
 615                 else:
 616                     raise original_ioerror
 617             resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
 618             resp.msg = old_resp.msg
 619         # deflate
 620         if resp.headers.get('Content-encoding', '') == 'deflate':
 621             gz = io.BytesIO(self.deflate(resp.read()))
 622             resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
 623             resp.msg = old_resp.msg
 624         return resp
 625
 626     https_request = http_request
 627     https_response = http_response
 628
 629
 630 def parse_iso8601(date_str, delimiter='T'):
 631     """ Return a UNIX timestamp from the given date """
 632
 633     if date_str is None:
 634         return None
 635
 636     m = re.search(
 637         r'(\.[0-9]+)?(?:Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
 638         date_str)
 639     if not m:
 640         timezone = datetime.timedelta()
 641     else:
 642         date_str = date_str[:-len(m.group(0))]
 643         if not m.group('sign'):
 644             timezone = datetime.timedelta()
 645         else:
 646             sign = 1 if m.group('sign') == '+' else -1
 647             timezone = datetime.timedelta(
 648                 hours=sign * int(m.group('hours')),
 649                 minutes=sign * int(m.group('minutes')))
 650     date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
 651     dt = datetime.datetime.strptime(date_str, date_format) - timezone
 652     return calendar.timegm(dt.timetuple())
 653
 654
 655 def unified_strdate(date_str, day_first=True):
 656     """Return a string with the date in the format YYYYMMDD"""
 657
 658     if date_str is None:
 659         return None
 660     upload_date = None
 661     # Replace commas
 662     date_str = date_str.replace(',', ' ')
 663     # %z (UTC offset) is only supported in python>=3.2
 664     date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str)
 665     # Remove AM/PM + timezone
 666     date_str = re.sub(r'(?i)\s*(?:AM|PM)\s+[A-Z]+', '', date_str)
 667
 668     format_expressions = [
 669         '%d %B %Y',
 670         '%d %b %Y',
 671         '%B %d %Y',
 672         '%b %d %Y',
 673         '%b %dst %Y %I:%M%p',
 674         '%b %dnd %Y %I:%M%p',
 675         '%b %dth %Y %I:%M%p',
 676         '%Y-%m-%d',
 677         '%Y/%m/%d',
 678         '%d.%m.%Y',
 679         '%d/%m/%Y',
 680         '%d/%m/%y',
 681         '%Y/%m/%d %H:%M:%S',
 682         '%Y-%m-%d %H:%M:%S',
 683         '%Y-%m-%d %H:%M:%S.%f',
 684         '%d.%m.%Y %H:%M',
 685         '%d.%m.%Y %H.%M',
 686         '%Y-%m-%dT%H:%M:%SZ',
 687         '%Y-%m-%dT%H:%M:%S.%fZ',
 688         '%Y-%m-%dT%H:%M:%S.%f0Z',
 689         '%Y-%m-%dT%H:%M:%S',
 690         '%Y-%m-%dT%H:%M:%S.%f',
 691         '%Y-%m-%dT%H:%M',
 692     ]
 693     if day_first:
 694         format_expressions.extend([
 695             '%d/%m/%Y %H:%M:%S',
 696         ])
 697     else:
 698         format_expressions.extend([
 699             '%m/%d/%Y %H:%M:%S',
 700         ])
 701     for expression in format_expressions:
 702         try:
 703             upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
 704         except ValueError:
 705             pass
 706     if upload_date is None:
 707         timetuple = email.utils.parsedate_tz(date_str)
 708         if timetuple:
 709             upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
 710     return upload_date
 711
 712
 713 def determine_ext(url, default_ext='unknown_video'):
 714     if url is None:
 715         return default_ext
 716     guess = url.partition('?')[0].rpartition('.')[2]
 717     if re.match(r'^[A-Za-z0-9]+$', guess):
 718         return guess
 719     else:
 720         return default_ext
 721
 722
 723 def subtitles_filename(filename, sub_lang, sub_format):
 724     return filename.rsplit('.', 1)[0] + '.' + sub_lang + '.' + sub_format
 725
 726
 727 def date_from_str(date_str):
 728     """
 729     Return a datetime object from a string in the format YYYYMMDD or
 730     (now|today)[+-][0-9](day|week|month|year)(s)?"""
 731     today = datetime.date.today()
 732     if date_str in ('now', 'today'):
 733         return today
 734     if date_str == 'yesterday':
 735         return today - datetime.timedelta(days=1)
 736     match = re.match('(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
 737     if match is not None:
 738         sign = match.group('sign')
 739         time = int(match.group('time'))
 740         if sign == '-':
 741             time = -time
 742         unit = match.group('unit')
 743         # A bad aproximation?
 744         if unit == 'month':
 745             unit = 'day'
 746             time *= 30
 747         elif unit == 'year':
 748             unit = 'day'
 749             time *= 365
 750         unit += 's'
 751         delta = datetime.timedelta(**{unit: time})
 752         return today + delta
 753     return datetime.datetime.strptime(date_str, "%Y%m%d").date()
 754
 755
 756 def hyphenate_date(date_str):
 757     """
 758     Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
 759     match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
 760     if match is not None:
 761         return '-'.join(match.groups())
 762     else:
 763         return date_str
 764
 765
 766 class DateRange(object):
 767     """Represents a time interval between two dates"""
 768
 769     def __init__(self, start=None, end=None):
 770         """start and end must be strings in the format accepted by date"""
 771         if start is not None:
 772             self.start = date_from_str(start)
 773         else:
 774             self.start = datetime.datetime.min.date()
 775         if end is not None:
 776             self.end = date_from_str(end)
 777         else:
 778             self.end = datetime.datetime.max.date()
 779         if self.start > self.end:
 780             raise ValueError('Date range: "%s" , the start date must be before the end date' % self)
 781
 782     @classmethod
 783     def day(cls, day):
 784         """Returns a range that only contains the given day"""
 785         return cls(day, day)
 786
 787     def __contains__(self, date):
 788         """Check if the date is in the range"""
 789         if not isinstance(date, datetime.date):
 790             date = date_from_str(date)
 791         return self.start <= date <= self.end
 792
 793     def __str__(self):
 794         return '%s - %s' % (self.start.isoformat(), self.end.isoformat())
 795
 796
 797 def platform_name():
 798     """ Returns the platform name as a compat_str """
 799     res = platform.platform()
 800     if isinstance(res, bytes):
 801         res = res.decode(preferredencoding())
 802
 803     assert isinstance(res, compat_str)
 804     return res
 805
 806
def _windows_write_string(s, out):
    """ Returns True if the string was written using special methods,
    False if it has yet to be written out.

    The special method is the WriteConsoleW function of kernel32, which
    is only used when out is backed by file descriptor 1 or 2 and the
    corresponding standard handle refers to a console. Raises OSError
    when WriteConsoleW reports a failure."""
    # Adapted from http://stackoverflow.com/a/3259271/35070

    import ctypes
    import ctypes.wintypes

    # Maps file descriptors to the identifiers GetStdHandle is called with
    # (-11 and -12 are the Win32 STD_OUTPUT_HANDLE and STD_ERROR_HANDLE)
    WIN_OUTPUT_IDS = {
        1: -11,
        2: -12,
    }

    try:
        fileno = out.fileno()
    except AttributeError:
        # If the output stream doesn't have a fileno, it's virtual
        return False
    if fileno not in WIN_OUTPUT_IDS:
        return False

    # Build ctypes prototypes for the kernel32 functions used below
    GetStdHandle = ctypes.WINFUNCTYPE(
        ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
        (b"GetStdHandle", ctypes.windll.kernel32))
    h = GetStdHandle(WIN_OUTPUT_IDS[fileno])

    WriteConsoleW = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
        ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
        ctypes.wintypes.LPVOID)((b"WriteConsoleW", ctypes.windll.kernel32))
    # Receives the count reported back by each WriteConsoleW call
    written = ctypes.wintypes.DWORD(0)

    GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)((b"GetFileType", ctypes.windll.kernel32))
    FILE_TYPE_CHAR = 0x0002
    FILE_TYPE_REMOTE = 0x8000
    GetConsoleMode = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
        ctypes.POINTER(ctypes.wintypes.DWORD))(
        (b"GetConsoleMode", ctypes.windll.kernel32))
    INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value

    # True when the handle is invalid, is not a character device (the
    # FILE_TYPE_REMOTE bit is masked out first) or GetConsoleMode fails
    def not_a_console(handle):
        if handle == INVALID_HANDLE_VALUE or handle is None:
            return True
        return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR
                or GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)

    if not_a_console(h):
        return False

    # Index of the first character above U+FFFF, or len(s) if there is none
    def next_nonbmp_pos(s):
        try:
            return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
        except StopIteration:
            return len(s)

    while s:
        # Write at most 1024 characters per call and stop in front of the
        # next non-BMP character
        count = min(next_nonbmp_pos(s), 1024)

        # count == 0 means s starts with a non-BMP character; a length of
        # 2 is passed for it instead
        ret = WriteConsoleW(
            h, s, count if count else 2, ctypes.byref(written), None)
        if ret == 0:
            raise OSError('Failed to write string')
        if not count:  # We just wrote a non-BMP character
            assert written.value == 2
            s = s[1:]
        else:
            assert written.value > 0
            # Continue after whatever was actually written
            s = s[written.value:]
    return True
 877
 878
 879 def write_string(s, out=None, encoding=None):
 880     if out is None:
 881         out = sys.stderr
 882     assert type(s) == compat_str
 883
 884     if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
 885         if _windows_write_string(s, out):
 886             return
 887
 888     if ('b' in getattr(out, 'mode', '') or
 889             sys.version_info[0] < 3):  # Python 2 lies about mode of sys.stderr
 890         byt = s.encode(encoding or preferredencoding(), 'ignore')
 891         out.write(byt)
 892     elif hasattr(out, 'buffer'):
 893         enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
 894         byt = s.encode(enc, 'ignore')
 895         out.buffer.write(byt)
 896     else:
 897         out.write(s)
 898     out.flush()
 899
 900
 901 def bytes_to_intlist(bs):
 902     if not bs:
 903         return []
 904     if isinstance(bs[0], int):  # Python 3
 905         return list(bs)
 906     else:
 907         return [ord(c) for c in bs]
 908
 909
 910 def intlist_to_bytes(xs):
 911     if not xs:
 912         return b''
 913     return struct_pack('%dB' % len(xs), *xs)
 914
 915
# Cross-platform file locking
# Both branches define the same pair of helpers, _lock_file(f, exclusive)
# and _unlock_file(f): the win32 branch calls LockFileEx/UnlockFileEx from
# kernel32 through ctypes, every other platform uses fcntl.flock.
if sys.platform == 'win32':
    import ctypes.wintypes
    import msvcrt

    class OVERLAPPED(ctypes.Structure):
        """ctypes layout of the OVERLAPPED argument of (Un)LockFileEx."""
        _fields_ = [
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),
        ]

    kernel32 = ctypes.windll.kernel32
    # Declare argument and return types explicitly so that ctypes converts
    # the handle and the structure pointer correctly.
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    # Low and high words of the byte count passed to (Un)LockFileEx; the
    # same values are used for locking and unlocking.
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive):
        """Lock the open file f; raise OSError if LockFileEx fails.

        The lock starts at offset 0.  The OVERLAPPED pointer is stored on f
        so that _unlock_file can pass the very same structure back.
        """
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)
        handle = msvcrt.get_osfhandle(f.fileno())
        # dwFlags 0x2 (LOCKFILE_EXCLUSIVE_LOCK in the Win32 headers) is
        # passed for an exclusive lock, 0x0 for a shared one.
        if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
                          whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Locking file failed: %r' % ctypes.FormatError())

    def _unlock_file(f):
        """Release the lock taken by _lock_file; raise OSError on failure."""
        # _lock_file must have run on this file object first.
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        if not UnlockFileEx(handle, 0,
                            whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())

else:
    import fcntl

    def _lock_file(f, exclusive):
        """Take an exclusive (LOCK_EX) or shared (LOCK_SH) flock on f."""
        fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)

    def _unlock_file(f):
        """Release the flock held on f."""
        fcntl.flock(f, fcntl.LOCK_UN)
 979
 980
 981 class locked_file(object):
 982     def __init__(self, filename, mode, encoding=None):
 983         assert mode in ['r', 'a', 'w']
 984         self.f = io.open(filename, mode, encoding=encoding)
 985         self.mode = mode
 986
 987     def __enter__(self):
 988         exclusive = self.mode != 'r'
 989         try:
 990             _lock_file(self.f, exclusive)
 991         except IOError:
 992             self.f.close()
 993             raise
 994         return self
 995
 996     def __exit__(self, etype, value, traceback):
 997         try:
 998             _unlock_file(self.f)
 999         finally:
1000             self.f.close()
1001
1002     def __iter__(self):
1003         return iter(self.f)
1004
1005     def write(self, *args):
1006         return self.f.write(*args)
1007
1008     def read(self, *args):
1009         return self.f.read(*args)
1010
1011
def get_filesystem_encoding():
    """Return sys.getfilesystemencoding(), or 'utf-8' when that is None."""
    fs_encoding = sys.getfilesystemencoding()
    if fs_encoding is None:
        return 'utf-8'
    return fs_encoding
1015
1016
def shell_quote(args):
    """Return args joined into a single shell-escaped command line string.

    Byte-string arguments (for instance file names produced by
    encodeFilename) are decoded with the file system encoding first.
    Quoting goes through shlex_quote, the helper this module already imports
    from .compat and uses in args_to_str, rather than the undocumented
    pipes.quote (the pipes module is deprecated and removed in Python 3.13).
    """
    def as_text(arg):
        if isinstance(arg, bytes):
            # We may get a filename encoded with 'encodeFilename'
            return arg.decode(get_filesystem_encoding())
        return arg

    return ' '.join(shlex_quote(as_text(a)) for a in args)
1026
1027
def takewhile_inclusive(pred, seq):
    """ Like itertools.takewhile, but also yield the element that ends the
        run (the first element e for which pred(e) is false) """
    for item in seq:
        yield item
        if pred(item):
            continue
        break
1035
1036
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use.

    data is serialized to JSON, URL-encoded under the key
    __youtubedl_smuggle and appended to url after a '#'.
    """
    payload = {'__youtubedl_smuggle': json.dumps(data)}
    return url + '#' + compat_urllib_parse.urlencode(payload)
1043
1044
def unsmuggle_url(smug_url, default=None):
    """ Split a URL produced by smuggle_url into (url, data).

    URLs without smuggled data are returned unchanged, paired with default.
    """
    if '#__youtubedl_smuggle' in smug_url:
        url, _, sdata = smug_url.rpartition('#')
        payload = compat_parse_qs(sdata)['__youtubedl_smuggle'][0]
        return url, json.loads(payload)
    return smug_url, default
1052
1053
def format_bytes(bytes):
    """Format a byte count as a human-readable string such as '1.50KiB'.

    bytes may be a number, a numeric string or None (rendered as 'N/A').
    The parameter name shadows the builtin; it is kept for compatibility
    with existing callers.
    """
    if bytes is None:
        return 'N/A'
    if isinstance(bytes, str):
        bytes = float(bytes)
    suffixes = ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB']
    if bytes == 0.0:
        exponent = 0
    else:
        exponent = int(math.log(bytes, 1024.0))
        # Clamp to the available suffixes: values below one byte would
        # otherwise index the list from the end, and values of 1024 ** 9 or
        # more would raise IndexError.
        exponent = max(0, min(exponent, len(suffixes) - 1))
    converted = float(bytes) / float(1024 ** exponent)
    return '%.2f%s' % (converted, suffixes[exponent])
1066
1067
def parse_filesize(s):
    """Parse a size such as '1.5 MiB' or '3,2GB' into a number of bytes.

    Returns None when s is None or does not start with a number followed
    by a known unit.  'XiB' units are powers of 1024 and 'XB' units powers
    of 1000.
    """
    if s is None:
        return None

    # The lower-case forms are of course incorrect and inofficial,
    # but we support those too
    _UNIT_TABLE = {'B': 1, 'b': 1}
    for power, letter in enumerate('KMGTPEZY', 1):
        binary, decimal = 1024 ** power, 1000 ** power
        _UNIT_TABLE[letter + 'iB'] = binary
        _UNIT_TABLE[letter + 'B'] = decimal
        _UNIT_TABLE[letter.lower() + 'B'] = binary
        _UNIT_TABLE[letter + 'b'] = decimal

    units_re = '|'.join(re.escape(unit) for unit in _UNIT_TABLE)
    m = re.match(
        r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)' % units_re, s)
    if m is None:
        return None

    # Accept a decimal comma as well as a decimal point.
    amount = float(m.group('num').replace(',', '.'))
    return int(amount * _UNIT_TABLE[m.group('unit')])
1120
1121
def get_term_width():
    """Return the terminal width in columns, or None if it is unknown.

    The COLUMNS environment variable takes precedence; otherwise the
    output of `stty size` (rows, then columns) is consulted.
    """
    columns = compat_getenv('COLUMNS', None)
    if columns:
        try:
            return int(columns)
        except ValueError:
            pass  # Garbage in COLUMNS: fall back to asking stty

    try:
        sp = subprocess.Popen(
            ['stty', 'size'],
            stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        out, _ = sp.communicate()
        return int(out.split()[1])
    except Exception:
        # Best effort only: stty may be missing or not attached to a
        # terminal.  KeyboardInterrupt and SystemExit still propagate.
        return None
1136
1137
def month_by_name(name):
    """ Return the 1-based number of the month with the given English name
    (independent of the locale), or None for an unknown name """

    ENGLISH_NAMES = [
        'January', 'February', 'March', 'April', 'May', 'June',
        'July', 'August', 'September', 'October', 'November', 'December']
    if name in ENGLISH_NAMES:
        return ENGLISH_NAMES.index(name) + 1
    return None
1148
1149
def fix_xml_ampersands(xml_str):
    """Replace every '&' that does not start an entity with '&amp;'

    The five predefined XML entities and numeric character references of
    up to four digits are left untouched.
    """
    bare_ampersand = r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)'
    return re.sub(bare_ampersand, '&amp;', xml_str)
1156
1157
def setproctitle(title):
    """Set the process name via prctl, where glibc is available.

    Silently does nothing when libc.so.6 cannot be loaded or does not
    expose prctl.
    """
    assert isinstance(title, compat_str)
    try:
        libc = ctypes.cdll.LoadLibrary("libc.so.6")
    except OSError:
        return
    # Initialising the buffer from the bytes allocates len + 1 bytes, so the
    # string handed to prctl is NUL-terminated.  Sizing the buffer to exactly
    # len(title_bytes) left no room for the terminator.
    buf = ctypes.create_string_buffer(title.encode('utf-8'))
    try:
        libc.prctl(15, buf, 0, 0, 0)  # 15 is PR_SET_NAME
    except AttributeError:
        return  # Strange libc, just skip this
1171
1172
def remove_start(s, start):
    """Return s without the prefix start; s is unchanged if it lacks it."""
    return s[len(start):] if s.startswith(start) else s
1177
1178
def remove_end(s, end):
    """Return s without the suffix end; s is unchanged if it lacks it."""
    # An empty suffix must be a no-op: s[:-0] is s[:0], i.e. the empty
    # string, so the slice may only be taken for a non-empty end.
    if end and s.endswith(end):
        return s[:-len(end)]
    return s
1183
1184
def url_basename(url):
    """Return the last segment of the path of url ('' for an empty path)."""
    parsed = compat_urlparse.urlparse(url)
    trimmed = parsed.path.strip('/')
    return trimmed.rpartition('/')[2]
1188
1189
class HEADRequest(compat_urllib_request.Request):
    """Request whose HTTP method is always HEAD."""

    def get_method(self):
        """Return the HTTP method name used for this request."""
        return 'HEAD'
1193
1194
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    """Convert v to an int, returning default for None or ''.

    If get_attr is given, the attribute of that name is read from v first
    (a missing attribute counts as None).  The integer is multiplied by
    invscale and then floor-divided by scale.
    """
    if get_attr and v is not None:
        v = getattr(v, get_attr, None)
    if v is None or v == '':
        return default
    return int(v) * invscale // scale
1202
1203
def str_or_none(v, default=None):
    """Return v converted to a text string, or default when v is None."""
    if v is None:
        return default
    return compat_str(v)
1206
1207
def str_to_int(int_str):
    """ A more relaxed version of int_or_none: commas, dots and plus signs
    are dropped before the conversion; None is passed through """
    if int_str is None:
        return None
    digits = re.sub(r'[,\.\+]', '', int_str)
    return int(digits)
1214
1215
def float_or_none(v, scale=1, invscale=1, default=None):
    """Return float(v) * invscale / scale, or default when v is None."""
    if v is None:
        return default
    return float(v) * invscale / scale
1218
1219
def parse_duration(s):
    """Parse a duration such as '1:02:03.5', '3h 2m 1s' or '5 mins'.

    Returns the number of seconds, or None when s is None or is not
    recognized.  The result is a float when the input has fractional
    seconds or is given purely in minutes or hours, an int otherwise.
    """
    if s is None:
        return None

    m = re.match(
        r'''(?ix)T?
        (?:
            (?P<only_mins>[0-9.]+)\s*(?:mins?|minutes?)\s*|
            (?P<only_hours>[0-9.]+)\s*(?:hours?)|

            (?:
                (?:(?P<hours>[0-9]+)\s*(?:[:h]|hours?)\s*)?
                (?P<mins>[0-9]+)\s*(?:[:m]|mins?|minutes?)\s*
            )?
            (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*(?:s|secs?|seconds?)?
        )$''', s.strip())
    if m is None:
        return None

    # A lone "N minutes" / "N hours" value may itself be fractional.
    only_mins = m.group('only_mins')
    if only_mins:
        return float_or_none(only_mins, invscale=60)
    only_hours = m.group('only_hours')
    if only_hours:
        return float_or_none(only_hours, invscale=60 * 60)

    total = 0
    for group, factor in (('secs', 1), ('mins', 60), ('hours', 60 * 60)):
        digits = m.group(group)
        if digits:
            total += int(digits) * factor
    fraction = m.group('ms')
    if fraction:
        total += float(fraction)
    return total
1254
1255
def prepend_extension(filename, ext):
    """Insert ext before the real extension: 'a.mp4', 'temp' -> 'a.temp.mp4'."""
    parts = os.path.splitext(filename)
    return '{0}.{1}{2}'.format(parts[0], ext, parts[1])
1259
1260
def check_executable(exe, args=[]):
    """ Check whether the given binary can be started, i.e. is installed
    somewhere in PATH.  Returns exe on success and False otherwise.
    args can be a list of arguments for a short output (like -version) """
    command = [exe] + args
    try:
        process = subprocess.Popen(
            command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        process.communicate()
    except OSError:
        return False
    return exe
1269
1270
def get_exe_version(exe, args=['--version'],
                    version_re=None, unrecognized='present'):
    """ Run exe with args and extract its version from the combined
    stdout/stderr output via detect_exe_version.
    Returns False if the executable cannot be started """
    try:
        process = subprocess.Popen(
            [exe] + args,
            stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
        out, _ = process.communicate()
    except OSError:
        return False
    # The pipe delivers bytes; detect_exe_version expects text.
    if isinstance(out, bytes):
        out = out.decode('ascii', 'ignore')
    return detect_exe_version(out, version_re, unrecognized)
1284
1285
def detect_exe_version(output, version_re=None, unrecognized='present'):
    """Return the version found in output, or unrecognized if none is found.

    version_re must capture the version in its first group; by default the
    word following 'version' is taken.
    """
    assert isinstance(output, compat_str)
    pattern = r'version\s+([-0-9._a-zA-Z]+)' if version_re is None else version_re
    found = re.search(pattern, output)
    if not found:
        return unrecognized
    return found.group(1)
1295
1296
class PagedList(object):
    """Base class for result lists that are fetched one page at a time.

    Subclasses provide getslice(start=0, end=None), which returns the items
    with index start <= i < end (all remaining items when end is None).
    """

    def __len__(self):
        # This is only useful for tests
        return len(self.getslice())


class OnDemandPagedList(PagedList):
    """Paged list whose page count is unknown.

    pagefunc(pagenum) returns the items of the zero-based page pagenum; a
    page with fewer than pagesize items is treated as the last one.
    """

    def __init__(self, pagefunc, pagesize):
        self._pagefunc = pagefunc
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        # An empty range needs no page at all.  Without this guard a range
        # such as start == end == k * pagesize returned a page of items.
        if end is not None and end <= start:
            return []

        res = []
        for pagenum in itertools.count(start // self._pagesize):
            firstid = pagenum * self._pagesize
            nextfirstid = firstid + self._pagesize

            page_results = list(self._pagefunc(pagenum))

            # Offset of the first wanted item inside this page
            startv = (
                start % self._pagesize
                if firstid <= start < nextfirstid
                else 0)

            # Offset just past the last wanted item, if the range ends here
            endv = (
                ((end - 1) % self._pagesize) + 1
                if (end is not None and firstid <= end <= nextfirstid)
                else None)

            if startv != 0 or endv is not None:
                page_results = page_results[startv:endv]
            res.extend(page_results)

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            if len(page_results) + startv < self._pagesize:
                break

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == nextfirstid:
                break
        return res


class InAdvancePagedList(PagedList):
    """Paged list whose number of pages (pagecount) is known up front.

    pagefunc(pagenum) returns the items of the zero-based page pagenum.
    """

    def __init__(self, pagefunc, pagecount, pagesize):
        self._pagefunc = pagefunc
        self._pagecount = pagecount
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        if end is not None and end <= start:
            return []

        res = []
        start_page = start // self._pagesize
        # Never ask for pages that do not exist, however large end is.
        end_page = (
            self._pagecount if end is None
            else min(self._pagecount, end // self._pagesize + 1))
        skip_elems = start - start_page * self._pagesize
        only_more = None if end is None else end - start
        for pagenum in range(start_page, end_page):
            page = list(self._pagefunc(pagenum))
            if skip_elems:
                # Only the first fetched page has leading items to drop
                page = page[skip_elems:]
                skip_elems = None
            if only_more is not None:
                if len(page) < only_more:
                    only_more -= len(page)
                else:
                    res.extend(page[:only_more])
                    break
            res.extend(page)
        return res
1373
1374
def uppercase_escape(s):
    """Replace every \\UXXXXXXXX escape sequence in s by its character."""
    decode = codecs.getdecoder('unicode_escape')

    def expand(match):
        return decode(match.group(0))[0]

    return re.sub(r'\\U[0-9a-fA-F]{8}', expand, s)
1381
1382
def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986"""
    # On Python 2 the text has to be UTF-8 encoded before it is quoted.
    needs_encoding = sys.version_info < (3, 0) and isinstance(s, unicode)
    if needs_encoding:
        s = s.encode('utf-8')
    return compat_urllib_parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")
1388
1389
def escape_url(url):
    """Escape URL as suggested by RFC 3986

    Only path, params, query and fragment are escaped; the scheme and the
    network location are left as they are.
    """
    parts = compat_urllib_parse_urlparse(url)
    path = escape_rfc3986(parts.path)
    params = escape_rfc3986(parts.params)
    query = escape_rfc3986(parts.query)
    fragment = escape_rfc3986(parts.fragment)
    escaped = parts._replace(
        path=path, params=params, query=query, fragment=fragment)
    return escaped.geturl()

1399
# struct_pack / struct_unpack accept a text format spec on every supported
# Python version.
try:
    struct.pack('!I', 0)
except TypeError:
    # In Python 2.6 (and some 2.7 versions), struct requires a bytes argument
    def struct_pack(spec, *args):
        fmt = spec.encode('ascii') if isinstance(spec, compat_str) else spec
        return struct.pack(fmt, *args)

    def struct_unpack(spec, *args):
        fmt = spec.encode('ascii') if isinstance(spec, compat_str) else spec
        return struct.unpack(fmt, *args)
else:
    struct_pack, struct_unpack = struct.pack, struct.unpack
1416
1417
def read_batch_urls(batch_fd):
    """Read the URLs listed in the batch file object batch_fd and close it.

    Lines may be text or UTF-8 encoded bytes.  A leading byte order mark and
    surrounding whitespace are stripped; empty lines and lines starting with
    '#', ';' or ']' are skipped.
    """
    # A correctly decoded BOM is U+FEFF; the second form is what the three
    # BOM bytes look like when the file was decoded with a single-byte
    # codec, and is kept for backward compatibility.
    BOMS = ('\ufeff', '\xef\xbb\xbf')

    def fixup(url):
        if not isinstance(url, compat_str):
            url = url.decode('utf-8', 'replace')
        for bom in BOMS:
            if url.startswith(bom):
                url = url[len(bom):]
                break
        url = url.strip()
        if url.startswith(('#', ';', ']')):
            return False
        return url

    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]
1432
1433
def urlencode_postdata(*args, **kargs):
    """URL-encode the arguments like urlencode and return ASCII bytes."""
    encoded = compat_urllib_parse.urlencode(*args, **kargs)
    return encoded.encode('ascii')
1436
1437
# etree_iter(node) walks an element tree: Element.iter where it exists,
# otherwise a findall-based fallback.
try:
    etree_iter = xml.etree.ElementTree.Element.iter
except AttributeError:  # Python <=2.6
    def etree_iter(n):
        return n.findall('.//*')
1442
1443
def parse_xml(s):
    """Parse the XML text s and return its root element.

    DOCTYPE declarations are ignored.  On Python 2, element texts that are
    byte strings are decoded as UTF-8.
    """
    etree = xml.etree.ElementTree

    class _DoctypeIgnoringBuilder(etree.TreeBuilder):
        def doctype(self, name, pubid, system):
            pass  # Ignore doctypes

    parser = etree.XMLParser(target=_DoctypeIgnoringBuilder())
    encoded = s.encode('utf-8')
    if sys.version_info >= (2, 7):
        tree = etree.XML(encoded, parser=parser)
    else:
        tree = etree.XML(encoded)
    # Fix up XML parser in Python 2.x
    if sys.version_info < (3, 0):
        for node in etree_iter(tree):
            text = node.text
            if text is not None and not isinstance(text, compat_str):
                node.text = text.decode('utf-8')
    return tree
1459
1460
# Age limit associated with each US rating label.  parse_age_limit() falls
# back to this table when its argument is not a plain one- or two-digit
# number.
US_RATINGS = {
    'G': 0,
    'PG': 10,
    'PG-13': 13,
    'R': 16,
    'NC': 18,
}
1468
1469
def parse_age_limit(s):
    """Return the age limit described by s, or None if it is unknown.

    s is either a one- or two-digit age, optionally followed by '+'
    (e.g. '18+'), or a rating label looked up in US_RATINGS.
    """
    if s is None:
        return None
    numeric = re.match(r'^(?P<age>\d{1,2})\+?$', s)
    if numeric:
        return int(numeric.group('age'))
    return US_RATINGS.get(s, None)
1475
1476
def strip_jsonp(code):
    """Strip the 'callback( ... );' wrapper from a JSONP response.

    Trailing // comments are dropped as well; input that does not look like
    a JSONP call is returned unchanged.
    """
    jsonp_call = r'(?s)^[a-zA-Z0-9_]+\s*\(\s*(.*)\);?\s*?(?://[^\n]*)*$'
    return re.sub(jsonp_call, r'\1', code)
1480
1481
def js_to_json(code):
    """Convert a JavaScript object literal into JSON text.

    Single-quoted strings and bare identifiers are turned into double-quoted
    strings (true, false and null are kept as they are), and a comma
    directly before a closing ']' is removed.
    """
    # What each matched sequence inside a single-quoted string becomes
    # inside the double-quoted result
    requoted = {
        '\\\\': '\\\\',
        "\\'": "'",
        '"': '\\"',
    }

    def fix_kv(m):
        token = m.group(0)
        if token in ('true', 'false', 'null') or token.startswith('"'):
            return token
        if token.startswith("'"):
            token = re.sub(
                r"\\\\|\\'|\"",
                lambda esc: requoted[esc.group(0)],
                token[1:-1])
        return '"%s"' % token

    converted = re.sub(r'''(?x)
        "(?:[^"\\]*(?:\\\\|\\")?)*"|
        '(?:[^'\\]*(?:\\\\|\\')?)*'|
        [a-zA-Z_][a-zA-Z_0-9]*
        ''', fix_kv, code)
    # JSON does not allow a trailing comma at the end of an array.
    return re.sub(r',(\s*\])', lambda m: m.group(1), converted)
1505
1506
def qualities(quality_ids):
    """ Get a numeric quality value out of a list of possible values

    The returned function maps a quality id to its position in quality_ids,
    or to -1 when the id is not listed. """
    def q(qid):
        if qid in quality_ids:
            return quality_ids.index(qid)
        return -1
    return q
1515
1516
# Default output template, written with %-style named placeholders for the
# title, id and ext fields.
DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
1518
1519
def limit_length(s, length):
    """ Shorten s to at most length characters, ending it with '...' when
    it had to be cut.  None is passed through. """
    if s is None:
        return None
    if len(s) <= length:
        return s
    ELLIPSES = '...'
    kept = length - len(ELLIPSES)
    return s[:kept] + ELLIPSES
1528
1529
def version_tuple(v):
    """Split the version string v on '.' and '-' into a tuple of ints."""
    components = re.split(r'[-.]', v)
    return tuple(map(int, components))
1532
1533
def is_outdated_version(version, limit, assume_new=True):
    """Return whether version is older than limit.

    When version is empty or either version cannot be parsed, the answer is
    the opposite of assume_new.
    """
    fallback = not assume_new
    if not version:
        return fallback
    try:
        return version_tuple(version) < version_tuple(limit)
    except ValueError:
        return fallback
1541
1542
def ytdl_is_updateable():
    """ Returns if youtube-dl can be updated with -U

    That is the case when this module was loaded by a zipimporter or when
    sys has a 'frozen' attribute. """
    from zipimport import zipimporter

    if isinstance(globals().get('__loader__'), zipimporter):
        return True
    return hasattr(sys, 'frozen')
1548
1549
def args_to_str(args):
    """Return a short, shell-quoted string for a subprocess command."""
    quoted = [shlex_quote(arg) for arg in args]
    return ' '.join(quoted)
1553
1554
def urlhandle_detect_ext(url_handle):
    """Return the subtype part of the Content-Type header of url_handle.

    For 'video/mp4' this is 'mp4'.  Handles with a headers mapping and
    handles that only offer info().getheader() are both supported.
    """
    try:
        url_handle.headers
    except AttributeError:  # Python < 3
        getheader = url_handle.info().getheader
    else:
        def getheader(name):
            return url_handle.headers[name]

    content_type = getheader('Content-Type')
    return content_type.split("/")[1]