youtube_dl/utils.py

   1 #!/usr/bin/env python
   2 # -*- coding: utf-8 -*-
   3
   4 from __future__ import unicode_literals
   5
   6 import calendar
   7 import codecs
   8 import contextlib
   9 import ctypes
  10 import datetime
  11 import email.utils
  12 import errno
  13 import gzip
  14 import itertools
  15 import io
  16 import json
  17 import locale
  18 import math
  19 import os
  20 import pipes
  21 import platform
  22 import re
  23 import ssl
  24 import socket
  25 import struct
  26 import subprocess
  27 import sys
  28 import tempfile
  29 import traceback
  30 import xml.etree.ElementTree
  31 import zlib
  32
  33 from .compat import (
  34     compat_chr,
  35     compat_getenv,
  36     compat_html_entities,
  37     compat_parse_qs,
  38     compat_str,
  39     compat_urllib_error,
  40     compat_urllib_parse,
  41     compat_urllib_parse_urlparse,
  42     compat_urllib_request,
  43     compat_urlparse,
  44     shlex_quote,
  45 )
  46
  47
  48 # This is not clearly defined otherwise
  49 compiled_regex_type = type(re.compile(''))
  50
  51 std_headers = {
  52     'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0 (Chrome)',
  53     'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
  54     'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
  55     'Accept-Encoding': 'gzip, deflate',
  56     'Accept-Language': 'en-us,en;q=0.5',
  57 }
  58
  59
  60 def preferredencoding():
  61     """Get preferred encoding.
  62
  63     Returns the best encoding scheme for the system, based on
  64     locale.getpreferredencoding() and some further tweaks.
  65     """
  66     try:
  67         pref = locale.getpreferredencoding()
  68         'TEST'.encode(pref)
  69     except:
  70         pref = 'UTF-8'
  71
  72     return pref
  73
  74
  75 def write_json_file(obj, fn):
  76     """ Encode obj as JSON and write it to fn, atomically if possible """
  77
  78     fn = encodeFilename(fn)
  79     if sys.version_info < (3, 0) and sys.platform != 'win32':
  80         encoding = get_filesystem_encoding()
  81         # os.path.basename returns a bytes object, but NamedTemporaryFile
  82         # will fail if the filename contains non ascii characters unless we
  83         # use a unicode object
  84         path_basename = lambda f: os.path.basename(fn).decode(encoding)
  85         # the same for os.path.dirname
  86         path_dirname = lambda f: os.path.dirname(fn).decode(encoding)
  87     else:
  88         path_basename = os.path.basename
  89         path_dirname = os.path.dirname
  90
  91     args = {
  92         'suffix': '.tmp',
  93         'prefix': path_basename(fn) + '.',
  94         'dir': path_dirname(fn),
  95         'delete': False,
  96     }
  97
  98     # In Python 2.x, json.dump expects a bytestream.
  99     # In Python 3.x, it writes to a character stream
 100     if sys.version_info < (3, 0):
 101         args['mode'] = 'wb'
 102     else:
 103         args.update({
 104             'mode': 'w',
 105             'encoding': 'utf-8',
 106         })
 107
 108     tf = tempfile.NamedTemporaryFile(**args)
 109
 110     try:
 111         with tf:
 112             json.dump(obj, tf)
 113         if sys.platform == 'win32':
 114             # Need to remove existing file on Windows, else os.rename raises
 115             # WindowsError or FileExistsError.
 116             try:
 117                 os.unlink(fn)
 118             except OSError:
 119                 pass
 120         os.rename(tf.name, fn)
 121     except:
 122         try:
 123             os.remove(tf.name)
 124         except OSError:
 125             pass
 126         raise
 127
 128
 129 if sys.version_info >= (2, 7):
 130     def find_xpath_attr(node, xpath, key, val):
 131         """ Find the xpath xpath[@key=val] """
 132         assert re.match(r'^[a-zA-Z-]+$', key)
 133         assert re.match(r'^[a-zA-Z0-9@\s:._-]*$', val)
 134         expr = xpath + "[@%s='%s']" % (key, val)
 135         return node.find(expr)
 136 else:
 137     def find_xpath_attr(node, xpath, key, val):
 138         # Here comes the crazy part: In 2.6, if the xpath is a unicode,
 139         # .//node does not match if a node is a direct child of . !
 140         if isinstance(xpath, unicode):
 141             xpath = xpath.encode('ascii')
 142
 143         for f in node.findall(xpath):
 144             if f.attrib.get(key) == val:
 145                 return f
 146         return None
 147
 148 # On python2.6 the xml.etree.ElementTree.Element methods don't support
 149 # the namespace parameter
 150
 151
 152 def xpath_with_ns(path, ns_map):
 153     components = [c.split(':') for c in path.split('/')]
 154     replaced = []
 155     for c in components:
 156         if len(c) == 1:
 157             replaced.append(c[0])
 158         else:
 159             ns, tag = c
 160             replaced.append('{%s}%s' % (ns_map[ns], tag))
 161     return '/'.join(replaced)
 162
 163
 164 def xpath_text(node, xpath, name=None, fatal=False):
 165     if sys.version_info < (2, 7):  # Crazy 2.6
 166         xpath = xpath.encode('ascii')
 167
 168     n = node.find(xpath)
 169     if n is None or n.text is None:
 170         if fatal:
 171             name = xpath if name is None else name
 172             raise ExtractorError('Could not find XML element %s' % name)
 173         else:
 174             return None
 175     return n.text
 176
 177
 178 def get_element_by_id(id, html):
 179     """Return the content of the tag with the specified ID in the passed HTML document"""
 180     return get_element_by_attribute("id", id, html)
 181
 182
 183 def get_element_by_attribute(attribute, value, html):
 184     """Return the content of the tag with the specified attribute in the passed HTML document"""
 185
 186     m = re.search(r'''(?xs)
 187         <([a-zA-Z0-9:._-]+)
 188          (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
 189          \s+%s=['"]?%s['"]?
 190          (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
 191         \s*>
 192         (?P<content>.*?)
 193         </\1>
 194     ''' % (re.escape(attribute), re.escape(value)), html)
 195
 196     if not m:
 197         return None
 198     res = m.group('content')
 199
 200     if res.startswith('"') or res.startswith("'"):
 201         res = res[1:-1]
 202
 203     return unescapeHTML(res)
 204
 205
 206 def clean_html(html):
 207     """Clean an HTML snippet into a readable string"""
 208     # Newline vs <br />
 209     html = html.replace('\n', ' ')
 210     html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
 211     html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
 212     # Strip html tags
 213     html = re.sub('<.*?>', '', html)
 214     # Replace html entities
 215     html = unescapeHTML(html)
 216     return html.strip()
 217
 218
 219 def sanitize_open(filename, open_mode):
 220     """Try to open the given filename, and slightly tweak it if this fails.
 221
 222     Attempts to open the given filename. If this fails, it tries to change
 223     the filename slightly, step by step, until it's either able to open it
 224     or it fails and raises a final exception, like the standard open()
 225     function.
 226
 227     It returns the tuple (stream, definitive_file_name).
 228     """
 229     try:
 230         if filename == '-':
 231             if sys.platform == 'win32':
 232                 import msvcrt
 233                 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
 234             return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
 235         stream = open(encodeFilename(filename), open_mode)
 236         return (stream, filename)
 237     except (IOError, OSError) as err:
 238         if err.errno in (errno.EACCES,):
 239             raise
 240
 241         # In case of error, try to remove win32 forbidden chars
 242         alt_filename = os.path.join(
 243             re.sub('[/<>:"\\|\\\\?\\*]', '#', path_part)
 244             for path_part in os.path.split(filename)
 245         )
 246         if alt_filename == filename:
 247             raise
 248         else:
 249             # An exception here should be caught in the caller
 250             stream = open(encodeFilename(filename), open_mode)
 251             return (stream, alt_filename)
 252
 253
 254 def timeconvert(timestr):
 255     """Convert RFC 2822 defined time string into system timestamp"""
 256     timestamp = None
 257     timetuple = email.utils.parsedate_tz(timestr)
 258     if timetuple is not None:
 259         timestamp = email.utils.mktime_tz(timetuple)
 260     return timestamp
 261
 262
 263 def sanitize_filename(s, restricted=False, is_id=False):
 264     """Sanitizes a string so it could be used as part of a filename.
 265     If restricted is set, use a stricter subset of allowed characters.
 266     Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
 267     """
 268     def replace_insane(char):
 269         if char == '?' or ord(char) < 32 or ord(char) == 127:
 270             return ''
 271         elif char == '"':
 272             return '' if restricted else '\''
 273         elif char == ':':
 274             return '_-' if restricted else ' -'
 275         elif char in '\\/|*<>':
 276             return '_'
 277         if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
 278             return '_'
 279         if restricted and ord(char) > 127:
 280             return '_'
 281         return char
 282
 283     result = ''.join(map(replace_insane, s))
 284     if not is_id:
 285         while '__' in result:
 286             result = result.replace('__', '_')
 287         result = result.strip('_')
 288         # Common case of "Foreign band name - English song title"
 289         if restricted and result.startswith('-_'):
 290             result = result[2:]
 291         if not result:
 292             result = '_'
 293     return result
 294
 295
 296 def orderedSet(iterable):
 297     """ Remove all duplicates from the input iterable """
 298     res = []
 299     for el in iterable:
 300         if el not in res:
 301             res.append(el)
 302     return res
 303
 304
 305 def _htmlentity_transform(entity):
 306     """Transforms an HTML entity to a character."""
 307     # Known non-numeric HTML entity
 308     if entity in compat_html_entities.name2codepoint:
 309         return compat_chr(compat_html_entities.name2codepoint[entity])
 310
 311     mobj = re.match(r'#(x?[0-9]+)', entity)
 312     if mobj is not None:
 313         numstr = mobj.group(1)
 314         if numstr.startswith('x'):
 315             base = 16
 316             numstr = '0%s' % numstr
 317         else:
 318             base = 10
 319         return compat_chr(int(numstr, base))
 320
 321     # Unknown entity in name, return its literal representation
 322     return ('&%s;' % entity)
 323
 324
 325 def unescapeHTML(s):
 326     if s is None:
 327         return None
 328     assert type(s) == compat_str
 329
 330     return re.sub(
 331         r'&([^;]+);', lambda m: _htmlentity_transform(m.group(1)), s)
 332
 333
 334 def encodeFilename(s, for_subprocess=False):
 335     """
 336     @param s The name of the file
 337     """
 338
 339     assert type(s) == compat_str
 340
 341     # Python 3 has a Unicode API
 342     if sys.version_info >= (3, 0):
 343         return s
 344
 345     if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
 346         # Pass '' directly to use Unicode APIs on Windows 2000 and up
 347         # (Detecting Windows NT 4 is tricky because 'major >= 4' would
 348         # match Windows 9x series as well. Besides, NT 4 is obsolete.)
 349         if not for_subprocess:
 350             return s
 351         else:
 352             # For subprocess calls, encode with locale encoding
 353             # Refer to http://stackoverflow.com/a/9951851/35070
 354             encoding = preferredencoding()
 355     else:
 356         encoding = sys.getfilesystemencoding()
 357     if encoding is None:
 358         encoding = 'utf-8'
 359     return s.encode(encoding, 'ignore')
 360
 361
 362 def encodeArgument(s):
 363     if not isinstance(s, compat_str):
 364         # Legacy code that uses byte strings
 365         # Uncomment the following line after fixing all post processors
 366         #assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
 367         s = s.decode('ascii')
 368     return encodeFilename(s, True)
 369
 370
 371 def decodeOption(optval):
 372     if optval is None:
 373         return optval
 374     if isinstance(optval, bytes):
 375         optval = optval.decode(preferredencoding())
 376
 377     assert isinstance(optval, compat_str)
 378     return optval
 379
 380
 381 def formatSeconds(secs):
 382     if secs > 3600:
 383         return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
 384     elif secs > 60:
 385         return '%d:%02d' % (secs // 60, secs % 60)
 386     else:
 387         return '%d' % secs
 388
 389
 390 def make_HTTPS_handler(opts_no_check_certificate, **kwargs):
 391     if hasattr(ssl, 'create_default_context'):  # Python >= 3.4 or 2.7.9
 392         context = ssl.create_default_context(ssl.Purpose.CLIENT_AUTH)
 393         context.options &= ~ssl.OP_NO_SSLv3  # Allow older, not-as-secure SSLv3
 394         if opts_no_check_certificate:
 395             context.verify_mode = ssl.CERT_NONE
 396         return compat_urllib_request.HTTPSHandler(context=context, **kwargs)
 397     elif sys.version_info < (3, 2):
 398         import httplib
 399
 400         class HTTPSConnectionV3(httplib.HTTPSConnection):
 401             def __init__(self, *args, **kwargs):
 402                 httplib.HTTPSConnection.__init__(self, *args, **kwargs)
 403
 404             def connect(self):
 405                 sock = socket.create_connection((self.host, self.port), self.timeout)
 406                 if getattr(self, '_tunnel_host', False):
 407                     self.sock = sock
 408                     self._tunnel()
 409                 try:
 410                     self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_TLSv1)
 411                 except ssl.SSLError:
 412                     self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_SSLv23)
 413
 414         class HTTPSHandlerV3(compat_urllib_request.HTTPSHandler):
 415             def https_open(self, req):
 416                 return self.do_open(HTTPSConnectionV3, req)
 417         return HTTPSHandlerV3(**kwargs)
 418     else:  # Python < 3.4
 419         context = ssl.SSLContext(ssl.PROTOCOL_SSLv23)
 420         context.verify_mode = (ssl.CERT_NONE
 421                                if opts_no_check_certificate
 422                                else ssl.CERT_REQUIRED)
 423         context.set_default_verify_paths()
 424         return compat_urllib_request.HTTPSHandler(context=context, **kwargs)
 425
 426
 427 class ExtractorError(Exception):
 428     """Error during info extraction."""
 429
 430     def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
 431         """ tb, if given, is the original traceback (so that it can be printed out).
 432         If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
 433         """
 434
 435         if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
 436             expected = True
 437         if video_id is not None:
 438             msg = video_id + ': ' + msg
 439         if cause:
 440             msg += ' (caused by %r)' % cause
 441         if not expected:
 442             if ytdl_is_updateable():
 443                 update_cmd = 'type  youtube-dl -U  to update'
 444             else:
 445                 update_cmd = 'see  https://yt-dl.org/update  on how to update'
 446             msg += '; please report this issue on https://yt-dl.org/bug .'
 447             msg += ' Make sure you are using the latest version; %s.' % update_cmd
 448             msg += ' Be sure to call youtube-dl with the --verbose flag and include its complete output.'
 449         super(ExtractorError, self).__init__(msg)
 450
 451         self.traceback = tb
 452         self.exc_info = sys.exc_info()  # preserve original exception
 453         self.cause = cause
 454         self.video_id = video_id
 455
 456     def format_traceback(self):
 457         if self.traceback is None:
 458             return None
 459         return ''.join(traceback.format_tb(self.traceback))
 460
 461
 462 class RegexNotFoundError(ExtractorError):
 463     """Error when a regex didn't match"""
 464     pass
 465
 466
 467 class DownloadError(Exception):
 468     """Download Error exception.
 469
 470     This exception may be thrown by FileDownloader objects if they are not
 471     configured to continue on errors. They will contain the appropriate
 472     error message.
 473     """
 474
 475     def __init__(self, msg, exc_info=None):
 476         """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
 477         super(DownloadError, self).__init__(msg)
 478         self.exc_info = exc_info
 479
 480
 481 class SameFileError(Exception):
 482     """Same File exception.
 483
 484     This exception will be thrown by FileDownloader objects if they detect
 485     multiple files would have to be downloaded to the same file on disk.
 486     """
 487     pass
 488
 489
 490 class PostProcessingError(Exception):
 491     """Post Processing exception.
 492
 493     This exception may be raised by PostProcessor's .run() method to
 494     indicate an error in the postprocessing task.
 495     """
 496
 497     def __init__(self, msg):
 498         self.msg = msg
 499
 500
 501 class MaxDownloadsReached(Exception):
 502     """ --max-downloads limit has been reached. """
 503     pass
 504
 505
 506 class UnavailableVideoError(Exception):
 507     """Unavailable Format exception.
 508
 509     This exception will be thrown when a video is requested
 510     in a format that is not available for that video.
 511     """
 512     pass
 513
 514
 515 class ContentTooShortError(Exception):
 516     """Content Too Short exception.
 517
 518     This exception may be raised by FileDownloader objects when a file they
 519     download is too small for what the server announced first, indicating
 520     the connection was probably interrupted.
 521     """
 522     # Both in bytes
 523     downloaded = None
 524     expected = None
 525
 526     def __init__(self, downloaded, expected):
 527         self.downloaded = downloaded
 528         self.expected = expected
 529
 530
 531 class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
 532     """Handler for HTTP requests and responses.
 533
 534     This class, when installed with an OpenerDirector, automatically adds
 535     the standard headers to every HTTP request and handles gzipped and
 536     deflated responses from web servers. If compression is to be avoided in
 537     a particular request, the original request in the program code only has
 538     to include the HTTP header "Youtubedl-No-Compression", which will be
 539     removed before making the real request.
 540
 541     Part of this code was copied from:
 542
 543     http://techknack.net/python-urllib2-handlers/
 544
 545     Andrew Rowls, the author of that code, agreed to release it to the
 546     public domain.
 547     """
 548
 549     @staticmethod
 550     def deflate(data):
 551         try:
 552             return zlib.decompress(data, -zlib.MAX_WBITS)
 553         except zlib.error:
 554             return zlib.decompress(data)
 555
 556     @staticmethod
 557     def addinfourl_wrapper(stream, headers, url, code):
 558         if hasattr(compat_urllib_request.addinfourl, 'getcode'):
 559             return compat_urllib_request.addinfourl(stream, headers, url, code)
 560         ret = compat_urllib_request.addinfourl(stream, headers, url)
 561         ret.code = code
 562         return ret
 563
 564     def http_request(self, req):
 565         for h, v in std_headers.items():
 566             if h not in req.headers:
 567                 req.add_header(h, v)
 568         if 'Youtubedl-no-compression' in req.headers:
 569             if 'Accept-encoding' in req.headers:
 570                 del req.headers['Accept-encoding']
 571             del req.headers['Youtubedl-no-compression']
 572         if 'Youtubedl-user-agent' in req.headers:
 573             if 'User-agent' in req.headers:
 574                 del req.headers['User-agent']
 575             req.headers['User-agent'] = req.headers['Youtubedl-user-agent']
 576             del req.headers['Youtubedl-user-agent']
 577
 578         if sys.version_info < (2, 7) and '#' in req.get_full_url():
 579             # Python 2.6 is brain-dead when it comes to fragments
 580             req._Request__original = req._Request__original.partition('#')[0]
 581             req._Request__r_type = req._Request__r_type.partition('#')[0]
 582
 583         return req
 584
 585     def http_response(self, req, resp):
 586         old_resp = resp
 587         # gzip
 588         if resp.headers.get('Content-encoding', '') == 'gzip':
 589             content = resp.read()
 590             gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
 591             try:
 592                 uncompressed = io.BytesIO(gz.read())
 593             except IOError as original_ioerror:
 594                 # There may be junk add the end of the file
 595                 # See http://stackoverflow.com/q/4928560/35070 for details
 596                 for i in range(1, 1024):
 597                     try:
 598                         gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
 599                         uncompressed = io.BytesIO(gz.read())
 600                     except IOError:
 601                         continue
 602                     break
 603                 else:
 604                     raise original_ioerror
 605             resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
 606             resp.msg = old_resp.msg
 607         # deflate
 608         if resp.headers.get('Content-encoding', '') == 'deflate':
 609             gz = io.BytesIO(self.deflate(resp.read()))
 610             resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
 611             resp.msg = old_resp.msg
 612         return resp
 613
 614     https_request = http_request
 615     https_response = http_response
 616
 617
 618 def parse_iso8601(date_str, delimiter='T'):
 619     """ Return a UNIX timestamp from the given date """
 620
 621     if date_str is None:
 622         return None
 623
 624     m = re.search(
 625         r'(\.[0-9]+)?(?:Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
 626         date_str)
 627     if not m:
 628         timezone = datetime.timedelta()
 629     else:
 630         date_str = date_str[:-len(m.group(0))]
 631         if not m.group('sign'):
 632             timezone = datetime.timedelta()
 633         else:
 634             sign = 1 if m.group('sign') == '+' else -1
 635             timezone = datetime.timedelta(
 636                 hours=sign * int(m.group('hours')),
 637                 minutes=sign * int(m.group('minutes')))
 638     date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
 639     dt = datetime.datetime.strptime(date_str, date_format) - timezone
 640     return calendar.timegm(dt.timetuple())
 641
 642
 643 def unified_strdate(date_str, day_first=True):
 644     """Return a string with the date in the format YYYYMMDD"""
 645
 646     if date_str is None:
 647         return None
 648     upload_date = None
 649     # Replace commas
 650     date_str = date_str.replace(',', ' ')
 651     # %z (UTC offset) is only supported in python>=3.2
 652     date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str)
 653     # Remove AM/PM + timezone
 654     date_str = re.sub(r'(?i)\s*(?:AM|PM)\s+[A-Z]+', '', date_str)
 655
 656     format_expressions = [
 657         '%d %B %Y',
 658         '%d %b %Y',
 659         '%B %d %Y',
 660         '%b %d %Y',
 661         '%b %dst %Y %I:%M%p',
 662         '%b %dnd %Y %I:%M%p',
 663         '%b %dth %Y %I:%M%p',
 664         '%Y-%m-%d',
 665         '%Y/%m/%d',
 666         '%d.%m.%Y',
 667         '%d/%m/%Y',
 668         '%d/%m/%y',
 669         '%Y/%m/%d %H:%M:%S',
 670         '%Y-%m-%d %H:%M:%S',
 671         '%Y-%m-%d %H:%M:%S.%f',
 672         '%d.%m.%Y %H:%M',
 673         '%d.%m.%Y %H.%M',
 674         '%Y-%m-%dT%H:%M:%SZ',
 675         '%Y-%m-%dT%H:%M:%S.%fZ',
 676         '%Y-%m-%dT%H:%M:%S.%f0Z',
 677         '%Y-%m-%dT%H:%M:%S',
 678         '%Y-%m-%dT%H:%M:%S.%f',
 679         '%Y-%m-%dT%H:%M',
 680     ]
 681     if day_first:
 682         format_expressions.extend([
 683             '%d/%m/%Y %H:%M:%S',
 684         ])
 685     else:
 686         format_expressions.extend([
 687             '%m/%d/%Y %H:%M:%S',
 688         ])
 689     for expression in format_expressions:
 690         try:
 691             upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
 692         except ValueError:
 693             pass
 694     if upload_date is None:
 695         timetuple = email.utils.parsedate_tz(date_str)
 696         if timetuple:
 697             upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
 698     return upload_date
 699
 700
 701 def determine_ext(url, default_ext='unknown_video'):
 702     if url is None:
 703         return default_ext
 704     guess = url.partition('?')[0].rpartition('.')[2]
 705     if re.match(r'^[A-Za-z0-9]+$', guess):
 706         return guess
 707     else:
 708         return default_ext
 709
 710
 711 def subtitles_filename(filename, sub_lang, sub_format):
 712     return filename.rsplit('.', 1)[0] + '.' + sub_lang + '.' + sub_format
 713
 714
 715 def date_from_str(date_str):
 716     """
 717     Return a datetime object from a string in the format YYYYMMDD or
 718     (now|today)[+-][0-9](day|week|month|year)(s)?"""
 719     today = datetime.date.today()
 720     if date_str in ('now', 'today'):
 721         return today
 722     if date_str == 'yesterday':
 723         return today - datetime.timedelta(days=1)
 724     match = re.match('(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
 725     if match is not None:
 726         sign = match.group('sign')
 727         time = int(match.group('time'))
 728         if sign == '-':
 729             time = -time
 730         unit = match.group('unit')
 731         # A bad aproximation?
 732         if unit == 'month':
 733             unit = 'day'
 734             time *= 30
 735         elif unit == 'year':
 736             unit = 'day'
 737             time *= 365
 738         unit += 's'
 739         delta = datetime.timedelta(**{unit: time})
 740         return today + delta
 741     return datetime.datetime.strptime(date_str, "%Y%m%d").date()
 742
 743
 744 def hyphenate_date(date_str):
 745     """
 746     Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
 747     match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
 748     if match is not None:
 749         return '-'.join(match.groups())
 750     else:
 751         return date_str
 752
 753
 754 class DateRange(object):
 755     """Represents a time interval between two dates"""
 756
 757     def __init__(self, start=None, end=None):
 758         """start and end must be strings in the format accepted by date"""
 759         if start is not None:
 760             self.start = date_from_str(start)
 761         else:
 762             self.start = datetime.datetime.min.date()
 763         if end is not None:
 764             self.end = date_from_str(end)
 765         else:
 766             self.end = datetime.datetime.max.date()
 767         if self.start > self.end:
 768             raise ValueError('Date range: "%s" , the start date must be before the end date' % self)
 769
 770     @classmethod
 771     def day(cls, day):
 772         """Returns a range that only contains the given day"""
 773         return cls(day, day)
 774
 775     def __contains__(self, date):
 776         """Check if the date is in the range"""
 777         if not isinstance(date, datetime.date):
 778             date = date_from_str(date)
 779         return self.start <= date <= self.end
 780
 781     def __str__(self):
 782         return '%s - %s' % (self.start.isoformat(), self.end.isoformat())
 783
 784
 785 def platform_name():
 786     """ Returns the platform name as a compat_str """
 787     res = platform.platform()
 788     if isinstance(res, bytes):
 789         res = res.decode(preferredencoding())
 790
 791     assert isinstance(res, compat_str)
 792     return res
 793
 794
 795 def _windows_write_string(s, out):
 796     """ Returns True if the string was written using special methods,
 797     False if it has yet to be written out."""
 798     # Adapted from http://stackoverflow.com/a/3259271/35070
 799
 800     import ctypes
 801     import ctypes.wintypes
 802
 803     WIN_OUTPUT_IDS = {
 804         1: -11,
 805         2: -12,
 806     }
 807
 808     try:
 809         fileno = out.fileno()
 810     except AttributeError:
 811         # If the output stream doesn't have a fileno, it's virtual
 812         return False
 813     if fileno not in WIN_OUTPUT_IDS:
 814         return False
 815
 816     GetStdHandle = ctypes.WINFUNCTYPE(
 817         ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
 818         (b"GetStdHandle", ctypes.windll.kernel32))
 819     h = GetStdHandle(WIN_OUTPUT_IDS[fileno])
 820
 821     WriteConsoleW = ctypes.WINFUNCTYPE(
 822         ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
 823         ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
 824         ctypes.wintypes.LPVOID)((b"WriteConsoleW", ctypes.windll.kernel32))
 825     written = ctypes.wintypes.DWORD(0)
 826
 827     GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)((b"GetFileType", ctypes.windll.kernel32))
 828     FILE_TYPE_CHAR = 0x0002
 829     FILE_TYPE_REMOTE = 0x8000
 830     GetConsoleMode = ctypes.WINFUNCTYPE(
 831         ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
 832         ctypes.POINTER(ctypes.wintypes.DWORD))(
 833         (b"GetConsoleMode", ctypes.windll.kernel32))
 834     INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value
 835
 836     def not_a_console(handle):
 837         if handle == INVALID_HANDLE_VALUE or handle is None:
 838             return True
 839         return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR
 840                 or GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)
 841
 842     if not_a_console(h):
 843         return False
 844
 845     def next_nonbmp_pos(s):
 846         try:
 847             return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
 848         except StopIteration:
 849             return len(s)
 850
 851     while s:
 852         count = min(next_nonbmp_pos(s), 1024)
 853
 854         ret = WriteConsoleW(
 855             h, s, count if count else 2, ctypes.byref(written), None)
 856         if ret == 0:
 857             raise OSError('Failed to write string')
 858         if not count:  # We just wrote a non-BMP character
 859             assert written.value == 2
 860             s = s[1:]
 861         else:
 862             assert written.value > 0
 863             s = s[written.value:]
 864     return True
 865
 866
 867 def write_string(s, out=None, encoding=None):
 868     if out is None:
 869         out = sys.stderr
 870     assert type(s) == compat_str
 871
 872     if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
 873         if _windows_write_string(s, out):
 874             return
 875
 876     if ('b' in getattr(out, 'mode', '') or
 877             sys.version_info[0] < 3):  # Python 2 lies about mode of sys.stderr
 878         byt = s.encode(encoding or preferredencoding(), 'ignore')
 879         out.write(byt)
 880     elif hasattr(out, 'buffer'):
 881         enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
 882         byt = s.encode(enc, 'ignore')
 883         out.buffer.write(byt)
 884     else:
 885         out.write(s)
 886     out.flush()
 887
 888
 889 def bytes_to_intlist(bs):
 890     if not bs:
 891         return []
 892     if isinstance(bs[0], int):  # Python 3
 893         return list(bs)
 894     else:
 895         return [ord(c) for c in bs]
 896
 897
 898 def intlist_to_bytes(xs):
 899     if not xs:
 900         return b''
 901     return struct_pack('%dB' % len(xs), *xs)
 902
 903
 904 # Cross-platform file locking
if sys.platform == 'win32':
    # Windows: lock through the Win32 LockFileEx/UnlockFileEx functions,
    # called via ctypes.
    import ctypes.wintypes
    import msvcrt

    # ctypes mirror of the structure passed as the last argument of
    # LockFileEx/UnlockFileEx (see the argtypes below).
    class OVERLAPPED(ctypes.Structure):
        _fields_ = [
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),
        ]

    kernel32 = ctypes.windll.kernel32
    # Declare argument and return types explicitly so ctypes converts the
    # Python values passed below.
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    # Low/high halves of the byte count handed to both calls; presumably
    # chosen large enough to cover the whole file.
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive):
        """ Lock the open file f, exclusively if exclusive is true.

        Raises OSError if LockFileEx reports failure. """
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        # Stored on the file object so that _unlock_file can pass the very
        # same structure again.
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)
        handle = msvcrt.get_osfhandle(f.fileno())
        # Flag 0x2 requests an exclusive lock (LOCKFILE_EXCLUSIVE_LOCK in the
        # Win32 API); 0x0 is used for non-exclusive locks.
        if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
                          whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Locking file failed: %r' % ctypes.FormatError())

    def _unlock_file(f):
        """ Release the lock taken by _lock_file on f.

        Raises OSError if UnlockFileEx reports failure. """
        # Only valid after _lock_file has attached the OVERLAPPED pointer
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        if not UnlockFileEx(handle, 0,
                            whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())

else:
    # Every other platform: rely on fcntl.flock
    import fcntl

    def _lock_file(f, exclusive):
        """ Lock the open file f with flock (LOCK_EX if exclusive, else LOCK_SH) """
        fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)

    def _unlock_file(f):
        """ Release the flock lock held on f """
        fcntl.flock(f, fcntl.LOCK_UN)
 967
 968
 969 class locked_file(object):
 970     def __init__(self, filename, mode, encoding=None):
 971         assert mode in ['r', 'a', 'w']
 972         self.f = io.open(filename, mode, encoding=encoding)
 973         self.mode = mode
 974
 975     def __enter__(self):
 976         exclusive = self.mode != 'r'
 977         try:
 978             _lock_file(self.f, exclusive)
 979         except IOError:
 980             self.f.close()
 981             raise
 982         return self
 983
 984     def __exit__(self, etype, value, traceback):
 985         try:
 986             _unlock_file(self.f)
 987         finally:
 988             self.f.close()
 989
 990     def __iter__(self):
 991         return iter(self.f)
 992
 993     def write(self, *args):
 994         return self.f.write(*args)
 995
 996     def read(self, *args):
 997         return self.f.read(*args)
 998
 999
def get_filesystem_encoding():
    """ Return sys.getfilesystemencoding(), or 'utf-8' if that is None """
    fs_encoding = sys.getfilesystemencoding()
    if fs_encoding is None:
        return 'utf-8'
    return fs_encoding
1003
1004
def shell_quote(args):
    """ Quote every argument for the shell and join them with spaces.

    Byte string arguments are decoded with the filesystem encoding first.
    Quoting is done with shlex_quote (already imported from .compat and
    used by args_to_str) instead of the undocumented pipes.quote, whose
    module is deprecated and no longer available in recent Python versions.
    """
    encoding = None
    quoted_args = []
    for a in args:
        if isinstance(a, bytes):
            # We may get a filename encoded with 'encodeFilename'
            if encoding is None:
                # Only look the encoding up when it is actually needed
                encoding = get_filesystem_encoding()
            a = a.decode(encoding)
        quoted_args.append(shlex_quote(a))
    return ' '.join(quoted_args)
1014
1015
def takewhile_inclusive(pred, seq):
    """ Variant of itertools.takewhile that also yields the first element
        for which pred is false, and stops right after it """
    for item in seq:
        yield item
        if pred(item):
            continue
        # item was the first one failing the predicate; it has already
        # been handed out, so we are done
        break
1023
1024
def smuggle_url(url, data):
    """ Attach data (JSON-serialized) to url as a fragment, for internal use.

    The counterpart unsmuggle_url extracts the data again.
    """
    payload = {'__youtubedl_smuggle': json.dumps(data)}
    fragment = compat_urllib_parse.urlencode(payload)
    return url + '#' + fragment
1031
1032
def unsmuggle_url(smug_url, default=None):
    """ Split a URL produced by smuggle_url into (url, data).

    URLs without smuggled data are returned unchanged, paired with default.
    """
    if '#__youtubedl_smuggle' in smug_url:
        # The smuggled data lives in the last fragment of the URL
        url, _, sdata = smug_url.rpartition('#')
        payload = compat_parse_qs(sdata)['__youtubedl_smuggle'][0]
        return url, json.loads(payload)
    return smug_url, default
1040
1041
def format_bytes(bytes):
    """ Format a byte count as a human-readable string with a binary suffix.

    bytes may be a number, a numeric string or None ('N/A' is returned for
    None). The result has two decimals and a suffix from B to YiB, e.g.
    '1.50KiB'. Values below 1 are shown in B, and values beyond the largest
    suffix are shown in YiB, instead of picking a wrong suffix or failing.
    """
    if bytes is None:
        return 'N/A'
    if isinstance(bytes, str):
        bytes = float(bytes)
    suffixes = ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB']
    # Find the exponent by repeated division rather than math.log: this is
    # exact at powers of 1024, keeps the exponent within the suffix table,
    # and also works for zero and negative values.
    magnitude = abs(float(bytes))
    exponent = 0
    while magnitude >= 1024.0 and exponent < len(suffixes) - 1:
        magnitude /= 1024.0
        exponent += 1
    converted = float(bytes) / float(1024 ** exponent)
    return '%.2f%s' % (converted, suffixes[exponent])
1054
1055
def parse_filesize(s):
    """ Parse a file size such as '5 MiB' or '1,5 kB' into a number of bytes.

    The string must start with a number (',' or '.' as decimal separator)
    followed by a known unit. Returns an int, or None if s is None or does
    not match.
    """
    if s is None:
        return None

    # The lower-case forms are of course incorrect and inofficial,
    # but we support those too
    unit_table = {'B': 1, 'b': 1}
    for power, prefix in enumerate('KMGTPEZY', 1):
        binary, decimal = 1024 ** power, 1000 ** power
        unit_table[prefix + 'iB'] = binary           # e.g. KiB
        unit_table[prefix + 'B'] = decimal           # e.g. KB
        unit_table[prefix.lower() + 'B'] = binary    # e.g. kB
        unit_table[prefix + 'b'] = decimal           # e.g. Kb

    # No unit is a prefix of another one, so the order of the alternatives
    # is irrelevant
    pattern = r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)' % '|'.join(
        re.escape(unit) for unit in unit_table)
    match = re.match(pattern, s)
    if match is None:
        return None

    number = float(match.group('num').replace(',', '.'))
    return int(number * unit_table[match.group('unit')])
1108
1109
def get_term_width():
    """ Return the terminal width in columns, or None if it is unknown.

    The COLUMNS environment variable takes precedence; otherwise the second
    field of the output of `stty size` is used. A COLUMNS value that is not
    a number is ignored rather than raising.
    """
    columns = compat_getenv('COLUMNS', None)
    if columns:
        try:
            return int(columns)
        except ValueError:
            pass  # Malformed COLUMNS, ask the terminal instead

    try:
        sp = subprocess.Popen(
            ['stty', 'size'],
            stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        out, err = sp.communicate()
        return int(out.split()[1])
    except Exception:
        # Best effort only (stty missing, no terminal, unexpected output),
        # but do not swallow KeyboardInterrupt/SystemExit
        return None
1124
1125
def month_by_name(name):
    """ Map an English month name (independent of the locale) to its
        number 1-12; unknown names give None """

    english_names = (
        'January', 'February', 'March', 'April', 'May', 'June',
        'July', 'August', 'September', 'October', 'November', 'December')
    for number, month in enumerate(english_names, 1):
        if month == name:
            return number
    return None
1136
1137
def fix_xml_ampersands(xml_str):
    """Escape as '&amp;' every '&' in XML that does not already start one of
    the predefined entities or a short numeric character reference"""
    bare_ampersand = re.compile(
        r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)')
    return bare_ampersand.sub('&amp;', xml_str)
1144
1145
def setproctitle(title):
    """ Set the name of the current process to title via prctl, if possible.

    Silently does nothing when libc.so.6 cannot be loaded or does not
    provide prctl.
    """
    assert isinstance(title, compat_str)
    try:
        libc = ctypes.cdll.LoadLibrary("libc.so.6")
    except OSError:
        return
    title_bytes = title.encode('utf-8')
    # Initialising the buffer from the bytes allocates one extra byte and
    # NUL-terminates it. A buffer of exactly len(title_bytes) bytes has no
    # terminator, which lets prctl read past its end for short titles.
    buf = ctypes.create_string_buffer(title_bytes)
    try:
        libc.prctl(15, buf, 0, 0, 0)  # 15 is PR_SET_NAME
    except AttributeError:
        return  # Strange libc, just skip this
1159
1160
def remove_start(s, start):
    """ Return s without the prefix start; s is returned as is when it does
        not begin with start """
    return s[len(start):] if s.startswith(start) else s
1165
1166
def remove_end(s, end):
    """ Return s without the suffix end; s is returned as is when it does
        not finish with end (or when end is empty) """
    # An empty suffix must be special-cased: s[:-0] is s[:0], i.e. ''
    if end and s.endswith(end):
        return s[:-len(end)]
    return s
1171
1172
def url_basename(url):
    """ Return the last segment of the path of url (surrounding slashes are
        ignored; query string and fragment are not part of the path) """
    parsed = compat_urlparse.urlparse(url)
    segments = parsed.path.strip('/').split('/')
    return segments[-1]
1176
1177
class HEADRequest(compat_urllib_request.Request):
    """ Request whose HTTP method is always HEAD """

    def get_method(self):
        return 'HEAD'
1181
1182
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    """ Convert v to an int, multiplied by invscale and floor-divided by
        scale; None and '' give default.

    If get_attr is set, the attribute of that name is read from v first
    (a missing attribute counts as None).
    """
    if get_attr and v is not None:
        v = getattr(v, get_attr, None)
    if v is None or v == '':
        return default
    return int(v) * invscale // scale
1190
1191
def str_or_none(v, default=None):
    """ Convert v to a text string; None gives default """
    if v is None:
        return default
    return compat_str(v)
1194
1195
def str_to_int(int_str):
    """ A more relaxed version of int_or_none: ',', '.' and '+' are
        dropped from the string before it is converted """
    if int_str is None:
        return None
    separators = re.compile(r'[,\.\+]')
    return int(separators.sub('', int_str))
1202
1203
def float_or_none(v, scale=1, invscale=1, default=None):
    """ Convert v to a float, multiplied by invscale and divided by scale;
        None gives default """
    if v is None:
        return default
    return float(v) * invscale / scale
1206
1207
def parse_duration(s):
    """ Parse a duration string into a number of seconds.

    Accepted forms include '1:02:03', '1h 2m 3s', '90', '12.5 s' as well as
    a bare amount of minutes or hours ('3 min', '1.5 hours'), optionally
    preceded by 'T'. Returns None if s is None or cannot be parsed.
    """
    if s is None:
        return None

    s = s.strip()

    m = re.match(
        r'''(?ix)T?
        (?:
            (?P<only_mins>[0-9.]+)\s*(?:mins?|minutes?)\s*|
            (?P<only_hours>[0-9.]+)\s*(?:hours?)|

            (?:
                (?:(?P<hours>[0-9]+)\s*(?:[:h]|hours?)\s*)?
                (?P<mins>[0-9]+)\s*(?:[:m]|mins?|minutes?)\s*
            )?
            (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*(?:s|secs?|seconds?)?
        )$''', s)
    if not m:
        return None

    # A lone amount of minutes or hours may be fractional
    only_mins = m.group('only_mins')
    if only_mins:
        return float_or_none(only_mins, invscale=60)
    only_hours = m.group('only_hours')
    if only_hours:
        return float_or_none(only_hours, invscale=60 * 60)

    # Otherwise add up the integral components ...
    total = 0
    for group, factor in (('secs', 1), ('mins', 60), ('hours', 60 * 60)):
        value = m.group(group)
        if value:
            total += int(value) * factor
    # ... plus the fractional seconds, if any
    fraction = m.group('ms')
    if fraction:
        total += float(fraction)
    return total
1242
1243
def prepend_extension(filename, ext):
    """ Insert ext in front of the extension of filename
        ('video.mp4', 'temp' -> 'video.temp.mp4') """
    stem, real_ext = os.path.splitext(filename)
    return '{0}.{1}{2}'.format(stem, ext, real_ext)
1247
1248
def check_executable(exe, args=[]):
    """ Try to run exe with args (ideally arguments producing a short
    output, like -version) to find out whether it is installed.
    Returns exe if it could be started, False otherwise. """
    command = [exe] + args
    try:
        process = subprocess.Popen(
            command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        # Wait for the program and discard whatever it prints
        process.communicate()
    except OSError:
        return False
    else:
        return exe
1257
1258
def get_exe_version(exe, args=['--version'],
                    version_re=r'version\s+([-0-9._a-zA-Z]+)',
                    unrecognized='present'):
    """ Returns the version of the specified executable,
    or False if the executable is not present.

    The executable is run with args and version_re is searched in the first
    line of its output (stdout and stderr combined); the first group of the
    match is the version. If nothing matches, unrecognized is returned.
    """
    # Note on the default version_re: the hyphen has to come first in the
    # character class. Written as '_-a' it denotes a range and versions
    # such as '1.2-rc3' get cut off at the hyphen.
    try:
        out, err = subprocess.Popen(
            [exe] + args,
            stdout=subprocess.PIPE, stderr=subprocess.STDOUT).communicate()
    except OSError:
        return False
    firstline = out.partition(b'\n')[0].decode('ascii', 'ignore')
    m = re.search(version_re, firstline)
    if m:
        return m.group(1)
    else:
        return unrecognized
1276
1277
class PagedList(object):
    """ Base class of the paged lists; subclasses provide getslice() """

    def __len__(self):
        # This is only useful for tests
        entries = self.getslice()
        return len(entries)
1282
1283
class OnDemandPagedList(PagedList):
    """ Paged list that fetches its pages one by one, as they are needed.

    pagefunc(pagenum) returns the entries of the given page (numbered from
    0); pagesize is the number of entries of a full page.
    """

    def __init__(self, pagefunc, pagesize):
        self._pagefunc = pagefunc
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        """ Return the entries start..end (end excluded, None for all) """
        pagesize = self._pagesize
        collected = []
        for pagenum in itertools.count(start // pagesize):
            page_first = pagenum * pagesize
            next_page_first = page_first + pagesize
            if start >= next_page_first:
                continue

            entries = list(self._pagefunc(pagenum))

            # Offset of the first wanted entry within this page
            offset = 0
            if page_first <= start < next_page_first:
                offset = start % pagesize

            # Position of the end of the wanted range within this page, if
            # the range ends here
            limit = None
            if end is not None and page_first <= end <= next_page_first:
                limit = ((end - 1) % pagesize) + 1

            if offset != 0 or limit is not None:
                entries = entries[offset:limit]
            collected.extend(entries)

            # A little optimization - a page that is not "full", i.e. holds
            # fewer than pagesize entries, has to be the last one, so there
            # is no need to query again.
            if len(entries) + offset < pagesize:
                break

            # The page was complete, but the following one is of no
            # interest: stop early as well
            if end == next_page_first:
                break
        return collected
1325
1326
class InAdvancePagedList(PagedList):
    """ Paged list whose number of pages is known in advance.

    pagefunc(pagenum) returns the entries of the given page (numbered from
    0), pagecount is the total number of pages and pagesize the number of
    entries per page.
    """

    def __init__(self, pagefunc, pagecount, pagesize):
        self._pagefunc = pagefunc
        self._pagecount = pagecount
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        """ Return the entries start..end (end excluded, None for all) """
        res = []
        start_page = start // self._pagesize
        # Never request pages past the known page count, even when end lies
        # beyond the last entry
        end_page = (
            self._pagecount if end is None
            else min(self._pagecount, end // self._pagesize + 1))
        # Entries to drop at the beginning of the first page
        skip_elems = start - start_page * self._pagesize
        # Entries still wanted (None means no limit)
        only_more = None if end is None else end - start
        for pagenum in range(start_page, end_page):
            page = list(self._pagefunc(pagenum))
            if skip_elems:
                page = page[skip_elems:]
                skip_elems = None
            if only_more is not None:
                if len(page) < only_more:
                    only_more -= len(page)
                else:
                    # This page completes the requested range
                    page = page[:only_more]
                    res.extend(page)
                    break
            res.extend(page)
        return res
1354
1355
def uppercase_escape(s):
    """ Replace every \\UXXXXXXXX escape sequence (eight hex digits) in s by
        the character it denotes """
    decode_escape = codecs.getdecoder('unicode_escape')

    def expand(match):
        # The decoder returns (text, consumed length)
        return decode_escape(match.group(0))[0]

    return re.sub(r'\\U[0-9a-fA-F]{8}', expand, s)
1362
1363
def escape_rfc3986(s):
    """Percent-encode non-ASCII characters as suggested by RFC 3986,
    leaving '%' and the characters with a special meaning in URLs alone"""
    is_py2_text = sys.version_info < (3, 0) and isinstance(s, unicode)
    if is_py2_text:
        # On Python 2, text is turned into UTF-8 bytes before quoting
        s = s.encode('utf-8')
    safe_chars = b"%/;:@&=+$,!~*'()?#[]"
    return compat_urllib_parse.quote(s, safe_chars)
1369
1370
def escape_url(url):
    """Escape the path, params, query and fragment of url as suggested by
    RFC 3986; the other components are kept as they are"""
    parsed = compat_urllib_parse_urlparse(url)
    path = escape_rfc3986(parsed.path)
    params = escape_rfc3986(parsed.params)
    query = escape_rfc3986(parsed.query)
    fragment = escape_rfc3986(parsed.fragment)
    escaped = parsed._replace(
        path=path, params=params, query=query, fragment=fragment)
    return escaped.geturl()
1380
# struct_pack/struct_unpack: struct.pack/struct.unpack accepting text format
# strings. Start with the plain functions and replace them by wrappers only
# if the probe below shows that text formats are rejected.
struct_pack = struct.pack
struct_unpack = struct.unpack
try:
    struct.pack('!I', 0)
except TypeError:
    # In Python 2.6 (and some 2.7 versions), struct requires a bytes argument
    def struct_pack(spec, *args):
        fmt = spec.encode('ascii') if isinstance(spec, compat_str) else spec
        return struct.pack(fmt, *args)

    def struct_unpack(spec, *args):
        fmt = spec.encode('ascii') if isinstance(spec, compat_str) else spec
        return struct.unpack(fmt, *args)
1397
1398
def read_batch_urls(batch_fd):
    """ Read the list of URLs from the batch file object batch_fd.

    Every line is one URL; byte lines are decoded as UTF-8. A leading byte
    order mark and surrounding whitespace are removed, and empty lines as
    well as lines starting with '#', ';' or ']' are skipped. batch_fd is
    closed afterwards.
    """
    # '\ufeff' is the byte order mark after proper decoding; '\xef\xbb\xbf'
    # is what its UTF-8 bytes look like when read with a single-byte
    # encoding such as Latin-1. Neither is removed by strip().
    BOMS = ('\xef\xbb\xbf', '\ufeff')

    def fixup(url):
        if not isinstance(url, compat_str):
            url = url.decode('utf-8', 'replace')
        for bom in BOMS:
            if url.startswith(bom):
                url = url[len(bom):]
                break
        url = url.strip()
        if url.startswith(('#', ';', ']')):
            return False  # Comment line
        return url

    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]
1413
1414
def urlencode_postdata(*args, **kargs):
    """ Like urlencode (all arguments are passed through), but returning
        the result as ASCII-encoded bytes """
    encoded = compat_urllib_parse.urlencode(*args, **kargs)
    return encoded.encode('ascii')
1417
1418
# etree_iter(node): iterate over the elements of a tree. Element.iter is
# used where it exists.
etree_iter = getattr(xml.etree.ElementTree.Element, 'iter', None)
if etree_iter is None:  # Python <=2.6
    def etree_iter(n):
        return n.findall('.//*')
1423
1424
def parse_xml(s):
    """ Parse the XML document in the text string s and return its root
        element; doctype declarations are ignored """
    class TreeBuilder(xml.etree.ElementTree.TreeBuilder):
        def doctype(self, name, pubid, system):
            pass  # Ignore doctypes

    parser = xml.etree.ElementTree.XMLParser(target=TreeBuilder())
    # The custom parser is only passed along on Python 2.7 and newer
    kwargs = {}
    if sys.version_info >= (2, 7):
        kwargs['parser'] = parser
    tree = xml.etree.ElementTree.XML(s.encode('utf-8'), **kwargs)
    # Fix up XML parser in Python 2.x
    if sys.version_info < (3, 0):
        for node in etree_iter(tree):
            text = node.text
            if text is not None and not isinstance(text, compat_str):
                node.text = text.decode('utf-8')
    return tree
1440
1441
# Age limit for each rating label. parse_age_limit falls back to this table
# for strings that are not a plain number.
# NOTE(review): the keys look like US film ratings (MPAA style) - confirm.
US_RATINGS = {
    'G': 0,
    'PG': 10,
    'PG-13': 13,
    'R': 16,
    'NC': 18,
}
1449
1450
def parse_age_limit(s):
    """ Convert an age limit string into an int.

    s is either a one- or two-digit number, optionally followed by '+'
    ('18+'), or one of the labels of US_RATINGS. Returns None if s is None
    or not recognized.
    """
    if s is None:
        return None
    match = re.match(r'^(?P<age>\d{1,2})\+?$', s)
    if match:
        return int(match.group('age'))
    return US_RATINGS.get(s, None)
1456
1457
def strip_jsonp(code):
    """ Remove the callback wrapper of a JSONP response, callback(...);
        possibly followed by // comments, leaving only the payload """
    jsonp_wrapper = r'(?s)^[a-zA-Z0-9_]+\s*\(\s*(.*)\);?\s*?(?://[^\n]*)*$'
    return re.sub(jsonp_wrapper, r'\1', code)
1461
1462
def js_to_json(code):
    """ Turn a JavaScript object literal into JSON: single-quoted strings
        and bare identifiers become double-quoted strings, and a comma
        right before a closing ']' is dropped """
    # How the contents of a single-quoted string have to be rewritten to be
    # valid inside double quotes
    requote = {
        '\\\\': '\\\\',
        "\\'": "'",
        '"': '\\"',
    }

    def fix_kv(m):
        token = m.group(0)
        # JSON keywords and double-quoted strings are fine as they are
        if token in ('true', 'false', 'null') or token.startswith('"'):
            return token
        if token.startswith("'"):
            token = re.sub(
                r"\\\\|\\'|\"",
                lambda esc: requote[esc.group(0)],
                token[1:-1])
        return '"%s"' % token

    res = re.sub(r'''(?x)
        "(?:[^"\\]*(?:\\\\|\\")?)*"|
        '(?:[^'\\]*(?:\\\\|\\')?)*'|
        [a-zA-Z_][a-zA-Z_0-9]*
        ''', fix_kv, code)
    # Trailing commas in arrays are not allowed in JSON
    return re.sub(r',(\s*\])', r'\1', res)
1486
1487
def qualities(quality_ids):
    """ Build a function that rates a quality id by its position in
        quality_ids; ids that are not listed are rated -1 """
    def q(qid):
        try:
            rank = quality_ids.index(qid)
        except ValueError:
            rank = -1
        return rank
    return q
1496
1497
# %-style template with the named fields title, id and ext.
# NOTE(review): presumably the default template of output file names - confirm.
DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
1499
1500
def limit_length(s, length):
    """ Add ellipses to overly long strings.

    Strings of up to length characters are returned unchanged, longer ones
    are cut so that the result, ellipses included, is exactly length
    characters long. Returns None if s is None.
    """
    if s is None:
        return None
    ELLIPSES = '...'
    if len(s) <= length:
        return s
    if length <= len(ELLIPSES):
        # No room for any text: a negative slice bound would keep most of
        # the string and exceed the limit, so return (part of) the ellipses
        return ELLIPSES[:max(length, 0)]
    return s[:length - len(ELLIPSES)] + ELLIPSES
1509
1510
def version_tuple(v):
    """ Split a version string at '.' and '-' into a tuple of ints """
    components = re.split(r'[-.]', v)
    return tuple(map(int, components))
1513
1514
def is_outdated_version(version, limit, assume_new=True):
    """ Tell whether version is older than limit.

    If version is empty or one of the versions cannot be parsed, the answer
    follows assume_new: the version is then considered recent enough
    (False) when assume_new is true, and outdated (True) otherwise.
    """
    fallback = not assume_new
    if not version:
        return fallback
    try:
        current, minimum = version_tuple(version), version_tuple(limit)
    except ValueError:
        return fallback
    return current < minimum
1522
1523
def ytdl_is_updateable():
    """ Returns if youtube-dl can be updated with -U """
    from zipimport import zipimporter

    # Either this module was loaded by a zipimporter ...
    loader = globals().get('__loader__')
    if isinstance(loader, zipimporter):
        return True
    # ... or sys carries a 'frozen' attribute
    return hasattr(sys, 'frozen')
1529
1530
def args_to_str(args):
    """ Get a short string representation for a subprocess command """
    quoted = [shlex_quote(a) for a in args]
    return ' '.join(quoted)