youtube_dl/utils.py

   1 #!/usr/bin/env python
   2 # -*- coding: utf-8 -*-
   3
   4 from __future__ import unicode_literals
   5
   6 import base64
   7 import calendar
   8 import codecs
   9 import contextlib
  10 import ctypes
  11 import datetime
  12 import email.utils
  13 import errno
  14 import functools
  15 import gzip
  16 import itertools
  17 import io
  18 import json
  19 import locale
  20 import math
  21 import operator
  22 import os
  23 import pipes
  24 import platform
  25 import re
  26 import ssl
  27 import socket
  28 import struct
  29 import subprocess
  30 import sys
  31 import tempfile
  32 import traceback
  33 import xml.etree.ElementTree
  34 import zlib
  35
  36 from .compat import (
  37     compat_basestring,
  38     compat_chr,
  39     compat_etree_fromstring,
  40     compat_html_entities,
  41     compat_http_client,
  42     compat_kwargs,
  43     compat_parse_qs,
  44     compat_socket_create_connection,
  45     compat_str,
  46     compat_urllib_error,
  47     compat_urllib_parse,
  48     compat_urllib_parse_urlparse,
  49     compat_urllib_request,
  50     compat_urlparse,
  51     shlex_quote,
  52 )
  53
  54
# This is not clearly defined otherwise
# (obtained here as the type of the object returned by re.compile)
compiled_regex_type = type(re.compile(''))

# Default HTTP headers. YoutubeDLHandler.http_request adds each of these to
# an outgoing request that does not already carry the header.
std_headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20150101 Firefox/20.0 (Chrome)',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
}


# Unique sentinel used as the default value of "default" parameters, so that
# "no default given" can be told apart from an explicit None.
NO_DEFAULT = object()

# English month names in calendar order (index 0 is January)
ENGLISH_MONTH_NAMES = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December']
  72
  73
  74 def preferredencoding():
  75     """Get preferred encoding.
  76
  77     Returns the best encoding scheme for the system, based on
  78     locale.getpreferredencoding() and some further tweaks.
  79     """
  80     try:
  81         pref = locale.getpreferredencoding()
  82         'TEST'.encode(pref)
  83     except Exception:
  84         pref = 'UTF-8'
  85
  86     return pref
  87
  88
  89 def write_json_file(obj, fn):
  90     """ Encode obj as JSON and write it to fn, atomically if possible """
  91
  92     fn = encodeFilename(fn)
  93     if sys.version_info < (3, 0) and sys.platform != 'win32':
  94         encoding = get_filesystem_encoding()
  95         # os.path.basename returns a bytes object, but NamedTemporaryFile
  96         # will fail if the filename contains non ascii characters unless we
  97         # use a unicode object
  98         path_basename = lambda f: os.path.basename(fn).decode(encoding)
  99         # the same for os.path.dirname
 100         path_dirname = lambda f: os.path.dirname(fn).decode(encoding)
 101     else:
 102         path_basename = os.path.basename
 103         path_dirname = os.path.dirname
 104
 105     args = {
 106         'suffix': '.tmp',
 107         'prefix': path_basename(fn) + '.',
 108         'dir': path_dirname(fn),
 109         'delete': False,
 110     }
 111
 112     # In Python 2.x, json.dump expects a bytestream.
 113     # In Python 3.x, it writes to a character stream
 114     if sys.version_info < (3, 0):
 115         args['mode'] = 'wb'
 116     else:
 117         args.update({
 118             'mode': 'w',
 119             'encoding': 'utf-8',
 120         })
 121
 122     tf = tempfile.NamedTemporaryFile(**compat_kwargs(args))
 123
 124     try:
 125         with tf:
 126             json.dump(obj, tf)
 127         if sys.platform == 'win32':
 128             # Need to remove existing file on Windows, else os.rename raises
 129             # WindowsError or FileExistsError.
 130             try:
 131                 os.unlink(fn)
 132             except OSError:
 133                 pass
 134         os.rename(tf.name, fn)
 135     except Exception:
 136         try:
 137             os.remove(tf.name)
 138         except OSError:
 139             pass
 140         raise
 141
 142
 143 if sys.version_info >= (2, 7):
 144     def find_xpath_attr(node, xpath, key, val=None):
 145         """ Find the xpath xpath[@key=val] """
 146         assert re.match(r'^[a-zA-Z_-]+$', key)
 147         if val:
 148             assert re.match(r'^[a-zA-Z0-9@\s:._-]*$', val)
 149         expr = xpath + ('[@%s]' % key if val is None else "[@%s='%s']" % (key, val))
 150         return node.find(expr)
 151 else:
 152     def find_xpath_attr(node, xpath, key, val=None):
 153         # Here comes the crazy part: In 2.6, if the xpath is a unicode,
 154         # .//node does not match if a node is a direct child of . !
 155         if isinstance(xpath, compat_str):
 156             xpath = xpath.encode('ascii')
 157
 158         for f in node.findall(xpath):
 159             if key not in f.attrib:
 160                 continue
 161             if val is None or f.attrib.get(key) == val:
 162                 return f
 163         return None
 164
 165 # On python2.6 the xml.etree.ElementTree.Element methods don't support
 166 # the namespace parameter
 167
 168
 169 def xpath_with_ns(path, ns_map):
 170     components = [c.split(':') for c in path.split('/')]
 171     replaced = []
 172     for c in components:
 173         if len(c) == 1:
 174             replaced.append(c[0])
 175         else:
 176             ns, tag = c
 177             replaced.append('{%s}%s' % (ns_map[ns], tag))
 178     return '/'.join(replaced)
 179
 180
 181 def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
 182     def _find_xpath(xpath):
 183         if sys.version_info < (2, 7):  # Crazy 2.6
 184             xpath = xpath.encode('ascii')
 185         return node.find(xpath)
 186
 187     if isinstance(xpath, (str, compat_str)):
 188         n = _find_xpath(xpath)
 189     else:
 190         for xp in xpath:
 191             n = _find_xpath(xp)
 192             if n is not None:
 193                 break
 194
 195     if n is None:
 196         if default is not NO_DEFAULT:
 197             return default
 198         elif fatal:
 199             name = xpath if name is None else name
 200             raise ExtractorError('Could not find XML element %s' % name)
 201         else:
 202             return None
 203     return n
 204
 205
 206 def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
 207     n = xpath_element(node, xpath, name, fatal=fatal, default=default)
 208     if n is None or n == default:
 209         return n
 210     if n.text is None:
 211         if default is not NO_DEFAULT:
 212             return default
 213         elif fatal:
 214             name = xpath if name is None else name
 215             raise ExtractorError('Could not find XML element\'s text %s' % name)
 216         else:
 217             return None
 218     return n.text
 219
 220
 221 def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
 222     n = find_xpath_attr(node, xpath, key)
 223     if n is None:
 224         if default is not NO_DEFAULT:
 225             return default
 226         elif fatal:
 227             name = '%s[@%s]' % (xpath, key) if name is None else name
 228             raise ExtractorError('Could not find XML attribute %s' % name)
 229         else:
 230             return None
 231     return n.attrib[key]
 232
 233
 234 def get_element_by_id(id, html):
 235     """Return the content of the tag with the specified ID in the passed HTML document"""
 236     return get_element_by_attribute("id", id, html)
 237
 238
 239 def get_element_by_attribute(attribute, value, html):
 240     """Return the content of the tag with the specified attribute in the passed HTML document"""
 241
 242     m = re.search(r'''(?xs)
 243         <([a-zA-Z0-9:._-]+)
 244          (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
 245          \s+%s=['"]?%s['"]?
 246          (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
 247         \s*>
 248         (?P<content>.*?)
 249         </\1>
 250     ''' % (re.escape(attribute), re.escape(value)), html)
 251
 252     if not m:
 253         return None
 254     res = m.group('content')
 255
 256     if res.startswith('"') or res.startswith("'"):
 257         res = res[1:-1]
 258
 259     return unescapeHTML(res)
 260
 261
 262 def clean_html(html):
 263     """Clean an HTML snippet into a readable string"""
 264
 265     if html is None:  # Convenience for sanitizing descriptions etc.
 266         return html
 267
 268     # Newline vs <br />
 269     html = html.replace('\n', ' ')
 270     html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
 271     html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
 272     # Strip html tags
 273     html = re.sub('<.*?>', '', html)
 274     # Replace html entities
 275     html = unescapeHTML(html)
 276     return html.strip()
 277
 278
 279 def sanitize_open(filename, open_mode):
 280     """Try to open the given filename, and slightly tweak it if this fails.
 281
 282     Attempts to open the given filename. If this fails, it tries to change
 283     the filename slightly, step by step, until it's either able to open it
 284     or it fails and raises a final exception, like the standard open()
 285     function.
 286
 287     It returns the tuple (stream, definitive_file_name).
 288     """
 289     try:
 290         if filename == '-':
 291             if sys.platform == 'win32':
 292                 import msvcrt
 293                 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
 294             return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
 295         stream = open(encodeFilename(filename), open_mode)
 296         return (stream, filename)
 297     except (IOError, OSError) as err:
 298         if err.errno in (errno.EACCES,):
 299             raise
 300
 301         # In case of error, try to remove win32 forbidden chars
 302         alt_filename = sanitize_path(filename)
 303         if alt_filename == filename:
 304             raise
 305         else:
 306             # An exception here should be caught in the caller
 307             stream = open(encodeFilename(alt_filename), open_mode)
 308             return (stream, alt_filename)
 309
 310
 311 def timeconvert(timestr):
 312     """Convert RFC 2822 defined time string into system timestamp"""
 313     timestamp = None
 314     timetuple = email.utils.parsedate_tz(timestr)
 315     if timetuple is not None:
 316         timestamp = email.utils.mktime_tz(timetuple)
 317     return timestamp
 318
 319
 320 def sanitize_filename(s, restricted=False, is_id=False):
 321     """Sanitizes a string so it could be used as part of a filename.
 322     If restricted is set, use a stricter subset of allowed characters.
 323     Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
 324     """
 325     def replace_insane(char):
 326         if char == '?' or ord(char) < 32 or ord(char) == 127:
 327             return ''
 328         elif char == '"':
 329             return '' if restricted else '\''
 330         elif char == ':':
 331             return '_-' if restricted else ' -'
 332         elif char in '\\/|*<>':
 333             return '_'
 334         if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
 335             return '_'
 336         if restricted and ord(char) > 127:
 337             return '_'
 338         return char
 339
 340     # Handle timestamps
 341     s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)
 342     result = ''.join(map(replace_insane, s))
 343     if not is_id:
 344         while '__' in result:
 345             result = result.replace('__', '_')
 346         result = result.strip('_')
 347         # Common case of "Foreign band name - English song title"
 348         if restricted and result.startswith('-_'):
 349             result = result[2:]
 350         if result.startswith('-'):
 351             result = '_' + result[len('-'):]
 352         result = result.lstrip('.')
 353         if not result:
 354             result = '_'
 355     return result
 356
 357
 358 def sanitize_path(s):
 359     """Sanitizes and normalizes path on Windows"""
 360     if sys.platform != 'win32':
 361         return s
 362     drive_or_unc, _ = os.path.splitdrive(s)
 363     if sys.version_info < (2, 7) and not drive_or_unc:
 364         drive_or_unc, _ = os.path.splitunc(s)
 365     norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
 366     if drive_or_unc:
 367         norm_path.pop(0)
 368     sanitized_path = [
 369         path_part if path_part in ['.', '..'] else re.sub('(?:[/<>:"\\|\\\\?\\*]|[\s.]$)', '#', path_part)
 370         for path_part in norm_path]
 371     if drive_or_unc:
 372         sanitized_path.insert(0, drive_or_unc + os.path.sep)
 373     return os.path.join(*sanitized_path)
 374
 375
 376 def orderedSet(iterable):
 377     """ Remove all duplicates from the input iterable """
 378     res = []
 379     for el in iterable:
 380         if el not in res:
 381             res.append(el)
 382     return res
 383
 384
 385 def _htmlentity_transform(entity):
 386     """Transforms an HTML entity to a character."""
 387     # Known non-numeric HTML entity
 388     if entity in compat_html_entities.name2codepoint:
 389         return compat_chr(compat_html_entities.name2codepoint[entity])
 390
 391     mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
 392     if mobj is not None:
 393         numstr = mobj.group(1)
 394         if numstr.startswith('x'):
 395             base = 16
 396             numstr = '0%s' % numstr
 397         else:
 398             base = 10
 399         return compat_chr(int(numstr, base))
 400
 401     # Unknown entity in name, return its literal representation
 402     return ('&%s;' % entity)
 403
 404
 405 def unescapeHTML(s):
 406     if s is None:
 407         return None
 408     assert type(s) == compat_str
 409
 410     return re.sub(
 411         r'&([^;]+);', lambda m: _htmlentity_transform(m.group(1)), s)
 412
 413
 414 def get_subprocess_encoding():
 415     if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
 416         # For subprocess calls, encode with locale encoding
 417         # Refer to http://stackoverflow.com/a/9951851/35070
 418         encoding = preferredencoding()
 419     else:
 420         encoding = sys.getfilesystemencoding()
 421     if encoding is None:
 422         encoding = 'utf-8'
 423     return encoding
 424
 425
 426 def encodeFilename(s, for_subprocess=False):
 427     """
 428     @param s The name of the file
 429     """
 430
 431     assert type(s) == compat_str
 432
 433     # Python 3 has a Unicode API
 434     if sys.version_info >= (3, 0):
 435         return s
 436
 437     # Pass '' directly to use Unicode APIs on Windows 2000 and up
 438     # (Detecting Windows NT 4 is tricky because 'major >= 4' would
 439     # match Windows 9x series as well. Besides, NT 4 is obsolete.)
 440     if not for_subprocess and sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
 441         return s
 442
 443     return s.encode(get_subprocess_encoding(), 'ignore')
 444
 445
 446 def decodeFilename(b, for_subprocess=False):
 447
 448     if sys.version_info >= (3, 0):
 449         return b
 450
 451     if not isinstance(b, bytes):
 452         return b
 453
 454     return b.decode(get_subprocess_encoding(), 'ignore')
 455
 456
 457 def encodeArgument(s):
 458     if not isinstance(s, compat_str):
 459         # Legacy code that uses byte strings
 460         # Uncomment the following line after fixing all post processors
 461         # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
 462         s = s.decode('ascii')
 463     return encodeFilename(s, True)
 464
 465
 466 def decodeArgument(b):
 467     return decodeFilename(b, True)
 468
 469
 470 def decodeOption(optval):
 471     if optval is None:
 472         return optval
 473     if isinstance(optval, bytes):
 474         optval = optval.decode(preferredencoding())
 475
 476     assert isinstance(optval, compat_str)
 477     return optval
 478
 479
 480 def formatSeconds(secs):
 481     if secs > 3600:
 482         return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
 483     elif secs > 60:
 484         return '%d:%02d' % (secs // 60, secs % 60)
 485     else:
 486         return '%d' % secs
 487
 488
 489 def make_HTTPS_handler(params, **kwargs):
 490     opts_no_check_certificate = params.get('nocheckcertificate', False)
 491     if hasattr(ssl, 'create_default_context'):  # Python >= 3.4 or 2.7.9
 492         context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH)
 493         if opts_no_check_certificate:
 494             context.check_hostname = False
 495             context.verify_mode = ssl.CERT_NONE
 496         try:
 497             return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
 498         except TypeError:
 499             # Python 2.7.8
 500             # (create_default_context present but HTTPSHandler has no context=)
 501             pass
 502
 503     if sys.version_info < (3, 2):
 504         return YoutubeDLHTTPSHandler(params, **kwargs)
 505     else:  # Python < 3.4
 506         context = ssl.SSLContext(ssl.PROTOCOL_TLSv1)
 507         context.verify_mode = (ssl.CERT_NONE
 508                                if opts_no_check_certificate
 509                                else ssl.CERT_REQUIRED)
 510         context.set_default_verify_paths()
 511         return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
 512
 513
 514 def bug_reports_message():
 515     if ytdl_is_updateable():
 516         update_cmd = 'type  youtube-dl -U  to update'
 517     else:
 518         update_cmd = 'see  https://yt-dl.org/update  on how to update'
 519     msg = '; please report this issue on https://yt-dl.org/bug .'
 520     msg += ' Make sure you are using the latest version; %s.' % update_cmd
 521     msg += ' Be sure to call youtube-dl with the --verbose flag and include its complete output.'
 522     return msg
 523
 524
 525 class ExtractorError(Exception):
 526     """Error during info extraction."""
 527
 528     def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
 529         """ tb, if given, is the original traceback (so that it can be printed out).
 530         If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
 531         """
 532
 533         if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
 534             expected = True
 535         if video_id is not None:
 536             msg = video_id + ': ' + msg
 537         if cause:
 538             msg += ' (caused by %r)' % cause
 539         if not expected:
 540             msg += bug_reports_message()
 541         super(ExtractorError, self).__init__(msg)
 542
 543         self.traceback = tb
 544         self.exc_info = sys.exc_info()  # preserve original exception
 545         self.cause = cause
 546         self.video_id = video_id
 547
 548     def format_traceback(self):
 549         if self.traceback is None:
 550             return None
 551         return ''.join(traceback.format_tb(self.traceback))
 552
 553
 554 class UnsupportedError(ExtractorError):
 555     def __init__(self, url):
 556         super(UnsupportedError, self).__init__(
 557             'Unsupported URL: %s' % url, expected=True)
 558         self.url = url
 559
 560
 561 class RegexNotFoundError(ExtractorError):
 562     """Error when a regex didn't match"""
 563     pass
 564
 565
 566 class DownloadError(Exception):
 567     """Download Error exception.
 568
 569     This exception may be thrown by FileDownloader objects if they are not
 570     configured to continue on errors. They will contain the appropriate
 571     error message.
 572     """
 573
 574     def __init__(self, msg, exc_info=None):
 575         """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
 576         super(DownloadError, self).__init__(msg)
 577         self.exc_info = exc_info
 578
 579
 580 class SameFileError(Exception):
 581     """Same File exception.
 582
 583     This exception will be thrown by FileDownloader objects if they detect
 584     multiple files would have to be downloaded to the same file on disk.
 585     """
 586     pass
 587
 588
 589 class PostProcessingError(Exception):
 590     """Post Processing exception.
 591
 592     This exception may be raised by PostProcessor's .run() method to
 593     indicate an error in the postprocessing task.
 594     """
 595
 596     def __init__(self, msg):
 597         self.msg = msg
 598
 599
 600 class MaxDownloadsReached(Exception):
 601     """ --max-downloads limit has been reached. """
 602     pass
 603
 604
 605 class UnavailableVideoError(Exception):
 606     """Unavailable Format exception.
 607
 608     This exception will be thrown when a video is requested
 609     in a format that is not available for that video.
 610     """
 611     pass
 612
 613
 614 class ContentTooShortError(Exception):
 615     """Content Too Short exception.
 616
 617     This exception may be raised by FileDownloader objects when a file they
 618     download is too small for what the server announced first, indicating
 619     the connection was probably interrupted.
 620     """
 621
 622     def __init__(self, downloaded, expected):
 623         # Both in bytes
 624         self.downloaded = downloaded
 625         self.expected = expected
 626
 627
 628 def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
 629     # Working around python 2 bug (see http://bugs.python.org/issue17849) by limiting
 630     # expected HTTP responses to meet HTTP/1.0 or later (see also
 631     # https://github.com/rg3/youtube-dl/issues/6727)
 632     if sys.version_info < (3, 0):
 633         kwargs[b'strict'] = True
 634     hc = http_class(*args, **kwargs)
 635     source_address = ydl_handler._params.get('source_address')
 636     if source_address is not None:
 637         sa = (source_address, 0)
 638         if hasattr(hc, 'source_address'):  # Python 2.7+
 639             hc.source_address = sa
 640         else:  # Python 2.6
 641             def _hc_connect(self, *args, **kwargs):
 642                 sock = compat_socket_create_connection(
 643                     (self.host, self.port), self.timeout, sa)
 644                 if is_https:
 645                     self.sock = ssl.wrap_socket(
 646                         sock, self.key_file, self.cert_file,
 647                         ssl_version=ssl.PROTOCOL_TLSv1)
 648                 else:
 649                     self.sock = sock
 650             hc.connect = functools.partial(_hc_connect, hc)
 651
 652     return hc
 653
 654
 655 class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
 656     """Handler for HTTP requests and responses.
 657
 658     This class, when installed with an OpenerDirector, automatically adds
 659     the standard headers to every HTTP request and handles gzipped and
 660     deflated responses from web servers. If compression is to be avoided in
 661     a particular request, the original request in the program code only has
 662     to include the HTTP header "Youtubedl-No-Compression", which will be
 663     removed before making the real request.
 664
 665     Part of this code was copied from:
 666
 667     http://techknack.net/python-urllib2-handlers/
 668
 669     Andrew Rowls, the author of that code, agreed to release it to the
 670     public domain.
 671     """
 672
 673     def __init__(self, params, *args, **kwargs):
 674         compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
 675         self._params = params
 676
 677     def http_open(self, req):
 678         return self.do_open(functools.partial(
 679             _create_http_connection, self, compat_http_client.HTTPConnection, False),
 680             req)
 681
 682     @staticmethod
 683     def deflate(data):
 684         try:
 685             return zlib.decompress(data, -zlib.MAX_WBITS)
 686         except zlib.error:
 687             return zlib.decompress(data)
 688
 689     @staticmethod
 690     def addinfourl_wrapper(stream, headers, url, code):
 691         if hasattr(compat_urllib_request.addinfourl, 'getcode'):
 692             return compat_urllib_request.addinfourl(stream, headers, url, code)
 693         ret = compat_urllib_request.addinfourl(stream, headers, url)
 694         ret.code = code
 695         return ret
 696
 697     def http_request(self, req):
 698         # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
 699         # always respected by websites, some tend to give out URLs with non percent-encoded
 700         # non-ASCII characters (see telemb.py, ard.py [#3412])
 701         # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
 702         # To work around aforementioned issue we will replace request's original URL with
 703         # percent-encoded one
 704         # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
 705         # the code of this workaround has been moved here from YoutubeDL.urlopen()
 706         url = req.get_full_url()
 707         url_escaped = escape_url(url)
 708
 709         # Substitute URL if any change after escaping
 710         if url != url_escaped:
 711             req_type = HEADRequest if req.get_method() == 'HEAD' else compat_urllib_request.Request
 712             new_req = req_type(
 713                 url_escaped, data=req.data, headers=req.headers,
 714                 origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
 715             new_req.timeout = req.timeout
 716             req = new_req
 717
 718         for h, v in std_headers.items():
 719             # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
 720             # The dict keys are capitalized because of this bug by urllib
 721             if h.capitalize() not in req.headers:
 722                 req.add_header(h, v)
 723         if 'Youtubedl-no-compression' in req.headers:
 724             if 'Accept-encoding' in req.headers:
 725                 del req.headers['Accept-encoding']
 726             del req.headers['Youtubedl-no-compression']
 727
 728         if sys.version_info < (2, 7) and '#' in req.get_full_url():
 729             # Python 2.6 is brain-dead when it comes to fragments
 730             req._Request__original = req._Request__original.partition('#')[0]
 731             req._Request__r_type = req._Request__r_type.partition('#')[0]
 732
 733         return req
 734
 735     def http_response(self, req, resp):
 736         old_resp = resp
 737         # gzip
 738         if resp.headers.get('Content-encoding', '') == 'gzip':
 739             content = resp.read()
 740             gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
 741             try:
 742                 uncompressed = io.BytesIO(gz.read())
 743             except IOError as original_ioerror:
 744                 # There may be junk add the end of the file
 745                 # See http://stackoverflow.com/q/4928560/35070 for details
 746                 for i in range(1, 1024):
 747                     try:
 748                         gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
 749                         uncompressed = io.BytesIO(gz.read())
 750                     except IOError:
 751                         continue
 752                     break
 753                 else:
 754                     raise original_ioerror
 755             resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
 756             resp.msg = old_resp.msg
 757         # deflate
 758         if resp.headers.get('Content-encoding', '') == 'deflate':
 759             gz = io.BytesIO(self.deflate(resp.read()))
 760             resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
 761             resp.msg = old_resp.msg
 762         # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
 763         # https://github.com/rg3/youtube-dl/issues/6457).
 764         if 300 <= resp.code < 400:
 765             location = resp.headers.get('Location')
 766             if location:
 767                 # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
 768                 if sys.version_info >= (3, 0):
 769                     location = location.encode('iso-8859-1').decode('utf-8')
 770                 location_escaped = escape_url(location)
 771                 if location != location_escaped:
 772                     del resp.headers['Location']
 773                     resp.headers['Location'] = location_escaped
 774         return resp
 775
 776     https_request = http_request
 777     https_response = http_response
 778
 779
 780 class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
 781     def __init__(self, params, https_conn_class=None, *args, **kwargs):
 782         compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
 783         self._https_conn_class = https_conn_class or compat_http_client.HTTPSConnection
 784         self._params = params
 785
 786     def https_open(self, req):
 787         kwargs = {}
 788         if hasattr(self, '_context'):  # python > 2.6
 789             kwargs['context'] = self._context
 790         if hasattr(self, '_check_hostname'):  # python 3.x
 791             kwargs['check_hostname'] = self._check_hostname
 792         return self.do_open(functools.partial(
 793             _create_http_connection, self, self._https_conn_class, True),
 794             req, **kwargs)
 795
 796
 797 class YoutubeDLCookieProcessor(compat_urllib_request.HTTPCookieProcessor):
 798     def __init__(self, cookiejar=None):
 799         compat_urllib_request.HTTPCookieProcessor.__init__(self, cookiejar)
 800
 801     def http_response(self, request, response):
 802         # Python 2 will choke on next HTTP request in row if there are non-ASCII
 803         # characters in Set-Cookie HTTP header of last response (see
 804         # https://github.com/rg3/youtube-dl/issues/6769).
 805         # In order to at least prevent crashing we will percent encode Set-Cookie
 806         # header before HTTPCookieProcessor starts processing it.
 807         # if sys.version_info < (3, 0) and response.headers:
 808         #     for set_cookie_header in ('Set-Cookie', 'Set-Cookie2'):
 809         #         set_cookie = response.headers.get(set_cookie_header)
 810         #         if set_cookie:
 811         #             set_cookie_escaped = compat_urllib_parse.quote(set_cookie, b"%/;:@&=+$,!~*'()?#[] ")
 812         #             if set_cookie != set_cookie_escaped:
 813         #                 del response.headers[set_cookie_header]
 814         #                 response.headers[set_cookie_header] = set_cookie_escaped
 815         return compat_urllib_request.HTTPCookieProcessor.http_response(self, request, response)
 816
 817     https_request = compat_urllib_request.HTTPCookieProcessor.http_request
 818     https_response = http_response
 819
 820
 821 def parse_iso8601(date_str, delimiter='T', timezone=None):
 822     """ Return a UNIX timestamp from the given date """
 823
 824     if date_str is None:
 825         return None
 826
 827     date_str = re.sub(r'\.[0-9]+', '', date_str)
 828
 829     if timezone is None:
 830         m = re.search(
 831             r'(?:Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
 832             date_str)
 833         if not m:
 834             timezone = datetime.timedelta()
 835         else:
 836             date_str = date_str[:-len(m.group(0))]
 837             if not m.group('sign'):
 838                 timezone = datetime.timedelta()
 839             else:
 840                 sign = 1 if m.group('sign') == '+' else -1
 841                 timezone = datetime.timedelta(
 842                     hours=sign * int(m.group('hours')),
 843                     minutes=sign * int(m.group('minutes')))
 844     try:
 845         date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
 846         dt = datetime.datetime.strptime(date_str, date_format) - timezone
 847         return calendar.timegm(dt.timetuple())
 848     except ValueError:
 849         pass
 850
 851
 852 def unified_strdate(date_str, day_first=True):
 853     """Return a string with the date in the format YYYYMMDD"""
 854
 855     if date_str is None:
 856         return None
 857     upload_date = None
 858     # Replace commas
 859     date_str = date_str.replace(',', ' ')
 860     # %z (UTC offset) is only supported in python>=3.2
 861     if not re.match(r'^[0-9]{1,2}-[0-9]{1,2}-[0-9]{4}$', date_str):
 862         date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str)
 863     # Remove AM/PM + timezone
 864     date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
 865
 866     format_expressions = [
 867         '%d %B %Y',
 868         '%d %b %Y',
 869         '%B %d %Y',
 870         '%b %d %Y',
 871         '%b %dst %Y %I:%M%p',
 872         '%b %dnd %Y %I:%M%p',
 873         '%b %dth %Y %I:%M%p',
 874         '%Y %m %d',
 875         '%Y-%m-%d',
 876         '%Y/%m/%d',
 877         '%Y/%m/%d %H:%M:%S',
 878         '%Y-%m-%d %H:%M:%S',
 879         '%Y-%m-%d %H:%M:%S.%f',
 880         '%d.%m.%Y %H:%M',
 881         '%d.%m.%Y %H.%M',
 882         '%Y-%m-%dT%H:%M:%SZ',
 883         '%Y-%m-%dT%H:%M:%S.%fZ',
 884         '%Y-%m-%dT%H:%M:%S.%f0Z',
 885         '%Y-%m-%dT%H:%M:%S',
 886         '%Y-%m-%dT%H:%M:%S.%f',
 887         '%Y-%m-%dT%H:%M',
 888     ]
 889     if day_first:
 890         format_expressions.extend([
 891             '%d-%m-%Y',
 892             '%d.%m.%Y',
 893             '%d/%m/%Y',
 894             '%d/%m/%y',
 895             '%d/%m/%Y %H:%M:%S',
 896         ])
 897     else:
 898         format_expressions.extend([
 899             '%m-%d-%Y',
 900             '%m.%d.%Y',
 901             '%m/%d/%Y',
 902             '%m/%d/%y',
 903             '%m/%d/%Y %H:%M:%S',
 904         ])
 905     for expression in format_expressions:
 906         try:
 907             upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
 908         except ValueError:
 909             pass
 910     if upload_date is None:
 911         timetuple = email.utils.parsedate_tz(date_str)
 912         if timetuple:
 913             upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
 914     return compat_str(upload_date)
 915
 916
 917 def determine_ext(url, default_ext='unknown_video'):
 918     if url is None:
 919         return default_ext
 920     guess = url.partition('?')[0].rpartition('.')[2]
 921     if re.match(r'^[A-Za-z0-9]+$', guess):
 922         return guess
 923     else:
 924         return default_ext
 925
 926
 927 def subtitles_filename(filename, sub_lang, sub_format):
 928     return filename.rsplit('.', 1)[0] + '.' + sub_lang + '.' + sub_format
 929
 930
 931 def date_from_str(date_str):
 932     """
 933     Return a datetime object from a string in the format YYYYMMDD or
 934     (now|today)[+-][0-9](day|week|month|year)(s)?"""
 935     today = datetime.date.today()
 936     if date_str in ('now', 'today'):
 937         return today
 938     if date_str == 'yesterday':
 939         return today - datetime.timedelta(days=1)
 940     match = re.match('(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
 941     if match is not None:
 942         sign = match.group('sign')
 943         time = int(match.group('time'))
 944         if sign == '-':
 945             time = -time
 946         unit = match.group('unit')
 947         # A bad aproximation?
 948         if unit == 'month':
 949             unit = 'day'
 950             time *= 30
 951         elif unit == 'year':
 952             unit = 'day'
 953             time *= 365
 954         unit += 's'
 955         delta = datetime.timedelta(**{unit: time})
 956         return today + delta
 957     return datetime.datetime.strptime(date_str, "%Y%m%d").date()
 958
 959
 960 def hyphenate_date(date_str):
 961     """
 962     Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
 963     match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
 964     if match is not None:
 965         return '-'.join(match.groups())
 966     else:
 967         return date_str
 968
 969
 970 class DateRange(object):
 971     """Represents a time interval between two dates"""
 972
 973     def __init__(self, start=None, end=None):
 974         """start and end must be strings in the format accepted by date"""
 975         if start is not None:
 976             self.start = date_from_str(start)
 977         else:
 978             self.start = datetime.datetime.min.date()
 979         if end is not None:
 980             self.end = date_from_str(end)
 981         else:
 982             self.end = datetime.datetime.max.date()
 983         if self.start > self.end:
 984             raise ValueError('Date range: "%s" , the start date must be before the end date' % self)
 985
 986     @classmethod
 987     def day(cls, day):
 988         """Returns a range that only contains the given day"""
 989         return cls(day, day)
 990
 991     def __contains__(self, date):
 992         """Check if the date is in the range"""
 993         if not isinstance(date, datetime.date):
 994             date = date_from_str(date)
 995         return self.start <= date <= self.end
 996
 997     def __str__(self):
 998         return '%s - %s' % (self.start.isoformat(), self.end.isoformat())
 999
1000
def platform_name():
    """ Returns the platform name as a compat_str """
    name = platform.platform()
    # Decode byte strings with the preferred encoding
    if isinstance(name, bytes):
        name = name.decode(preferredencoding())

    assert isinstance(name, compat_str)
    return name
1009
1010
def _windows_write_string(s, out):
    """ Returns True if the string was written using special methods,
    False if it has yet to be written out.

    s is the text to write and out the output stream. The special method
    is the WriteConsoleW function of kernel32; it is only used if out has
    file descriptor 1 or 2 and the matching standard handle is a console.
    Raises OSError if WriteConsoleW reports a failure."""
    # Adapted from http://stackoverflow.com/a/3259271/35070

    import ctypes
    import ctypes.wintypes

    # File descriptor -> argument for GetStdHandle
    # (-11 is STD_OUTPUT_HANDLE, -12 is STD_ERROR_HANDLE)
    WIN_OUTPUT_IDS = {
        1: -11,
        2: -12,
    }

    try:
        fileno = out.fileno()
    except AttributeError:
        # If the output stream doesn't have a fileno, it's virtual
        return False
    except io.UnsupportedOperation:
        # Some strange Windows pseudo files?
        return False
    if fileno not in WIN_OUTPUT_IDS:
        return False

    GetStdHandle = ctypes.WINFUNCTYPE(
        ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
        (b"GetStdHandle", ctypes.windll.kernel32))
    h = GetStdHandle(WIN_OUTPUT_IDS[fileno])

    WriteConsoleW = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
        ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
        ctypes.wintypes.LPVOID)((b"WriteConsoleW", ctypes.windll.kernel32))
    # Receives the number of characters written by each WriteConsoleW call
    written = ctypes.wintypes.DWORD(0)

    GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)((b"GetFileType", ctypes.windll.kernel32))
    FILE_TYPE_CHAR = 0x0002
    FILE_TYPE_REMOTE = 0x8000
    GetConsoleMode = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
        ctypes.POINTER(ctypes.wintypes.DWORD))(
        (b"GetConsoleMode", ctypes.windll.kernel32))
    INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value

    def not_a_console(handle):
        """Return True unless handle is a character device for which
        GetConsoleMode succeeds."""
        if handle == INVALID_HANDLE_VALUE or handle is None:
            return True
        return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR or
                GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)

    if not_a_console(h):
        return False

    def next_nonbmp_pos(s):
        """Return the index of the first character above U+FFFF in s, or
        len(s) if there is none."""
        try:
            return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
        except StopIteration:
            return len(s)

    while s:
        # Write at most 1024 characters at once and stop in front of the
        # next non-BMP character
        count = min(next_nonbmp_pos(s), 1024)

        # count == 0 means s starts with a non-BMP character, for which a
        # length of 2 is passed instead
        ret = WriteConsoleW(
            h, s, count if count else 2, ctypes.byref(written), None)
        if ret == 0:
            raise OSError('Failed to write string')
        if not count:  # We just wrote a non-BMP character
            assert written.value == 2
            s = s[1:]
        else:
            assert written.value > 0
            # Continue after whatever has actually been written
            s = s[written.value:]
    return True
1084
1085
def write_string(s, out=None, encoding=None):
    """Write the text s to out (sys.stderr by default) and flush it.

    On Windows, if no encoding is requested and out has a fileno,
    _windows_write_string is tried first. Otherwise s is encoded (ignoring
    unencodable characters) for byte streams and for streams exposing a
    'buffer', and written as is to any other stream.
    """
    if out is None:
        out = sys.stderr
    assert type(s) == compat_str

    try_console_api = (
        sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'))
    if try_console_api and _windows_write_string(s, out):
        return

    # Python 2 lies about mode of sys.stderr
    expects_bytes = (
        'b' in getattr(out, 'mode', '') or sys.version_info[0] < 3)
    if expects_bytes:
        out.write(s.encode(encoding or preferredencoding(), 'ignore'))
    elif hasattr(out, 'buffer'):
        stream_encoding = (
            encoding or getattr(out, 'encoding', None) or preferredencoding())
        out.buffer.write(s.encode(stream_encoding, 'ignore'))
    else:
        out.write(s)
    out.flush()
1106
1107
def bytes_to_intlist(bs):
    """Convert a byte string to a list of integer byte values."""
    if not bs:
        return []
    if not isinstance(bs[0], int):
        # Items are one-character strings here, convert each of them
        return [ord(char) for char in bs]
    # Python 3: iterating over bytes already yields integers
    return list(bs)
1115
1116
def intlist_to_bytes(xs):
    """Pack a list of integer byte values into a byte string."""
    return struct_pack('%dB' % len(xs), *xs) if xs else b''
1121
1122
# Cross-platform file locking
# Both branches define _lock_file(f, exclusive) and _unlock_file(f):
# via LockFileEx/UnlockFileEx of kernel32 on Windows, via fcntl.flock elsewhere
if sys.platform == 'win32':
    import ctypes.wintypes
    import msvcrt

    class OVERLAPPED(ctypes.Structure):
        # ctypes declaration of the structure passed to LockFileEx and
        # UnlockFileEx as their last argument
        _fields_ = [
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),
        ]

    kernel32 = ctypes.windll.kernel32
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    # Low and high part of the byte count handed to LockFileEx/UnlockFileEx
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive):
        """Lock the file object f with LockFileEx (flag 0x2 if exclusive,
        0x0 otherwise). Raises OSError if the call fails."""
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        # Kept on the file object because _unlock_file passes it again
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)
        handle = msvcrt.get_osfhandle(f.fileno())
        if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
                          whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Locking file failed: %r' % ctypes.FormatError())

    def _unlock_file(f):
        """Unlock a file object previously locked with _lock_file.
        Raises OSError if UnlockFileEx fails."""
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        if not UnlockFileEx(handle, 0,
                            whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())

else:
    import fcntl

    def _lock_file(f, exclusive):
        """Lock f with flock: LOCK_EX if exclusive, LOCK_SH otherwise."""
        fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)

    def _unlock_file(f):
        """Release the flock lock held on f."""
        fcntl.flock(f, fcntl.LOCK_UN)
1186
1187
class locked_file(object):
    """Context manager around a file that is locked while the block runs.

    The file is opened on construction; entering the context locks it
    (exclusively unless the mode is 'r'), leaving the context unlocks and
    closes it. Iteration, read() and write() are forwarded to the file.
    """

    def __init__(self, filename, mode, encoding=None):
        assert mode in ['r', 'a', 'w']
        self.f = io.open(filename, mode, encoding=encoding)
        self.mode = mode

    def __enter__(self):
        exclusive = self.mode != 'r'
        try:
            _lock_file(self.f, exclusive)
        except (IOError, OSError):
            # The Windows _lock_file raises OSError, which is not an
            # IOError on Python 2; close the file in both cases
            self.f.close()
            raise
        return self

    def __exit__(self, etype, value, traceback):
        try:
            _unlock_file(self.f)
        finally:
            # Close the file even if unlocking failed
            self.f.close()

    def __iter__(self):
        return iter(self.f)

    def write(self, *args):
        return self.f.write(*args)

    def read(self, *args):
        return self.f.read(*args)
1217
1218
def get_filesystem_encoding():
    """Return the file system encoding, or 'utf-8' if Python reports none."""
    fs_encoding = sys.getfilesystemencoding()
    if fs_encoding is None:
        return 'utf-8'
    return fs_encoding
1222
1223
def shell_quote(args):
    """Return the arguments in args quoted for a shell and joined by spaces.

    Byte string arguments are decoded with the file system encoding first.
    """
    quoted_args = []
    encoding = get_filesystem_encoding()
    for a in args:
        if isinstance(a, bytes):
            # We may get a filename encoded with 'encodeFilename'
            a = a.decode(encoding)
        # shlex_quote (already imported from .compat and used by
        # args_to_str) replaces pipes.quote: the pipes module is deprecated
        # and no longer exists in Python 3.13
        quoted_args.append(shlex_quote(a))
    return ' '.join(quoted_args)
1233
1234
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use.

    data is serialized as JSON and appended to url as an urlencoded
    '__youtubedl_smuggle' fragment. """

    payload = {'__youtubedl_smuggle': json.dumps(data)}
    return url + '#' + compat_urllib_parse.urlencode(payload)
1241
1242
def unsmuggle_url(smug_url, default=None):
    """Split a URL built by smuggle_url into (url, data).

    URLs without smuggled data are returned as (smug_url, default).
    """
    if '#__youtubedl_smuggle' not in smug_url:
        return smug_url, default
    url, _, fragment = smug_url.rpartition('#')
    serialized = compat_parse_qs(fragment)['__youtubedl_smuggle'][0]
    return url, json.loads(serialized)
1250
1251
def format_bytes(bytes):
    """Format a byte count with a binary unit suffix, e.g. '1.50KiB'.

    Returns 'N/A' for None; str values are converted to float first.
    """
    if bytes is None:
        return 'N/A'
    if type(bytes) is str:
        bytes = float(bytes)
    # log() is not defined for 0, which is expressed in plain bytes anyway
    exponent = 0 if bytes == 0.0 else int(math.log(bytes, 1024.0))
    suffixes = ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB']
    suffix = suffixes[exponent]
    return '%.2f%s' % (float(bytes) / float(1024 ** exponent), suffix)
1264
1265
def parse_filesize(s):
    """Parse a file size such as '5 GB' or '1,2 MiB' into a number of bytes.

    Returns None if s is None or does not start with a number followed by
    a known unit.
    """
    if s is None:
        return None

    # 'XiB' and the (incorrect and unofficial, but supported) 'xB' are
    # powers of 1024, 'XB' and 'Xb' are powers of 1000
    multipliers = {'B': 1, 'b': 1}
    for power, prefix in enumerate('KMGTPEZY', 1):
        binary = 1024 ** power
        decimal = 1000 ** power
        multipliers[prefix + 'iB'] = binary
        multipliers[prefix + 'B'] = decimal
        multipliers[prefix.lower() + 'B'] = binary
        multipliers[prefix + 'b'] = decimal

    units_re = '|'.join(re.escape(unit) for unit in multipliers)
    found = re.match(
        r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)' % units_re, s)
    if not found:
        return None

    # A comma is accepted as the decimal separator
    number = float(found.group('num').replace(',', '.'))
    return int(number * multipliers[found.group('unit')])
1318
1319
def month_by_name(name):
    """ Return the number of a month by (locale-independently) English name,
    or None if name is not an English month name """

    if name in ENGLISH_MONTH_NAMES:
        return ENGLISH_MONTH_NAMES.index(name) + 1
    return None
1327
1328
def month_by_abbreviation(abbrev):
    """ Return the number of a month by (locale-independently) English
        abbreviations (the first three letters of the name), or None if
        abbrev is not such an abbreviation """

    abbreviations = [month[:3] for month in ENGLISH_MONTH_NAMES]
    try:
        position = abbreviations.index(abbrev)
    except ValueError:
        return None
    return position + 1
1337
1338
def fix_xml_ampersands(xml_str):
    """Replace all the '&' by '&amp;' in XML

    Ampersands starting one of the predefined entities or a numeric
    character reference of up to four digits are left alone."""
    bare_ampersand = r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)'
    return re.sub(bare_ampersand, '&amp;', xml_str)
1345
1346
def setproctitle(title):
    """Try to set the name of the current process to title.

    This is done with prctl(15, ...) (15 is PR_SET_NAME) of libc.so.6;
    nothing happens if that library or function is not available.
    """
    assert isinstance(title, compat_str)
    try:
        libc = ctypes.cdll.LoadLibrary("libc.so.6")
    except OSError:
        return
    title_bytes = title.encode('utf-8')
    # Initializing the buffer from the bytes makes it one item larger than
    # the title, so that prctl gets a NUL terminated string. A buffer of
    # exactly len(title_bytes) items has no room for the terminator.
    buf = ctypes.create_string_buffer(title_bytes)
    try:
        libc.prctl(15, buf, 0, 0, 0)
    except AttributeError:
        return  # Strange libc, just skip this
1360
1361
def remove_start(s, start):
    """Return s without the prefix start; s is returned unchanged if it
    does not begin with start."""
    return s[len(start):] if s.startswith(start) else s
1366
1367
def remove_end(s, end):
    """Return s without the suffix end; s is returned unchanged if it does
    not finish with end (or if end is empty)."""
    # An empty end must be excluded explicitly: every string ends with ''
    # and s[:-0] is the empty string rather than s
    if end and s.endswith(end):
        return s[:-len(end)]
    return s
1372
1373
def url_basename(url):
    """Return the last segment of the path of url, ignoring leading and
    trailing slashes."""
    trimmed_path = compat_urlparse.urlparse(url).path.strip('/')
    return trimmed_path.rpartition('/')[2]
1377
1378
class HEADRequest(compat_urllib_request.Request):
    """Request whose HTTP method is HEAD."""

    def get_method(self):
        return 'HEAD'
1382
1383
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    """Convert v to an int, or return default if that is not possible.

    If get_attr is given, the attribute of that name is read from v first
    (a missing attribute counts as None). None, the empty string and
    values int() cannot convert yield default; any other value is
    multiplied by invscale and floor-divided by scale.
    """
    if get_attr:
        if v is not None:
            v = getattr(v, get_attr, None)
    if v == '':
        v = None
    if v is None:
        return default
    try:
        return int(v) * invscale // scale
    except (ValueError, TypeError, OverflowError):
        # Neither unsupported types (e.g. lists) nor non-finite floats
        # should make an "or none" helper raise
        return default
1396
1397
def str_or_none(v, default=None):
    """Return v converted with compat_str, or default if v is None."""
    if v is None:
        return default
    return compat_str(v)
1400
1401
def str_to_int(int_str):
    """ A more relaxed version of int_or_none

    Commas, dots and plus signs are removed before the conversion;
    None is passed through. """
    if int_str is None:
        return None
    return int(re.sub(r'[,\.\+]', '', int_str))
1408
1409
def float_or_none(v, scale=1, invscale=1, default=None):
    """Convert v to a float, or return default if that is not possible.

    None and values float() cannot convert yield default; any other value
    is multiplied by invscale and divided by scale.
    """
    if v is None:
        return default
    try:
        return float(v) * invscale / scale
    except (ValueError, TypeError, OverflowError):
        # Neither unsupported types (e.g. lists) nor integers too large
        # for a float should make an "or none" helper raise
        return default
1417
1418
def parse_duration(s):
    """Parse a duration such as '1:02:03', '3 min' or 'PT1H2M3S' into a
    number of seconds.

    Returns None if s is not a string or is not in a recognized format.
    """
    if not isinstance(s, compat_basestring):
        return None

    s = s.strip()

    m = re.match(
        r'''(?ix)(?:P?T)?
        (?:
            (?P<only_mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*|
            (?P<only_hours>[0-9.]+)\s*(?:hours?)|

            \s*(?P<hours_reversed>[0-9]+)\s*(?:[:h]|hours?)\s*(?P<mins_reversed>[0-9]+)\s*(?:[:m]|mins?\.?|minutes?)\s*|
            (?:
                (?:
                    (?:(?P<days>[0-9]+)\s*(?:[:d]|days?)\s*)?
                    (?P<hours>[0-9]+)\s*(?:[:h]|hours?)\s*
                )?
                (?P<mins>[0-9]+)\s*(?:[:m]|mins?|minutes?)\s*
            )?
            (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*(?:s|secs?|seconds?)?
        )$''', s)
    if not m:
        return None

    # A lone (possibly fractional) number of minutes or hours
    only_mins = m.group('only_mins')
    if only_mins:
        return float_or_none(only_mins, invscale=60)
    only_hours = m.group('only_hours')
    if only_hours:
        return float_or_none(only_hours, invscale=60 * 60)

    # Sum up all integer components that are present
    seconds_per_unit = (
        ('secs', 1),
        ('mins_reversed', 60),
        ('mins', 60),
        ('hours', 60 * 60),
        ('hours_reversed', 60 * 60),
        ('days', 24 * 60 * 60),
    )
    total = 0
    for group_name, factor in seconds_per_unit:
        captured = m.group(group_name)
        if captured:
            total += int(captured) * factor
    # The fractional part ('.xyz') of the seconds
    fraction = m.group('ms')
    if fraction:
        total += float(fraction)
    return total
1463
1464
def prepend_extension(filename, ext, expected_real_ext=None):
    """Insert ext in front of the real extension of filename.

    If expected_real_ext is given and differs from the real extension,
    ext is appended to the whole filename instead.
    """
    name, real_ext = os.path.splitext(filename)
    if expected_real_ext and real_ext[1:] != expected_real_ext:
        return '{0}.{1}'.format(filename, ext)
    return '{0}.{1}{2}'.format(name, ext, real_ext)
1471
1472
def replace_extension(filename, ext, expected_real_ext=None):
    """Replace the real extension of filename by ext.

    If expected_real_ext is given and differs from the real extension,
    ext is appended to the whole filename instead.
    """
    name, real_ext = os.path.splitext(filename)
    if expected_real_ext and real_ext[1:] != expected_real_ext:
        stem = filename
    else:
        stem = name
    return '{0}.{1}'.format(stem, ext)
1478
1479
def check_executable(exe, args=[]):
    """ Checks if the given binary is installed somewhere in PATH, and returns its name.
    args can be a list of arguments for a short output (like -version).
    Returns False if the binary could not be run. """
    command = [exe] + args
    try:
        process = subprocess.Popen(
            command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        process.communicate()
    except OSError:
        return False
    return exe
1488
1489
def get_exe_version(exe, args=['--version'],
                    version_re=None, unrecognized='present'):
    """ Returns the version of the specified executable,
    or False if the executable is not present.
    The version is looked up by detect_exe_version in the combined
    stdout and stderr output of running exe with args. """
    command = [encodeArgument(exe)] + args
    try:
        process = subprocess.Popen(
            command, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
        output = process.communicate()[0]
    except OSError:
        return False
    if isinstance(output, bytes):  # Python 2.x
        output = output.decode('ascii', 'ignore')
    return detect_exe_version(output, version_re, unrecognized)
1503
1504
def detect_exe_version(output, version_re=None, unrecognized='present'):
    """Return the first group of version_re found in output, or
    unrecognized if there is no match.

    By default the text following the word 'version' is extracted.
    """
    assert isinstance(output, compat_str)
    pattern = r'version\s+([-0-9._a-zA-Z]+)' if version_re is None else version_re
    found = re.search(pattern, output)
    return found.group(1) if found else unrecognized
1514
1515
class PagedList(object):
    """Base class of the paged lists; getslice() is expected to be
    provided by the subclasses."""

    def __len__(self):
        # This is only useful for tests
        all_entries = self.getslice()
        return len(all_entries)
1520
1521
class OnDemandPagedList(PagedList):
    """Paged list that requests its pages from pagefunc one after another
    until a page turns out to be the last one needed."""

    def __init__(self, pagefunc, pagesize):
        # pagefunc is called with a page number and must return an iterable
        # of the entries of that page; pagesize is the length of a full page
        self._pagefunc = pagefunc
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        """Return the entries from index start up to index end as a list
        (up to the last available entry if end is None)."""
        res = []
        # Begin with the page that contains the entry at index start
        for pagenum in itertools.count(start // self._pagesize):
            # Global indices of the first entry of this and of the next page
            firstid = pagenum * self._pagesize
            nextfirstid = pagenum * self._pagesize + self._pagesize
            if start >= nextfirstid:
                continue

            page_results = list(self._pagefunc(pagenum))

            # Offset of start within this page (0 if start lies elsewhere)
            startv = (
                start % self._pagesize
                if firstid <= start < nextfirstid
                else 0)

            # Offset of end within this page (None if end lies elsewhere)
            endv = (
                ((end - 1) % self._pagesize) + 1
                if (end is not None and firstid <= end <= nextfirstid)
                else None)

            if startv != 0 or endv is not None:
                page_results = page_results[startv:endv]
            res.extend(page_results)

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            if len(page_results) + startv < self._pagesize:
                break

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == nextfirstid:
                break
        return res
1563
1564
class InAdvancePagedList(PagedList):
    """Paged list for which the number of pages is known in advance."""

    def __init__(self, pagefunc, pagecount, pagesize):
        self._pagefunc = pagefunc
        self._pagecount = pagecount
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        """Return the entries from index start up to index end as a list
        (up to the last page if end is None)."""
        collected = []
        first_page = start // self._pagesize
        if end is None:
            page_limit = self._pagecount
            remaining = None
        else:
            page_limit = end // self._pagesize + 1
            remaining = end - start
        # Entries of the first page located in front of start
        to_skip = start - first_page * self._pagesize
        for pagenum in range(first_page, page_limit):
            entries = list(self._pagefunc(pagenum))
            if to_skip:
                entries = entries[to_skip:]
                to_skip = None
            if remaining is not None:
                if len(entries) >= remaining:
                    # This page completes the requested slice
                    collected.extend(entries[:remaining])
                    break
                remaining -= len(entries)
            collected.extend(entries)
        return collected
1592
1593
def uppercase_escape(s):
    """Replace every \\UXXXXXXXX escape sequence in s by the character it
    stands for."""
    decode = codecs.getdecoder('unicode_escape')

    def expand(match):
        decoded, _ = decode(match.group(0))
        return decoded

    return re.sub(r'\\U[0-9a-fA-F]{8}', expand, s)
1600
1601
def lowercase_escape(s):
    """Replace every \\uXXXX escape sequence in s by the character it
    stands for."""
    decode = codecs.getdecoder('unicode_escape')

    def expand(match):
        decoded, _ = decode(match.group(0))
        return decoded

    return re.sub(r'\\u[0-9a-fA-F]{4}', expand, s)
1608
1609
def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986"""
    # On Python 2, text strings are UTF-8 encoded before they are quoted
    is_py2_text = sys.version_info < (3, 0) and isinstance(s, compat_str)
    if is_py2_text:
        s = s.encode('utf-8')
    return compat_urllib_parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")
1615
1616
def escape_url(url):
    """Escape URL as suggested by RFC 3986

    The path, params, query and fragment components are escaped."""
    parts = compat_urllib_parse_urlparse(url)
    escaped_path = escape_rfc3986(parts.path)
    escaped_params = escape_rfc3986(parts.params)
    escaped_query = escape_rfc3986(parts.query)
    escaped_fragment = escape_rfc3986(parts.fragment)
    escaped_parts = parts._replace(
        path=escaped_path,
        params=escaped_params,
        query=escaped_query,
        fragment=escaped_fragment)
    return escaped_parts.geturl()
1626
# struct_pack and struct_unpack accept text format strings on every
# supported Python version
try:
    struct.pack('!I', 0)
except TypeError:
    # In Python 2.6 (and some 2.7 versions), struct requires a bytes argument
    def struct_pack(spec, *args):
        fmt = spec.encode('ascii') if isinstance(spec, compat_str) else spec
        return struct.pack(fmt, *args)

    def struct_unpack(spec, *args):
        fmt = spec.encode('ascii') if isinstance(spec, compat_str) else spec
        return struct.unpack(fmt, *args)
else:
    # Text format strings work, so the plain functions can be used
    struct_pack, struct_unpack = struct.pack, struct.unpack
1643
1644
def read_batch_urls(batch_fd):
    """Read the URLs from the batch file object batch_fd and close it.

    Every line is one URL. A leading byte order mark and surrounding
    whitespace are removed; empty lines and lines starting with '#', ';'
    or ']' are skipped.
    """
    def fixup(url):
        if not isinstance(url, compat_str):
            url = url.decode('utf-8', 'replace')
        # '\xef\xbb\xbf' is a UTF-8 BOM read with a single-byte encoding,
        # '\ufeff' is a correctly decoded one (which strip() does not
        # remove, since it is not whitespace)
        for bom in ('\xef\xbb\xbf', '\ufeff'):
            if url.startswith(bom):
                url = url[len(bom):]
        url = url.strip()
        if url.startswith(('#', ';', ']')):
            return False
        return url

    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]
1659
1660
def urlencode_postdata(*args, **kargs):
    """Return the urlencoded arguments as an ASCII byte string; all
    arguments are passed on to urlencode."""
    query = compat_urllib_parse.urlencode(*args, **kargs)
    return query.encode('ascii')
1663
1664
def encode_dict(d, encoding='utf-8'):
    """Return a copy of d with all keys and values encoded with encoding."""
    encoded = {}
    for key, value in d.items():
        encoded_key = key.encode(encoding)
        encoded[encoded_key] = value.encode(encoding)
    return encoded
1667
1668
# Age limit associated with each US rating
US_RATINGS = {
    'G': 0,
    'PG': 10,
    'PG-13': 13,
    'R': 16,
    'NC': 18,
}


def parse_age_limit(s):
    """Return the age limit expressed by s as an int.

    s is either a one or two digit age, optionally followed by '+', or one
    of the US_RATINGS keys. Returns None for None and for anything else.
    """
    if s is None:
        return None
    numeric = re.match(r'^(?P<age>\d{1,2})\+?$', s)
    if numeric:
        return int(numeric.group('age'))
    return US_RATINGS.get(s, None)
1683
1684
def strip_jsonp(code):
    """Return the argument of a JSONP callback invocation, i.e. code
    without the 'callback(' ... ');' wrapper and trailing // comments.
    Anything that is not such an invocation is returned unchanged."""
    jsonp_call = r'(?s)^[a-zA-Z0-9_]+\s*\(\s*(.*)\);?\s*?(?://[^\n]*)*$'
    return re.sub(jsonp_call, r'\1', code)
1688
1689
def js_to_json(code):
    """Convert a JavaScript object literal to JSON.

    Single-quoted strings and bare identifiers are turned into
    double-quoted strings and trailing commas in front of ']' or '}' are
    removed.
    """
    # Replacements applied inside of single-quoted strings
    single_quoted_fixes = {
        '\\\\': '\\\\',
        "\\'": "'",
        '"': '\\"',
    }

    def quote_token(match):
        token = match.group(0)
        if token in ('true', 'false', 'null'):
            return token
        opener = token[0]
        if opener == '"':
            # An escaped single quote needs no escaping in JSON
            token = re.sub(r"\\'", "'", token[1:-1])
        elif opener == "'":
            token = re.sub(
                r"\\\\|\\'|\"",
                lambda esc: single_quoted_fixes[esc.group(0)],
                token[1:-1])
        return '"%s"' % token

    converted = re.sub(r'''(?x)
        "(?:[^"\\]*(?:\\\\|\\['"nu]))*[^"\\]*"|
        '(?:[^'\\]*(?:\\\\|\\['"nu]))*[^'\\]*'|
        [a-zA-Z_][.a-zA-Z_0-9]*
        ''', quote_token, code)
    # Drop trailing commas, which JSON does not allow
    return re.sub(r',(\s*[\]}])', r'\1', converted)
1713
1714
def qualities(quality_ids):
    """ Get a numeric quality value out of a list of possible values

    The returned function maps a quality id to its position in
    quality_ids, or to -1 if it is not part of the list. """
    def q(qid):
        if qid in quality_ids:
            return quality_ids.index(qid)
        return -1
    return q
1723
1724
# Default output template: '<title>-<id>.<ext>', written with %-style named
# placeholders
DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
1726
1727
def limit_length(s, length):
    """ Add ellipses to overly long strings

    Strings longer than length are cut so that they are exactly length
    characters long including the trailing '...'; None is passed through. """
    if s is None:
        return None
    ellipsis = '...'
    if len(s) > length:
        kept = length - len(ellipsis)
        return s[:kept] + ellipsis
    return s
1736
1737
def version_tuple(v):
    """Split a version string at '.' and '-' into a tuple of ints."""
    return tuple(map(int, re.split(r'[-.]', v)))
1740
1741
def is_outdated_version(version, limit, assume_new=True):
    """Return whether version is older than limit.

    If version is empty or one of the versions cannot be parsed, the
    version counts as outdated unless assume_new is set.
    """
    when_unknown = not assume_new
    if not version:
        return when_unknown
    try:
        outdated = version_tuple(version) < version_tuple(limit)
    except ValueError:
        return when_unknown
    return outdated
1749
1750
def ytdl_is_updateable():
    """ Returns if youtube-dl can be updated with -U

    This is the case if this module was loaded by a zipimporter or if sys
    has a 'frozen' attribute. """
    from zipimport import zipimporter

    if isinstance(globals().get('__loader__'), zipimporter):
        return True
    return hasattr(sys, 'frozen')
1756
1757
def args_to_str(args):
    """ Get a short string representation for a subprocess command

    Every argument is shell-quoted and the results are joined by spaces.
    """
    quoted_args = [shlex_quote(arg) for arg in args]
    return ' '.join(quoted_args)
1761
1762
def mimetype2ext(mt):
    """ Guess a file extension from a MIME type

    The subtype (the part after the last '/') is used as the extension,
    except for a few subtypes that are mapped to their customary extension.
    Parameters such as '; charset=utf-8' are ignored and the subtype is
    matched case-insensitively. Returns None when mt is None, e.g. for a
    missing Content-Type header.
    """
    if mt is None:
        return None

    # Cut parameters off first so that a '/' inside a parameter value can
    # not be mistaken for the type/subtype separator
    bare_type = mt.split(';')[0].strip()
    _, _, res = bare_type.rpartition('/')
    res = res.lower()

    return {
        'x-ms-wmv': 'wmv',
        'x-mp4-fragmented': 'mp4',
        'ttml+xml': 'ttml',
    }.get(res, res)
1771
1772
def urlhandle_detect_ext(url_handle):
    """ Guess the file extension of a response from its headers

    A quoted attachment filename in Content-Disposition takes precedence;
    otherwise the extension is derived from the Content-Type header.
    """
    try:
        url_handle.headers
    except AttributeError:  # Python < 3
        getheader = url_handle.info().getheader
    else:
        def getheader(name):
            return url_handle.headers[name]

    disposition = getheader('Content-Disposition')
    if disposition:
        filename_match = re.match(
            r'attachment;\s*filename="(?P<filename>[^"]+)"', disposition)
        if filename_match:
            ext = determine_ext(
                filename_match.group('filename'), default_ext=None)
            if ext:
                return ext

    content_type = getheader('Content-Type')
    return mimetype2ext(content_type)
1789
1790
def encode_data_uri(data, mime_type):
    """ Wrap the bytes in data into a base64 'data:' URI of the given MIME type """
    payload = base64.b64encode(data).decode('ascii')
    return 'data:%s;base64,%s' % (mime_type, payload)
1793
1794
def age_restricted(content_limit, age_limit):
    """ Returns True iff the content should be blocked

    Nothing is blocked when no age limit is set or when the content carries
    no limit; otherwise content is blocked if age_limit is below its limit.
    """
    if age_limit is None or content_limit is None:
        return False
    return age_limit < content_limit
1803
1804
def is_html(first_bytes):
    """ Detect whether a file contains HTML by examining its first bytes.

    A leading byte order mark selects the encoding used for decoding (and
    is skipped); without one the bytes are decoded as UTF-8. The result is
    the match object for optional whitespace followed by '<', or None.
    """

    # The UTF-32 marks come before the UTF-16 ones since the UTF-32-LE mark
    # starts with the UTF-16-LE mark
    BOMS = [
        (b'\xef\xbb\xbf', 'utf-8'),
        (b'\x00\x00\xfe\xff', 'utf-32-be'),
        (b'\xff\xfe\x00\x00', 'utf-32-le'),
        (b'\xff\xfe', 'utf-16-le'),
        (b'\xfe\xff', 'utf-16-be'),
    ]
    encoding, skip = 'utf-8', 0
    for bom, bom_encoding in BOMS:
        if first_bytes.startswith(bom):
            encoding, skip = bom_encoding, len(bom)
            break
    text = first_bytes[skip:].decode(encoding, 'replace')

    return re.match(r'^\s*<', text)
1823
1824
def determine_protocol(info_dict):
    """ Work out the protocol of the format described by info_dict

    An explicit 'protocol' entry wins. Otherwise the protocol is derived
    from the 'url' entry: by its rtmp/mms/rtsp prefix, by an m3u8/f4m
    extension, and finally by the URL scheme.
    """
    explicit_protocol = info_dict.get('protocol')
    if explicit_protocol is not None:
        return explicit_protocol

    url = info_dict['url']
    for prefix in ('rtmp', 'mms', 'rtsp'):
        if url.startswith(prefix):
            return prefix

    ext = determine_ext(url)
    for manifest_ext in ('m3u8', 'f4m'):
        if ext == manifest_ext:
            return manifest_ext

    parsed_url = compat_urllib_parse_urlparse(url)
    return parsed_url.scheme
1845
1846
1847 def render_table(header_row, data):
1848     """ Render a list of rows, each as a list of values """
1849     table = [header_row] + data
1850     max_lens = [max(len(compat_str(v)) for v in col) for col in zip(*table)]
1851     format_str = ' '.join('%-' + compat_str(ml + 1) + 's' for ml in max_lens[:-1]) + '%s'
1852     return '\n'.join(format_str % tuple(row) for row in table)
1853
1854
def _match_one(filter_part, dct):
    """ Evaluate a single filter expression against dct

    Two forms are recognised: a comparison "key OP value", where OP is one
    of the COMPARISON_OPERATORS keys and may be followed by "?", and a
    unary test, "key" or "!key".

    Returns the result of the comparison or unary test. For a comparison
    whose key is absent from dct (or maps to None) the text matched by the
    optional "?" is returned, i.e. None when no "?" was given.
    Raises ValueError if filter_part matches neither form, if an operator
    other than = and != is used with a string value, or if a numeric value
    can not be parsed.
    """
    COMPARISON_OPERATORS = {
        '<': operator.lt,
        '<=': operator.le,
        '>': operator.gt,
        '>=': operator.ge,
        '=': operator.eq,
        '!=': operator.ne,
    }
    # The value is either numeric (digits and dots with an optional size
    # suffix, group intval) or alphanumeric text that does not start with a
    # digit or dot (group strval)
    operator_rex = re.compile(r'''(?x)\s*
        (?P<key>[a-z_]+)
        \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
        (?:
            (?P<intval>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)|
            (?P<strval>(?![0-9.])[a-z0-9A-Z]*)
        )
        \s*$
        ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
    m = operator_rex.search(filter_part)
    if m:
        op = COMPARISON_OPERATORS[m.group('op')]
        if m.group('strval') is not None:
            # Strings can only be tested for (in)equality
            if m.group('op') not in ('=', '!='):
                raise ValueError(
                    'Operator %s does not support string values!' % m.group('op'))
            comparison_value = m.group('strval')
        else:
            try:
                comparison_value = int(m.group('intval'))
            except ValueError:
                # Not a plain integer: hand the text to parse_filesize,
                # first as written and then with a 'B' appended
                # (presumably so that a bare suffix like '10M' is accepted
                # as a size - confirm against parse_filesize)
                comparison_value = parse_filesize(m.group('intval'))
                if comparison_value is None:
                    comparison_value = parse_filesize(m.group('intval') + 'B')
                if comparison_value is None:
                    raise ValueError(
                        'Invalid integer value %r in filter part %r' % (
                            m.group('intval'), filter_part))
        actual_value = dct.get(m.group('key'))
        if actual_value is None:
            # Missing value: truthy only if the operator was followed by '?'
            return m.group('none_inclusive')
        return op(actual_value, comparison_value)

    # Not a comparison: try the unary forms, "key" (value is not None) and
    # "!key" (value is None)
    UNARY_OPERATORS = {
        '': lambda v: v is not None,
        '!': lambda v: v is None,
    }
    operator_rex = re.compile(r'''(?x)\s*
        (?P<op>%s)\s*(?P<key>[a-z_]+)
        \s*$
        ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
    m = operator_rex.search(filter_part)
    if m:
        op = UNARY_OPERATORS[m.group('op')]
        actual_value = dct.get(m.group('key'))
        return op(actual_value)

    raise ValueError('Invalid filter part %r' % filter_part)
1912
1913
def match_str(filter_str, dct):
    """ Filter a dictionary with a simple string syntax. Returns True (=passes filter) or false

    filter_str is split on '&'; every part has to match for the dictionary
    to pass, and evaluation stops at the first part that does not.
    """

    for filter_part in filter_str.split('&'):
        if not _match_one(filter_part, dct):
            return False
    return True
1919
1920
def match_filter_func(filter_str):
    """ Build a function checking an info dict against filter_str

    The returned callable yields None when the info dict passes the filter
    and a message explaining the skip otherwise.
    """
    def _match_func(info_dict):
        if not match_str(filter_str, info_dict):
            # Name the video by its title, falling back to its id
            video_title = info_dict.get('title', info_dict.get('id', 'video'))
            return '%s does not pass filter %s, skipping ..' % (video_title, filter_str)
        return None
    return _match_func
1929
1930
def parse_dfxp_time_expr(time_expr):
    """ Convert a DFXP time expression to seconds (float)

    Accepted are an offset in seconds with an optional trailing 's'
    (e.g. '12.5s') and a clock value 'H:MM:SS' with optional fractional
    seconds. An empty or missing expression yields 0.0; any other
    unrecognised expression yields None.
    """
    if not time_expr:
        return 0.0

    offset = re.match(r'^(?P<time_offset>\d+(?:\.\d+)?)s?$', time_expr)
    if offset:
        return float(offset.group('time_offset'))

    clock = re.match(r'^(\d+):(\d\d):(\d\d(?:\.\d+)?)$', time_expr)
    if not clock:
        return None
    hours, minutes, seconds = clock.groups()
    return 3600 * int(hours) + 60 * int(minutes) + float(seconds)
1942
1943
def srt_subtitles_timecode(seconds):
    """ Format a time in seconds as an SRT timecode 'HH:MM:SS,mmm'

    The time is rounded to the nearest millisecond before it is split into
    fields. Truncating each field separately would lose a millisecond
    whenever the float is stored marginally below the intended value
    (1.001 would render as '00:00:01,000').
    """
    total_milliseconds = int(round(seconds * 1000))
    total_seconds, milliseconds = divmod(total_milliseconds, 1000)
    total_minutes, secs = divmod(total_seconds, 60)
    hours, minutes = divmod(total_minutes, 60)
    return '%02d:%02d:%02d,%03d' % (hours, minutes, secs, milliseconds)
1946
1947
def dfxp2srt(dfxp_data):
    """ Convert a DFXP/TTML subtitle document (text) to SRT

    The p elements are looked up in the TTML namespace, then in the TTAF1
    namespace and finally without namespace; each one becomes an SRT cue,
    numbered from 1. A cue starts at the paragraph's 'begin' attribute and
    ends at 'end' or, when that gives no time, at begin plus 'dur'.

    Raises ValueError if the document contains no paragraphs and KeyError
    if a paragraph lacks 'begin', or has neither a usable 'end' nor 'dur'.
    """
    _x = functools.partial(xpath_with_ns, ns_map={
        'ttml': 'http://www.w3.org/ns/ttml',
        'ttaf1': 'http://www.w3.org/2006/10/ttaf1',
    })
    # Tag names are matched in both namespaces and without one
    br_tags = (_x('ttml:br'), _x('ttaf1:br'), 'br')
    span_tags = (_x('ttml:span'), _x('ttaf1:span'), 'span')

    def parse_node(node):
        # Flatten the content of node to text: br becomes a newline, span
        # is descended into, anything else is kept as serialized markup
        str_or_empty = functools.partial(str_or_none, default='')

        out = str_or_empty(node.text)

        for child in node:
            if child.tag in br_tags:
                out += '\n' + str_or_empty(child.tail)
            elif child.tag in span_tags:
                # parse_node() only covers what is inside the span; the
                # text following its closing tag is held in child.tail and
                # must be added here or it would be lost
                out += str_or_empty(parse_node(child)) + str_or_empty(child.tail)
            else:
                out += str_or_empty(xml.etree.ElementTree.tostring(child))

        return out

    dfxp = compat_etree_fromstring(dfxp_data.encode('utf-8'))
    out = []
    paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall(_x('.//ttaf1:p')) or dfxp.findall('.//p')

    if not paras:
        raise ValueError('Invalid dfxp/TTML subtitle')

    for index, para in enumerate(paras, 1):
        begin_time = parse_dfxp_time_expr(para.attrib['begin'])
        end_time = parse_dfxp_time_expr(para.attrib.get('end'))
        if not end_time:
            # No end given: derive it from the duration
            end_time = begin_time + parse_dfxp_time_expr(para.attrib['dur'])
        out.append('%d\n%s --> %s\n%s\n\n' % (
            index,
            srt_subtitles_timecode(begin_time),
            srt_subtitles_timecode(end_time),
            parse_node(para)))

    return ''.join(out)
1988
1989
def cli_option(params, command_option, param):
    """ Build the command line arguments for an option that takes a value

    Returns [command_option, value] with the value found under the key
    param in params, or an empty list when that value is None or missing.
    """
    value = params.get(param)
    if value is None:
        return []
    return [command_option, value]
1993
1994
def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
    """ Build the command line arguments for a boolean option

    The value found under the key param in params selects true_value or
    false_value. Without separator the result is [command_option, value];
    with one it is the single argument command_option + separator + value.
    When the option is not set at all (None or missing) nothing is emitted,
    in line with cli_option(). Any other non-bool value fails the assertion.
    """
    param = params.get(param)
    if param is None:
        # Unset option: emit nothing rather than failing or guessing 'false'
        return []
    assert isinstance(param, bool)
    value = true_value if param else false_value
    if separator:
        return [command_option + separator + value]
    return [command_option, value]
2001
2002
def cli_valueless_option(params, command_option, param, expected_value=True):
    """ Build the command line arguments for a flag without value

    Returns [command_option] when the value found under the key param in
    params equals expected_value, and an empty list otherwise.
    """
    actual_value = params.get(param)
    if actual_value == expected_value:
        return [command_option]
    return []
2006
2007
def cli_configuration_args(params, param, default=[]):
    """ Get the list of extra arguments configured under the key param

    Returns the list stored in params, or default when nothing (None) is
    configured. A list default is returned as a copy: the default in the
    signature is a single list shared by all calls, so handing it out
    directly would let one caller's modifications show up in later calls.
    Fails the assertion if the configured value is not a list.
    """
    ex_args = params.get(param)
    if ex_args is None:
        return list(default) if isinstance(default, list) else default
    assert isinstance(ex_args, list)
    return ex_args
2014
2015
class ISO639Utils(object):
    """Conversion between two-letter and three-letter language codes"""

    # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
    # Maps the two-letter code to the three-letter code
    _lang_map = {
        'aa': 'aar',
        'ab': 'abk',
        'ae': 'ave',
        'af': 'afr',
        'ak': 'aka',
        'am': 'amh',
        'an': 'arg',
        'ar': 'ara',
        'as': 'asm',
        'av': 'ava',
        'ay': 'aym',
        'az': 'aze',
        'ba': 'bak',
        'be': 'bel',
        'bg': 'bul',
        'bh': 'bih',
        'bi': 'bis',
        'bm': 'bam',
        'bn': 'ben',
        'bo': 'bod',
        'br': 'bre',
        'bs': 'bos',
        'ca': 'cat',
        'ce': 'che',
        'ch': 'cha',
        'co': 'cos',
        'cr': 'cre',
        'cs': 'ces',
        'cu': 'chu',
        'cv': 'chv',
        'cy': 'cym',
        'da': 'dan',
        'de': 'deu',
        'dv': 'div',
        'dz': 'dzo',
        'ee': 'ewe',
        'el': 'ell',
        'en': 'eng',
        'eo': 'epo',
        'es': 'spa',
        'et': 'est',
        'eu': 'eus',
        'fa': 'fas',
        'ff': 'ful',
        'fi': 'fin',
        'fj': 'fij',
        'fo': 'fao',
        'fr': 'fra',
        'fy': 'fry',
        'ga': 'gle',
        'gd': 'gla',
        'gl': 'glg',
        'gn': 'grn',
        'gu': 'guj',
        'gv': 'glv',
        'ha': 'hau',
        'he': 'heb',
        'hi': 'hin',
        'ho': 'hmo',
        'hr': 'hrv',
        'ht': 'hat',
        'hu': 'hun',
        'hy': 'hye',
        'hz': 'her',
        'ia': 'ina',
        'id': 'ind',
        'ie': 'ile',
        'ig': 'ibo',
        'ii': 'iii',
        'ik': 'ipk',
        'io': 'ido',
        'is': 'isl',
        'it': 'ita',
        'iu': 'iku',
        'ja': 'jpn',
        'jv': 'jav',
        'ka': 'kat',
        'kg': 'kon',
        'ki': 'kik',
        'kj': 'kua',
        'kk': 'kaz',
        'kl': 'kal',
        'km': 'khm',
        'kn': 'kan',
        'ko': 'kor',
        'kr': 'kau',
        'ks': 'kas',
        'ku': 'kur',
        'kv': 'kom',
        'kw': 'cor',
        'ky': 'kir',
        'la': 'lat',
        'lb': 'ltz',
        'lg': 'lug',
        'li': 'lim',
        'ln': 'lin',
        'lo': 'lao',
        'lt': 'lit',
        'lu': 'lub',
        'lv': 'lav',
        'mg': 'mlg',
        'mh': 'mah',
        'mi': 'mri',
        'mk': 'mkd',
        'ml': 'mal',
        'mn': 'mon',
        'mr': 'mar',
        'ms': 'msa',
        'mt': 'mlt',
        'my': 'mya',
        'na': 'nau',
        'nb': 'nob',
        'nd': 'nde',
        'ne': 'nep',
        'ng': 'ndo',
        'nl': 'nld',
        'nn': 'nno',
        'no': 'nor',
        'nr': 'nbl',
        'nv': 'nav',
        'ny': 'nya',
        'oc': 'oci',
        'oj': 'oji',
        'om': 'orm',
        'or': 'ori',
        'os': 'oss',
        'pa': 'pan',
        'pi': 'pli',
        'pl': 'pol',
        'ps': 'pus',
        'pt': 'por',
        'qu': 'que',
        'rm': 'roh',
        'rn': 'run',
        'ro': 'ron',
        'ru': 'rus',
        'rw': 'kin',
        'sa': 'san',
        'sc': 'srd',
        'sd': 'snd',
        'se': 'sme',
        'sg': 'sag',
        'si': 'sin',
        'sk': 'slk',
        'sl': 'slv',
        'sm': 'smo',
        'sn': 'sna',
        'so': 'som',
        'sq': 'sqi',
        'sr': 'srp',
        'ss': 'ssw',
        'st': 'sot',
        'su': 'sun',
        'sv': 'swe',
        'sw': 'swa',
        'ta': 'tam',
        'te': 'tel',
        'tg': 'tgk',
        'th': 'tha',
        'ti': 'tir',
        'tk': 'tuk',
        'tl': 'tgl',
        'tn': 'tsn',
        'to': 'ton',
        'tr': 'tur',
        'ts': 'tso',
        'tt': 'tat',
        'tw': 'twi',
        'ty': 'tah',
        'ug': 'uig',
        'uk': 'ukr',
        'ur': 'urd',
        'uz': 'uzb',
        've': 'ven',
        'vi': 'vie',
        'vo': 'vol',
        'wa': 'wln',
        'wo': 'wol',
        'xh': 'xho',
        'yi': 'yid',
        'yo': 'yor',
        'za': 'zha',
        'zh': 'zho',
        'zu': 'zul',
    }

    @classmethod
    def short2long(cls, code):
        """Convert language code from ISO 639-1 to ISO 639-2/T

        Only the first two characters of code are looked up, so anything
        following them is ignored. Returns None for an unknown code.
        """
        two_letter_code = code[:2]
        return cls._lang_map.get(two_letter_code)

    @classmethod
    def long2short(cls, code):
        """Convert language code from ISO 639-2/T to ISO 639-1

        Returns None when no two-letter code maps to code.
        """
        matching_codes = (
            two_letter_code
            for two_letter_code, three_letter_code in cls._lang_map.items()
            if three_letter_code == code)
        return next(matching_codes, None)
2216
2217
class ISO3166Utils(object):
    """Lookup of country names by their two-letter country code"""

    # From http://data.okfn.org/data/core/country-list
    # Maps the upper-case two-letter code to the country name
    _country_map = {
        'AF': 'Afghanistan',
        'AX': 'Åland Islands',
        'AL': 'Albania',
        'DZ': 'Algeria',
        'AS': 'American Samoa',
        'AD': 'Andorra',
        'AO': 'Angola',
        'AI': 'Anguilla',
        'AQ': 'Antarctica',
        'AG': 'Antigua and Barbuda',
        'AR': 'Argentina',
        'AM': 'Armenia',
        'AW': 'Aruba',
        'AU': 'Australia',
        'AT': 'Austria',
        'AZ': 'Azerbaijan',
        'BS': 'Bahamas',
        'BH': 'Bahrain',
        'BD': 'Bangladesh',
        'BB': 'Barbados',
        'BY': 'Belarus',
        'BE': 'Belgium',
        'BZ': 'Belize',
        'BJ': 'Benin',
        'BM': 'Bermuda',
        'BT': 'Bhutan',
        'BO': 'Bolivia, Plurinational State of',
        'BQ': 'Bonaire, Sint Eustatius and Saba',
        'BA': 'Bosnia and Herzegovina',
        'BW': 'Botswana',
        'BV': 'Bouvet Island',
        'BR': 'Brazil',
        'IO': 'British Indian Ocean Territory',
        'BN': 'Brunei Darussalam',
        'BG': 'Bulgaria',
        'BF': 'Burkina Faso',
        'BI': 'Burundi',
        'KH': 'Cambodia',
        'CM': 'Cameroon',
        'CA': 'Canada',
        'CV': 'Cape Verde',
        'KY': 'Cayman Islands',
        'CF': 'Central African Republic',
        'TD': 'Chad',
        'CL': 'Chile',
        'CN': 'China',
        'CX': 'Christmas Island',
        'CC': 'Cocos (Keeling) Islands',
        'CO': 'Colombia',
        'KM': 'Comoros',
        'CG': 'Congo',
        'CD': 'Congo, the Democratic Republic of the',
        'CK': 'Cook Islands',
        'CR': 'Costa Rica',
        'CI': 'Côte d\'Ivoire',
        'HR': 'Croatia',
        'CU': 'Cuba',
        'CW': 'Curaçao',
        'CY': 'Cyprus',
        'CZ': 'Czech Republic',
        'DK': 'Denmark',
        'DJ': 'Djibouti',
        'DM': 'Dominica',
        'DO': 'Dominican Republic',
        'EC': 'Ecuador',
        'EG': 'Egypt',
        'SV': 'El Salvador',
        'GQ': 'Equatorial Guinea',
        'ER': 'Eritrea',
        'EE': 'Estonia',
        'ET': 'Ethiopia',
        'FK': 'Falkland Islands (Malvinas)',
        'FO': 'Faroe Islands',
        'FJ': 'Fiji',
        'FI': 'Finland',
        'FR': 'France',
        'GF': 'French Guiana',
        'PF': 'French Polynesia',
        'TF': 'French Southern Territories',
        'GA': 'Gabon',
        'GM': 'Gambia',
        'GE': 'Georgia',
        'DE': 'Germany',
        'GH': 'Ghana',
        'GI': 'Gibraltar',
        'GR': 'Greece',
        'GL': 'Greenland',
        'GD': 'Grenada',
        'GP': 'Guadeloupe',
        'GU': 'Guam',
        'GT': 'Guatemala',
        'GG': 'Guernsey',
        'GN': 'Guinea',
        'GW': 'Guinea-Bissau',
        'GY': 'Guyana',
        'HT': 'Haiti',
        'HM': 'Heard Island and McDonald Islands',
        'VA': 'Holy See (Vatican City State)',
        'HN': 'Honduras',
        'HK': 'Hong Kong',
        'HU': 'Hungary',
        'IS': 'Iceland',
        'IN': 'India',
        'ID': 'Indonesia',
        'IR': 'Iran, Islamic Republic of',
        'IQ': 'Iraq',
        'IE': 'Ireland',
        'IM': 'Isle of Man',
        'IL': 'Israel',
        'IT': 'Italy',
        'JM': 'Jamaica',
        'JP': 'Japan',
        'JE': 'Jersey',
        'JO': 'Jordan',
        'KZ': 'Kazakhstan',
        'KE': 'Kenya',
        'KI': 'Kiribati',
        'KP': 'Korea, Democratic People\'s Republic of',
        'KR': 'Korea, Republic of',
        'KW': 'Kuwait',
        'KG': 'Kyrgyzstan',
        'LA': 'Lao People\'s Democratic Republic',
        'LV': 'Latvia',
        'LB': 'Lebanon',
        'LS': 'Lesotho',
        'LR': 'Liberia',
        'LY': 'Libya',
        'LI': 'Liechtenstein',
        'LT': 'Lithuania',
        'LU': 'Luxembourg',
        'MO': 'Macao',
        'MK': 'Macedonia, the Former Yugoslav Republic of',
        'MG': 'Madagascar',
        'MW': 'Malawi',
        'MY': 'Malaysia',
        'MV': 'Maldives',
        'ML': 'Mali',
        'MT': 'Malta',
        'MH': 'Marshall Islands',
        'MQ': 'Martinique',
        'MR': 'Mauritania',
        'MU': 'Mauritius',
        'YT': 'Mayotte',
        'MX': 'Mexico',
        'FM': 'Micronesia, Federated States of',
        'MD': 'Moldova, Republic of',
        'MC': 'Monaco',
        'MN': 'Mongolia',
        'ME': 'Montenegro',
        'MS': 'Montserrat',
        'MA': 'Morocco',
        'MZ': 'Mozambique',
        'MM': 'Myanmar',
        'NA': 'Namibia',
        'NR': 'Nauru',
        'NP': 'Nepal',
        'NL': 'Netherlands',
        'NC': 'New Caledonia',
        'NZ': 'New Zealand',
        'NI': 'Nicaragua',
        'NE': 'Niger',
        'NG': 'Nigeria',
        'NU': 'Niue',
        'NF': 'Norfolk Island',
        'MP': 'Northern Mariana Islands',
        'NO': 'Norway',
        'OM': 'Oman',
        'PK': 'Pakistan',
        'PW': 'Palau',
        'PS': 'Palestine, State of',
        'PA': 'Panama',
        'PG': 'Papua New Guinea',
        'PY': 'Paraguay',
        'PE': 'Peru',
        'PH': 'Philippines',
        'PN': 'Pitcairn',
        'PL': 'Poland',
        'PT': 'Portugal',
        'PR': 'Puerto Rico',
        'QA': 'Qatar',
        'RE': 'Réunion',
        'RO': 'Romania',
        'RU': 'Russian Federation',
        'RW': 'Rwanda',
        'BL': 'Saint Barthélemy',
        'SH': 'Saint Helena, Ascension and Tristan da Cunha',
        'KN': 'Saint Kitts and Nevis',
        'LC': 'Saint Lucia',
        'MF': 'Saint Martin (French part)',
        'PM': 'Saint Pierre and Miquelon',
        'VC': 'Saint Vincent and the Grenadines',
        'WS': 'Samoa',
        'SM': 'San Marino',
        'ST': 'Sao Tome and Principe',
        'SA': 'Saudi Arabia',
        'SN': 'Senegal',
        'RS': 'Serbia',
        'SC': 'Seychelles',
        'SL': 'Sierra Leone',
        'SG': 'Singapore',
        'SX': 'Sint Maarten (Dutch part)',
        'SK': 'Slovakia',
        'SI': 'Slovenia',
        'SB': 'Solomon Islands',
        'SO': 'Somalia',
        'ZA': 'South Africa',
        'GS': 'South Georgia and the South Sandwich Islands',
        'SS': 'South Sudan',
        'ES': 'Spain',
        'LK': 'Sri Lanka',
        'SD': 'Sudan',
        'SR': 'Suriname',
        'SJ': 'Svalbard and Jan Mayen',
        'SZ': 'Swaziland',
        'SE': 'Sweden',
        'CH': 'Switzerland',
        'SY': 'Syrian Arab Republic',
        'TW': 'Taiwan, Province of China',
        'TJ': 'Tajikistan',
        'TZ': 'Tanzania, United Republic of',
        'TH': 'Thailand',
        'TL': 'Timor-Leste',
        'TG': 'Togo',
        'TK': 'Tokelau',
        'TO': 'Tonga',
        'TT': 'Trinidad and Tobago',
        'TN': 'Tunisia',
        'TR': 'Turkey',
        'TM': 'Turkmenistan',
        'TC': 'Turks and Caicos Islands',
        'TV': 'Tuvalu',
        'UG': 'Uganda',
        'UA': 'Ukraine',
        'AE': 'United Arab Emirates',
        'GB': 'United Kingdom',
        'US': 'United States',
        'UM': 'United States Minor Outlying Islands',
        'UY': 'Uruguay',
        'UZ': 'Uzbekistan',
        'VU': 'Vanuatu',
        'VE': 'Venezuela, Bolivarian Republic of',
        'VN': 'Viet Nam',
        'VG': 'Virgin Islands, British',
        'VI': 'Virgin Islands, U.S.',
        'WF': 'Wallis and Futuna',
        'EH': 'Western Sahara',
        'YE': 'Yemen',
        'ZM': 'Zambia',
        'ZW': 'Zimbabwe',
    }

    @classmethod
    def short2full(cls, code):
        """Convert a two-letter country code to the corresponding full name

        The code is matched case-insensitively. Returns None for an
        unknown code.
        """
        normalized_code = code.upper()
        return cls._country_map.get(normalized_code)
2476
2477
class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
    """Proxy handler that lets a single request choose its own proxy

    A request selects its proxy through the 'Ytdl-request-proxy' header,
    which is removed from the request once it has been read.
    """

    def __init__(self, proxies=None):
        # Set default handlers
        # They call proxy_open() with the placeholder proxy '__noproxy__'.
        # The scheme and the bound method are passed as default arguments
        # so that each lambda keeps the values of its own loop iteration.
        for type in ('http', 'https'):
            setattr(self, '%s_open' % type,
                    lambda r, proxy='__noproxy__', type=type, meth=self.proxy_open:
                        meth(r, proxy, type))
        # The base initializer runs after the defaults above have been set
        # (presumably so that it can replace them for the schemes present
        # in proxies - confirm against the ProxyHandler implementation)
        return compat_urllib_request.ProxyHandler.__init__(self, proxies)

    def proxy_open(self, req, proxy, type):
        # A proxy named in the request's own header overrides the one
        # passed in; the header is then dropped from the request
        req_proxy = req.headers.get('Ytdl-request-proxy')
        if req_proxy is not None:
            proxy = req_proxy
            del req.headers['Ytdl-request-proxy']

        # The placeholder means no proxy applies: do not delegate
        if proxy == '__noproxy__':
            return None  # No Proxy
        return compat_urllib_request.ProxyHandler.proxy_open(
            self, req, proxy, type)