2 # -*- coding: utf-8 -*-
4 from __future__ import unicode_literals
33 import xml.etree.ElementTree
40 compat_etree_fromstring,
42 compat_html_entities_html5,
47 compat_socket_create_connection,
52 compat_urllib_parse_urlencode,
53 compat_urllib_parse_urlparse,
54 compat_urllib_parse_unquote_plus,
55 compat_urllib_request,
def register_socks_protocols():
    """Add the SOCKS schemes to urlparse's netloc-aware scheme list.

    In Python < 2.6.5, urlsplit() suffers from bug
    https://bugs.python.org/issue7904: URLs whose scheme is not listed in
    urlparse.uses_netloc are not handled correctly.
    """
    registered = compat_urlparse.uses_netloc
    for proto in ('socks', 'socks4', 'socks4a', 'socks5'):
        if proto not in registered:
            registered.append(proto)
75 # This is not clearly defined otherwise
76 compiled_regex_type = type(re.compile(''))
79 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20150101 Firefox/47.0 (Chrome)',
80 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
81 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
82 'Accept-Encoding': 'gzip, deflate',
83 'Accept-Language': 'en-us,en;q=0.5',
89 ENGLISH_MONTH_NAMES = [
90 'January', 'February', 'March', 'April', 'May', 'June',
91 'July', 'August', 'September', 'October', 'November', 'December']
94 'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'aac',
95 'flv', 'f4v', 'f4a', 'f4b',
96 'webm', 'ogg', 'ogv', 'oga', 'ogx', 'spx', 'opus',
106 'f4f', 'f4m', 'm3u8', 'smil')
108 # needed for sanitizing filenames in restricted mode
109 ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
110 itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUYP', ['ss'],
111 'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuypy')))
126 '%Y-%m-%d %H:%M:%S.%f',
129 '%Y-%m-%dT%H:%M:%SZ',
130 '%Y-%m-%dT%H:%M:%S.%fZ',
131 '%Y-%m-%dT%H:%M:%S.%f0Z',
133 '%Y-%m-%dT%H:%M:%S.%f',
137 DATE_FORMATS_DAY_FIRST = list(DATE_FORMATS)
138 DATE_FORMATS_DAY_FIRST.extend([
147 DATE_FORMATS_MONTH_FIRST = list(DATE_FORMATS)
148 DATE_FORMATS_MONTH_FIRST.extend([
157 def preferredencoding():
158 """Get preferred encoding.
160 Returns the best encoding scheme for the system, based on
161 locale.getpreferredencoding() and some further tweaks.
164 pref = locale.getpreferredencoding()
172 def write_json_file(obj, fn):
173 """ Encode obj as JSON and write it to fn, atomically if possible """
175 fn = encodeFilename(fn)
176 if sys.version_info < (3, 0) and sys.platform != 'win32':
177 encoding = get_filesystem_encoding()
178 # os.path.basename returns a bytes object, but NamedTemporaryFile
179 # will fail if the filename contains non ascii characters unless we
180 # use a unicode object
181 path_basename = lambda f: os.path.basename(fn).decode(encoding)
182 # the same for os.path.dirname
183 path_dirname = lambda f: os.path.dirname(fn).decode(encoding)
185 path_basename = os.path.basename
186 path_dirname = os.path.dirname
190 'prefix': path_basename(fn) + '.',
191 'dir': path_dirname(fn),
195 # In Python 2.x, json.dump expects a bytestream.
196 # In Python 3.x, it writes to a character stream
197 if sys.version_info < (3, 0):
205 tf = tempfile.NamedTemporaryFile(**compat_kwargs(args))
210 if sys.platform == 'win32':
211 # Need to remove existing file on Windows, else os.rename raises
212 # WindowsError or FileExistsError.
217 os.rename(tf.name, fn)
226 if sys.version_info >= (2, 7):
227 def find_xpath_attr(node, xpath, key, val=None):
228 """ Find the xpath xpath[@key=val] """
229 assert re.match(r'^[a-zA-Z_-]+$', key)
230 expr = xpath + ('[@%s]' % key if val is None else "[@%s='%s']" % (key, val))
231 return node.find(expr)
233 def find_xpath_attr(node, xpath, key, val=None):
234 for f in node.findall(compat_xpath(xpath)):
235 if key not in f.attrib:
237 if val is None or f.attrib.get(key) == val:
241 # On python2.6 the xml.etree.ElementTree.Element methods don't support
242 # the namespace parameter
245 def xpath_with_ns(path, ns_map):
246 components = [c.split(':') for c in path.split('/')]
250 replaced.append(c[0])
253 replaced.append('{%s}%s' % (ns_map[ns], tag))
254 return '/'.join(replaced)
257 def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
258 def _find_xpath(xpath):
259 return node.find(compat_xpath(xpath))
261 if isinstance(xpath, (str, compat_str)):
262 n = _find_xpath(xpath)
270 if default is not NO_DEFAULT:
273 name = xpath if name is None else name
274 raise ExtractorError('Could not find XML element %s' % name)
280 def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
281 n = xpath_element(node, xpath, name, fatal=fatal, default=default)
282 if n is None or n == default:
285 if default is not NO_DEFAULT:
288 name = xpath if name is None else name
289 raise ExtractorError('Could not find XML element\'s text %s' % name)
295 def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
296 n = find_xpath_attr(node, xpath, key)
298 if default is not NO_DEFAULT:
301 name = '%s[@%s]' % (xpath, key) if name is None else name
302 raise ExtractorError('Could not find XML attribute %s' % name)
def get_element_by_id(id, html):
    """Return the content of the tag with the specified ID in the passed HTML document"""
    # Thin wrapper: delegates to the generic attribute lookup with the
    # literal attribute name 'id'.
    return get_element_by_attribute('id', id, html)
def get_element_by_class(class_name, html):
    """Return the content of the first tag whose class attribute contains class_name."""
    # \b-delimited pattern so e.g. 'foo' does not match class="foobar";
    # the surrounding [^'"]* allows other classes in the same attribute.
    class_value = r'[^\'"]*\b%s\b[^\'"]*' % re.escape(class_name)
    return get_element_by_attribute('class', class_value, html, escape_value=False)
319 def get_element_by_attribute(attribute, value, html, escape_value=True):
320 """Return the content of the tag with the specified attribute in the passed HTML document"""
322 value = re.escape(value) if escape_value else value
324 m = re.search(r'''(?xs)
326 (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'))*?
328 (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'))*?
332 ''' % (re.escape(attribute), value), html)
336 res = m.group('content')
338 if res.startswith('"') or res.startswith("'"):
341 return unescapeHTML(res)
344 class HTMLAttributeParser(compat_HTMLParser):
345 """Trivial HTML parser to gather the attributes for a single element"""
348 compat_HTMLParser.__init__(self)
350 def handle_starttag(self, tag, attrs):
351 self.attrs = dict(attrs)
354 def extract_attributes(html_element):
355 """Given a string for an HTML element such as
357 a="foo" B="bar" c="&98;az" d=boz
358 empty= noval entity="&"
361 Decode and return a dictionary of attributes.
363 'a': 'foo', 'b': 'bar', c: 'baz', d: 'boz',
364 'empty': '', 'noval': None, 'entity': '&',
365 'sq': '"', 'dq': '\''
367 NB HTMLParser is stricter in Python 2.6 & 3.2 than in later versions,
368 but the cases in the unit test will work for all of 2.6, 2.7, 3.2-3.5.
370 parser = HTMLAttributeParser()
371 parser.feed(html_element)
376 def clean_html(html):
377 """Clean an HTML snippet into a readable string"""
379 if html is None: # Convenience for sanitizing descriptions etc.
383 html = html.replace('\n', ' ')
384 html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
385 html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
387 html = re.sub('<.*?>', '', html)
388 # Replace html entities
389 html = unescapeHTML(html)
393 def sanitize_open(filename, open_mode):
394 """Try to open the given filename, and slightly tweak it if this fails.
396 Attempts to open the given filename. If this fails, it tries to change
397 the filename slightly, step by step, until it's either able to open it
398 or it fails and raises a final exception, like the standard open()
401 It returns the tuple (stream, definitive_file_name).
405 if sys.platform == 'win32':
407 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
408 return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
409 stream = open(encodeFilename(filename), open_mode)
410 return (stream, filename)
411 except (IOError, OSError) as err:
412 if err.errno in (errno.EACCES,):
415 # In case of error, try to remove win32 forbidden chars
416 alt_filename = sanitize_path(filename)
417 if alt_filename == filename:
420 # An exception here should be caught in the caller
421 stream = open(encodeFilename(alt_filename), open_mode)
422 return (stream, alt_filename)
425 def timeconvert(timestr):
426 """Convert RFC 2822 defined time string into system timestamp"""
428 timetuple = email.utils.parsedate_tz(timestr)
429 if timetuple is not None:
430 timestamp = email.utils.mktime_tz(timetuple)
434 def sanitize_filename(s, restricted=False, is_id=False):
435 """Sanitizes a string so it could be used as part of a filename.
436 If restricted is set, use a stricter subset of allowed characters.
437 Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
439 def replace_insane(char):
440 if restricted and char in ACCENT_CHARS:
441 return ACCENT_CHARS[char]
442 if char == '?' or ord(char) < 32 or ord(char) == 127:
445 return '' if restricted else '\''
447 return '_-' if restricted else ' -'
448 elif char in '\\/|*<>':
450 if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
452 if restricted and ord(char) > 127:
457 s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)
458 result = ''.join(map(replace_insane, s))
460 while '__' in result:
461 result = result.replace('__', '_')
462 result = result.strip('_')
463 # Common case of "Foreign band name - English song title"
464 if restricted and result.startswith('-_'):
466 if result.startswith('-'):
467 result = '_' + result[len('-'):]
468 result = result.lstrip('.')
474 def sanitize_path(s):
475 """Sanitizes and normalizes path on Windows"""
476 if sys.platform != 'win32':
478 drive_or_unc, _ = os.path.splitdrive(s)
479 if sys.version_info < (2, 7) and not drive_or_unc:
480 drive_or_unc, _ = os.path.splitunc(s)
481 norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
485 path_part if path_part in ['.', '..'] else re.sub('(?:[/<>:"\\|\\\\?\\*]|[\s.]$)', '#', path_part)
486 for path_part in norm_path]
488 sanitized_path.insert(0, drive_or_unc + os.path.sep)
489 return os.path.join(*sanitized_path)
492 # Prepend protocol-less URLs with `http:` scheme in order to mitigate the number of
493 # unwanted failures due to missing protocol
def sanitize_url(url):
    """Prepend the `http:` scheme to protocol-relative URLs.

    Extracted URLs sometimes come without a protocol ('//host/path');
    defaulting them to HTTP mitigates unwanted failures. Any other URL is
    returned untouched.
    """
    if url.startswith('//'):
        return 'http:%s' % url
    return url
def sanitized_Request(url, *args, **kwargs):
    """Construct a compat_urllib_request.Request after normalizing
    protocol-relative URLs via sanitize_url()."""
    return compat_urllib_request.Request(sanitize_url(url), *args, **kwargs)
502 def orderedSet(iterable):
503 """ Remove all duplicates from the input iterable """
511 def _htmlentity_transform(entity_with_semicolon):
512 """Transforms an HTML entity to a character."""
513 entity = entity_with_semicolon[:-1]
515 # Known non-numeric HTML entity
516 if entity in compat_html_entities.name2codepoint:
517 return compat_chr(compat_html_entities.name2codepoint[entity])
519 # TODO: HTML5 allows entities without a semicolon. For example,
520 # 'Éric' should be decoded as 'Éric'.
521 if entity_with_semicolon in compat_html_entities_html5:
522 return compat_html_entities_html5[entity_with_semicolon]
524 mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
526 numstr = mobj.group(1)
527 if numstr.startswith('x'):
529 numstr = '0%s' % numstr
532 # See https://github.com/rg3/youtube-dl/issues/7518
534 return compat_chr(int(numstr, base))
538 # Unknown entity in name, return its literal representation
539 return '&%s;' % entity
545 assert type(s) == compat_str
548 r'&([^;]+;)', lambda m: _htmlentity_transform(m.group(1)), s)
551 def get_subprocess_encoding():
552 if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
553 # For subprocess calls, encode with locale encoding
554 # Refer to http://stackoverflow.com/a/9951851/35070
555 encoding = preferredencoding()
557 encoding = sys.getfilesystemencoding()
563 def encodeFilename(s, for_subprocess=False):
565 @param s The name of the file
568 assert type(s) == compat_str
570 # Python 3 has a Unicode API
571 if sys.version_info >= (3, 0):
574 # Pass '' directly to use Unicode APIs on Windows 2000 and up
575 # (Detecting Windows NT 4 is tricky because 'major >= 4' would
576 # match Windows 9x series as well. Besides, NT 4 is obsolete.)
577 if not for_subprocess and sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
580 # Jython assumes filenames are Unicode strings though reported as Python 2.x compatible
581 if sys.platform.startswith('java'):
584 return s.encode(get_subprocess_encoding(), 'ignore')
587 def decodeFilename(b, for_subprocess=False):
589 if sys.version_info >= (3, 0):
592 if not isinstance(b, bytes):
595 return b.decode(get_subprocess_encoding(), 'ignore')
598 def encodeArgument(s):
599 if not isinstance(s, compat_str):
600 # Legacy code that uses byte strings
601 # Uncomment the following line after fixing all post processors
602 # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
603 s = s.decode('ascii')
604 return encodeFilename(s, True)
607 def decodeArgument(b):
608 return decodeFilename(b, True)
611 def decodeOption(optval):
614 if isinstance(optval, bytes):
615 optval = optval.decode(preferredencoding())
617 assert isinstance(optval, compat_str)
621 def formatSeconds(secs):
623 return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
625 return '%d:%02d' % (secs // 60, secs % 60)
630 def make_HTTPS_handler(params, **kwargs):
631 opts_no_check_certificate = params.get('nocheckcertificate', False)
632 if hasattr(ssl, 'create_default_context'): # Python >= 3.4 or 2.7.9
633 context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH)
634 if opts_no_check_certificate:
635 context.check_hostname = False
636 context.verify_mode = ssl.CERT_NONE
638 return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
641 # (create_default_context present but HTTPSHandler has no context=)
644 if sys.version_info < (3, 2):
645 return YoutubeDLHTTPSHandler(params, **kwargs)
647 context = ssl.SSLContext(ssl.PROTOCOL_TLSv1)
648 context.verify_mode = (ssl.CERT_NONE
649 if opts_no_check_certificate
650 else ssl.CERT_REQUIRED)
651 context.set_default_verify_paths()
652 return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
655 def bug_reports_message():
656 if ytdl_is_updateable():
657 update_cmd = 'type youtube-dl -U to update'
659 update_cmd = 'see https://yt-dl.org/update on how to update'
660 msg = '; please report this issue on https://yt-dl.org/bug .'
661 msg += ' Make sure you are using the latest version; %s.' % update_cmd
662 msg += ' Be sure to call youtube-dl with the --verbose flag and include its complete output.'
666 class ExtractorError(Exception):
667 """Error during info extraction."""
669 def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
670 """ tb, if given, is the original traceback (so that it can be printed out).
671 If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
674 if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
676 if video_id is not None:
677 msg = video_id + ': ' + msg
679 msg += ' (caused by %r)' % cause
681 msg += bug_reports_message()
682 super(ExtractorError, self).__init__(msg)
685 self.exc_info = sys.exc_info() # preserve original exception
687 self.video_id = video_id
689 def format_traceback(self):
690 if self.traceback is None:
692 return ''.join(traceback.format_tb(self.traceback))
class UnsupportedError(ExtractorError):
    """Signals that no extractor recognises the given URL."""

    def __init__(self, url):
        message = 'Unsupported URL: %s' % url
        super(UnsupportedError, self).__init__(message, expected=True)
class RegexNotFoundError(ExtractorError):
    """Raised when a regular expression that was expected to match did not."""
707 class DownloadError(Exception):
708 """Download Error exception.
710 This exception may be thrown by FileDownloader objects if they are not
711 configured to continue on errors. They will contain the appropriate
715 def __init__(self, msg, exc_info=None):
716 """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
717 super(DownloadError, self).__init__(msg)
718 self.exc_info = exc_info
721 class SameFileError(Exception):
722 """Same File exception.
724 This exception will be thrown by FileDownloader objects if they detect
725 multiple files would have to be downloaded to the same file on disk.
730 class PostProcessingError(Exception):
731 """Post Processing exception.
733 This exception may be raised by PostProcessor's .run() method to
734 indicate an error in the postprocessing task.
737 def __init__(self, msg):
741 class MaxDownloadsReached(Exception):
742 """ --max-downloads limit has been reached. """
746 class UnavailableVideoError(Exception):
747 """Unavailable Format exception.
749 This exception will be thrown when a video is requested
750 in a format that is not available for that video.
755 class ContentTooShortError(Exception):
756 """Content Too Short exception.
758 This exception may be raised by FileDownloader objects when a file they
759 download is too small for what the server announced first, indicating
760 the connection was probably interrupted.
763 def __init__(self, downloaded, expected):
765 self.downloaded = downloaded
766 self.expected = expected
769 def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
770 # Working around python 2 bug (see http://bugs.python.org/issue17849) by limiting
771 # expected HTTP responses to meet HTTP/1.0 or later (see also
772 # https://github.com/rg3/youtube-dl/issues/6727)
773 if sys.version_info < (3, 0):
774 kwargs[b'strict'] = True
775 hc = http_class(*args, **kwargs)
776 source_address = ydl_handler._params.get('source_address')
777 if source_address is not None:
778 sa = (source_address, 0)
779 if hasattr(hc, 'source_address'): # Python 2.7+
780 hc.source_address = sa
782 def _hc_connect(self, *args, **kwargs):
783 sock = compat_socket_create_connection(
784 (self.host, self.port), self.timeout, sa)
786 self.sock = ssl.wrap_socket(
787 sock, self.key_file, self.cert_file,
788 ssl_version=ssl.PROTOCOL_TLSv1)
791 hc.connect = functools.partial(_hc_connect, hc)
def handle_youtubedl_headers(headers):
    """Strip youtube-dl internal pseudo-headers before the real request.

    When the 'Youtubedl-no-compression' marker is present, return a copy of
    the headers with the marker and any 'Accept-Encoding' header (matched
    case-insensitively) removed; otherwise return the mapping unmodified.
    """
    if 'Youtubedl-no-compression' not in headers:
        return headers
    cleaned = dict((name, value) for name, value in headers.items()
                   if name.lower() != 'accept-encoding')
    cleaned.pop('Youtubedl-no-compression')
    return cleaned
806 class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
807 """Handler for HTTP requests and responses.
809 This class, when installed with an OpenerDirector, automatically adds
810 the standard headers to every HTTP request and handles gzipped and
811 deflated responses from web servers. If compression is to be avoided in
812 a particular request, the original request in the program code only has
813 to include the HTTP header "Youtubedl-no-compression", which will be
814 removed before making the real request.
816 Part of this code was copied from:
818 http://techknack.net/python-urllib2-handlers/
820 Andrew Rowls, the author of that code, agreed to release it to the
824 def __init__(self, params, *args, **kwargs):
825 compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
826 self._params = params
828 def http_open(self, req):
829 conn_class = compat_http_client.HTTPConnection
831 socks_proxy = req.headers.get('Ytdl-socks-proxy')
833 conn_class = make_socks_conn_class(conn_class, socks_proxy)
834 del req.headers['Ytdl-socks-proxy']
836 return self.do_open(functools.partial(
837 _create_http_connection, self, conn_class, False),
843 return zlib.decompress(data, -zlib.MAX_WBITS)
845 return zlib.decompress(data)
848 def addinfourl_wrapper(stream, headers, url, code):
849 if hasattr(compat_urllib_request.addinfourl, 'getcode'):
850 return compat_urllib_request.addinfourl(stream, headers, url, code)
851 ret = compat_urllib_request.addinfourl(stream, headers, url)
855 def http_request(self, req):
856 # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
857 # always respected by websites, some tend to give out URLs with non percent-encoded
858 # non-ASCII characters (see telemb.py, ard.py [#3412])
859 # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
860 # To work around aforementioned issue we will replace request's original URL with
861 # percent-encoded one
862 # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
863 # the code of this workaround has been moved here from YoutubeDL.urlopen()
864 url = req.get_full_url()
865 url_escaped = escape_url(url)
867 # Substitute URL if any change after escaping
868 if url != url_escaped:
869 req = update_Request(req, url=url_escaped)
871 for h, v in std_headers.items():
872 # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
873 # The dict keys are capitalized because of this bug by urllib
874 if h.capitalize() not in req.headers:
877 req.headers = handle_youtubedl_headers(req.headers)
879 if sys.version_info < (2, 7) and '#' in req.get_full_url():
880 # Python 2.6 is brain-dead when it comes to fragments
881 req._Request__original = req._Request__original.partition('#')[0]
882 req._Request__r_type = req._Request__r_type.partition('#')[0]
886 def http_response(self, req, resp):
889 if resp.headers.get('Content-encoding', '') == 'gzip':
890 content = resp.read()
891 gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
893 uncompressed = io.BytesIO(gz.read())
894 except IOError as original_ioerror:
895 # There may be junk add the end of the file
896 # See http://stackoverflow.com/q/4928560/35070 for details
897 for i in range(1, 1024):
899 gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
900 uncompressed = io.BytesIO(gz.read())
905 raise original_ioerror
906 resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
907 resp.msg = old_resp.msg
908 del resp.headers['Content-encoding']
910 if resp.headers.get('Content-encoding', '') == 'deflate':
911 gz = io.BytesIO(self.deflate(resp.read()))
912 resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
913 resp.msg = old_resp.msg
914 del resp.headers['Content-encoding']
915 # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
916 # https://github.com/rg3/youtube-dl/issues/6457).
917 if 300 <= resp.code < 400:
918 location = resp.headers.get('Location')
920 # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
921 if sys.version_info >= (3, 0):
922 location = location.encode('iso-8859-1').decode('utf-8')
924 location = location.decode('utf-8')
925 location_escaped = escape_url(location)
926 if location != location_escaped:
927 del resp.headers['Location']
928 if sys.version_info < (3, 0):
929 location_escaped = location_escaped.encode('utf-8')
930 resp.headers['Location'] = location_escaped
933 https_request = http_request
934 https_response = http_response
937 def make_socks_conn_class(base_class, socks_proxy):
938 assert issubclass(base_class, (
939 compat_http_client.HTTPConnection, compat_http_client.HTTPSConnection))
941 url_components = compat_urlparse.urlparse(socks_proxy)
942 if url_components.scheme.lower() == 'socks5':
943 socks_type = ProxyType.SOCKS5
944 elif url_components.scheme.lower() in ('socks', 'socks4'):
945 socks_type = ProxyType.SOCKS4
946 elif url_components.scheme.lower() == 'socks4a':
947 socks_type = ProxyType.SOCKS4A
949 def unquote_if_non_empty(s):
952 return compat_urllib_parse_unquote_plus(s)
956 url_components.hostname, url_components.port or 1080,
958 unquote_if_non_empty(url_components.username),
959 unquote_if_non_empty(url_components.password),
962 class SocksConnection(base_class):
964 self.sock = sockssocket()
965 self.sock.setproxy(*proxy_args)
966 if type(self.timeout) in (int, float):
967 self.sock.settimeout(self.timeout)
968 self.sock.connect((self.host, self.port))
970 if isinstance(self, compat_http_client.HTTPSConnection):
971 if hasattr(self, '_context'): # Python > 2.6
972 self.sock = self._context.wrap_socket(
973 self.sock, server_hostname=self.host)
975 self.sock = ssl.wrap_socket(self.sock)
977 return SocksConnection
980 class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
981 def __init__(self, params, https_conn_class=None, *args, **kwargs):
982 compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
983 self._https_conn_class = https_conn_class or compat_http_client.HTTPSConnection
984 self._params = params
986 def https_open(self, req):
988 conn_class = self._https_conn_class
990 if hasattr(self, '_context'): # python > 2.6
991 kwargs['context'] = self._context
992 if hasattr(self, '_check_hostname'): # python 3.x
993 kwargs['check_hostname'] = self._check_hostname
995 socks_proxy = req.headers.get('Ytdl-socks-proxy')
997 conn_class = make_socks_conn_class(conn_class, socks_proxy)
998 del req.headers['Ytdl-socks-proxy']
1000 return self.do_open(functools.partial(
1001 _create_http_connection, self, conn_class, True),
class YoutubeDLCookieProcessor(compat_urllib_request.HTTPCookieProcessor):
    """Cookie processor that applies the same request/response hooks to
    HTTPS traffic as to plain HTTP (see the aliases at the bottom)."""

    def __init__(self, cookiejar=None):
        # Explicit base-class __init__ call, consistent with the other
        # handler classes in this file.
        compat_urllib_request.HTTPCookieProcessor.__init__(self, cookiejar)

    def http_response(self, request, response):
        # Python 2 will choke on next HTTP request in row if there are non-ASCII
        # characters in Set-Cookie HTTP header of last response (see
        # https://github.com/rg3/youtube-dl/issues/6769).
        # In order to at least prevent crashing we will percent encode Set-Cookie
        # header before HTTPCookieProcessor starts processing it.
        # if sys.version_info < (3, 0) and response.headers:
        #     for set_cookie_header in ('Set-Cookie', 'Set-Cookie2'):
        #         set_cookie = response.headers.get(set_cookie_header)
        #         set_cookie_escaped = compat_urllib_parse.quote(set_cookie, b"%/;:@&=+$,!~*'()?#[] ")
        #         if set_cookie != set_cookie_escaped:
        #             del response.headers[set_cookie_header]
        #             response.headers[set_cookie_header] = set_cookie_escaped
        return compat_urllib_request.HTTPCookieProcessor.http_response(self, request, response)

    # HTTPS requests/responses get exactly the same cookie handling as HTTP.
    https_request = compat_urllib_request.HTTPCookieProcessor.http_request
    https_response = http_response
1029 def extract_timezone(date_str):
1031 r'^.{8,}?(?P<tz>Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
1034 timezone = datetime.timedelta()
1036 date_str = date_str[:-len(m.group('tz'))]
1037 if not m.group('sign'):
1038 timezone = datetime.timedelta()
1040 sign = 1 if m.group('sign') == '+' else -1
1041 timezone = datetime.timedelta(
1042 hours=sign * int(m.group('hours')),
1043 minutes=sign * int(m.group('minutes')))
1044 return timezone, date_str
1047 def parse_iso8601(date_str, delimiter='T', timezone=None):
1048 """ Return a UNIX timestamp from the given date """
1050 if date_str is None:
1053 date_str = re.sub(r'\.[0-9]+', '', date_str)
1055 if timezone is None:
1056 timezone, date_str = extract_timezone(date_str)
1059 date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
1060 dt = datetime.datetime.strptime(date_str, date_format) - timezone
1061 return calendar.timegm(dt.timetuple())
def date_formats(day_first=True):
    """Return the strptime format table matching the requested day/month order."""
    if day_first:
        return DATE_FORMATS_DAY_FIRST
    return DATE_FORMATS_MONTH_FIRST
1070 def unified_strdate(date_str, day_first=True):
1071 """Return a string with the date in the format YYYYMMDD"""
1073 if date_str is None:
1077 date_str = date_str.replace(',', ' ')
1078 # Remove AM/PM + timezone
1079 date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
1080 _, date_str = extract_timezone(date_str)
1082 for expression in date_formats(day_first):
1084 upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
1087 if upload_date is None:
1088 timetuple = email.utils.parsedate_tz(date_str)
1091 upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
1094 if upload_date is not None:
1095 return compat_str(upload_date)
1098 def unified_timestamp(date_str, day_first=True):
1099 if date_str is None:
1102 date_str = date_str.replace(',', ' ')
1104 pm_delta = datetime.timedelta(hours=12 if re.search(r'(?i)PM', date_str) else 0)
1105 timezone, date_str = extract_timezone(date_str)
1107 # Remove AM/PM + timezone
1108 date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
1110 for expression in date_formats(day_first):
1112 dt = datetime.datetime.strptime(date_str, expression) - timezone + pm_delta
1113 return calendar.timegm(dt.timetuple())
1116 timetuple = email.utils.parsedate_tz(date_str)
1118 return calendar.timegm(timetuple.timetuple())
1121 def determine_ext(url, default_ext='unknown_video'):
1124 guess = url.partition('?')[0].rpartition('.')[2]
1125 if re.match(r'^[A-Za-z0-9]+$', guess):
1127 # Try extract ext from URLs like http://example.com/foo/bar.mp4/?download
1128 elif guess.rstrip('/') in KNOWN_EXTENSIONS:
1129 return guess.rstrip('/')
def subtitles_filename(filename, sub_lang, sub_format):
    """Build the subtitle file name: media basename + '.<lang>.<format>'."""
    stem = filename.rsplit('.', 1)[0]
    return '.'.join((stem, sub_lang, sub_format))
1138 def date_from_str(date_str):
1140 Return a datetime object from a string in the format YYYYMMDD or
1141 (now|today)[+-][0-9](day|week|month|year)(s)?"""
1142 today = datetime.date.today()
1143 if date_str in ('now', 'today'):
1145 if date_str == 'yesterday':
1146 return today - datetime.timedelta(days=1)
1147 match = re.match('(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
1148 if match is not None:
1149 sign = match.group('sign')
1150 time = int(match.group('time'))
1153 unit = match.group('unit')
1154 # A bad approximation?
1158 elif unit == 'year':
1162 delta = datetime.timedelta(**{unit: time})
1163 return today + delta
1164 return datetime.datetime.strptime(date_str, '%Y%m%d').date()
1167 def hyphenate_date(date_str):
1169 Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
1170 match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
1171 if match is not None:
1172 return '-'.join(match.groups())
1177 class DateRange(object):
1178 """Represents a time interval between two dates"""
1180 def __init__(self, start=None, end=None):
1181 """start and end must be strings in the format accepted by date"""
1182 if start is not None:
1183 self.start = date_from_str(start)
1185 self.start = datetime.datetime.min.date()
1187 self.end = date_from_str(end)
1189 self.end = datetime.datetime.max.date()
1190 if self.start > self.end:
1191 raise ValueError('Date range: "%s" , the start date must be before the end date' % self)
1195 """Returns a range that only contains the given day"""
1196 return cls(day, day)
1198 def __contains__(self, date):
1199 """Check if the date is in the range"""
1200 if not isinstance(date, datetime.date):
1201 date = date_from_str(date)
1202 return self.start <= date <= self.end
1205 return '%s - %s' % (self.start.isoformat(), self.end.isoformat())
1208 def platform_name():
1209 """ Returns the platform name as a compat_str """
1210 res = platform.platform()
1211 if isinstance(res, bytes):
1212 res = res.decode(preferredencoding())
1214 assert isinstance(res, compat_str)
def _windows_write_string(s, out):
    """ Returns True if the string was written using special methods,
    False if it has yet to be written out."""
    # Adapted from http://stackoverflow.com/a/3259271/35070

    import ctypes.wintypes

    try:
        fileno = out.fileno()
    except AttributeError:
        # If the output stream doesn't have a fileno, it's virtual
    except io.UnsupportedOperation:
        # Some strange Windows pseudo files?
    if fileno not in WIN_OUTPUT_IDS:

    # Resolve the Win32 console handle for this stream.
    GetStdHandle = ctypes.WINFUNCTYPE(
        ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
        (b'GetStdHandle', ctypes.windll.kernel32))
    h = GetStdHandle(WIN_OUTPUT_IDS[fileno])

    WriteConsoleW = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
        ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
        ctypes.wintypes.LPVOID)((b'WriteConsoleW', ctypes.windll.kernel32))
    written = ctypes.wintypes.DWORD(0)

    GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)((b'GetFileType', ctypes.windll.kernel32))
    FILE_TYPE_CHAR = 0x0002
    FILE_TYPE_REMOTE = 0x8000
    GetConsoleMode = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
        ctypes.POINTER(ctypes.wintypes.DWORD))(
        (b'GetConsoleMode', ctypes.windll.kernel32))
    INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value

    def not_a_console(handle):
        # True when the handle is invalid or not a character-mode console.
        if handle == INVALID_HANDLE_VALUE or handle is None:
        return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR or
                GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)

    if not_a_console(h):

    def next_nonbmp_pos(s):
        # Index of the first character outside the Basic Multilingual Plane.
        try:
            return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
        except StopIteration:

        # WriteConsoleW counts UTF-16 units; write at most 1024 BMP chars.
        count = min(next_nonbmp_pos(s), 1024)

        ret = WriteConsoleW(
            h, s, count if count else 2, ctypes.byref(written), None)
            raise OSError('Failed to write string')
        if not count:  # We just wrote a non-BMP character
            assert written.value == 2
            assert written.value > 0
            s = s[written.value:]
def write_string(s, out=None, encoding=None):
    # Write text to `out` (default stderr), coping with Windows consoles
    # and byte-mode streams.
    assert type(s) == compat_str

    if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
        if _windows_write_string(s, out):

    if ('b' in getattr(out, 'mode', '') or
            sys.version_info[0] < 3):  # Python 2 lies about mode of sys.stderr
        byt = s.encode(encoding or preferredencoding(), 'ignore')
    elif hasattr(out, 'buffer'):
        # Text stream with an underlying binary buffer: encode explicitly.
        enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
        byt = s.encode(enc, 'ignore')
        out.buffer.write(byt)
def bytes_to_intlist(bs):
    # Convert a bytes/str object into a list of integer byte values.
    if isinstance(bs[0], int):  # Python 3
        # Python 2 path: elements are 1-char strings, need ord().
        return [ord(c) for c in bs]
def intlist_to_bytes(xs):
    # Pack a list of integer byte values back into a bytes object.
    return compat_struct_pack('%dB' % len(xs), *xs)
# Cross-platform file locking
if sys.platform == 'win32':
    import ctypes.wintypes

    class OVERLAPPED(ctypes.Structure):
        # Win32 OVERLAPPED struct used by LockFileEx/UnlockFileEx.
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),

    kernel32 = ctypes.windll.kernel32
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    # Lock the whole file: low/high halves of the byte-range length.
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive):
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)
        handle = msvcrt.get_osfhandle(f.fileno())
        # 0x2 = LOCKFILE_EXCLUSIVE_LOCK, 0x0 = shared lock.
        if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
                          whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Locking file failed: %r' % ctypes.FormatError())

    def _unlock_file(f):
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        if not UnlockFileEx(handle, 0,
                            whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())

    # Some platforms, such as Jython, is missing fcntl

        def _lock_file(f, exclusive):
            fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)

        def _unlock_file(f):
            fcntl.flock(f, fcntl.LOCK_UN)

        UNSUPPORTED_MSG = 'file locking is not supported on this platform'

        def _lock_file(f, exclusive):
            raise IOError(UNSUPPORTED_MSG)

        def _unlock_file(f):
            raise IOError(UNSUPPORTED_MSG)
class locked_file(object):
    # File wrapper that holds an OS-level lock for the life of the
    # context manager (shared for 'r', exclusive for 'a'/'w').
    def __init__(self, filename, mode, encoding=None):
        assert mode in ['r', 'a', 'w']
        self.f = io.open(filename, mode, encoding=encoding)

    def __enter__(self):
        # Writers need an exclusive lock; readers share.
        exclusive = self.mode != 'r'
            _lock_file(self.f, exclusive)

    def __exit__(self, etype, value, traceback):
            _unlock_file(self.f)

    def write(self, *args):
        return self.f.write(*args)

    def read(self, *args):
        return self.f.read(*args)
def get_filesystem_encoding():
    """Return the OS filesystem encoding, defaulting to UTF-8."""
    enc = sys.getfilesystemencoding()
    if enc is None:
        enc = 'utf-8'
    return enc
def shell_quote(args):
    # Quote a list of arguments for safe display as a shell command line.
    encoding = get_filesystem_encoding()
        if isinstance(a, bytes):
            # We may get a filename encoded with 'encodeFilename'
            a = a.decode(encoding)
        quoted_args.append(pipes.quote(a))
    return ' '.join(quoted_args)
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use. """

    # Merge any data already smuggled into the URL.
    url, idata = unsmuggle_url(url, {})
    sdata = compat_urllib_parse_urlencode(
        {'__youtubedl_smuggle': json.dumps(data)})
    return url + '#' + sdata
def unsmuggle_url(smug_url, default=None):
    # Inverse of smuggle_url(): extract the JSON payload from the fragment.
    if '#__youtubedl_smuggle' not in smug_url:
        return smug_url, default
    url, _, sdata = smug_url.rpartition('#')
    jsond = compat_parse_qs(sdata)['__youtubedl_smuggle'][0]
    data = json.loads(jsond)
def format_bytes(bytes):
    # Human-readable size with binary (1024-based) suffixes.
    if type(bytes) is str:
        bytes = float(bytes)
        exponent = int(math.log(bytes, 1024.0))
    suffix = ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB'][exponent]
    converted = float(bytes) / float(1024 ** exponent)
    return '%.2f%s' % (converted, suffix)
def lookup_unit_table(unit_table, s):
    # Parse "<number> <unit>" using the multipliers in unit_table.
    units_re = '|'.join(re.escape(u) for u in unit_table)
        r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)\b' % units_re, s)
    # Accept a comma as decimal separator.
    num_str = m.group('num').replace(',', '.')
    mult = unit_table[m.group('unit')]
    return int(float(num_str) * mult)
def parse_filesize(s):
    # Parse a human-readable file size (e.g. "1.2MiB") into bytes.
    # The lower-case forms are of course incorrect and unofficial,
    # but we support those too
    return lookup_unit_table(_UNIT_TABLE, s)
    # Plain digits (with separators) parse directly as an integer.
    if re.match(r'^[\d,.]+$', s):
        return str_to_int(s)
    return lookup_unit_table(_UNIT_TABLE, s)
def month_by_name(name):
    """ Return the number of a month by (locale-independently) English name """
        return ENGLISH_MONTH_NAMES.index(name) + 1
def month_by_abbreviation(abbrev):
    """ Return the number of a month by (locale-independently) English
        # Match against the first three letters of each month name.
        return [s[:3] for s in ENGLISH_MONTH_NAMES].index(abbrev) + 1
def fix_xml_ampersands(xml_str):
    """Replace all the '&' by '&amp;' in XML"""
        # Leave already-escaped entities and numeric references untouched.
        r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
def setproctitle(title):
    # Best-effort: set the process title via prctl(PR_SET_NAME) on Linux.
    assert isinstance(title, compat_str)

    # ctypes in Jython is not complete
    # http://bugs.jython.org/issue2148
    if sys.platform.startswith('java'):

        libc = ctypes.cdll.LoadLibrary('libc.so.6')
    title_bytes = title.encode('utf-8')
    buf = ctypes.create_string_buffer(len(title_bytes))
    buf.value = title_bytes
        # 15 = PR_SET_NAME
        libc.prctl(15, buf, 0, 0, 0)
    except AttributeError:
        return  # Strange libc, just skip this
def remove_start(s, start):
    """Return s with a leading `start` stripped, if present (None passes through)."""
    if s is None:
        return s
    if s.startswith(start):
        return s[len(start):]
    return s
def remove_end(s, end):
    """Return s with a trailing `end` removed, if present.

    Passes None through unchanged. An empty `end` leaves s untouched:
    the naive `s[:-len(end)]` would evaluate to s[:0] == '' in that case.
    """
    if s is None or not end or not s.endswith(end):
        return s
    return s[:-len(end)]
def remove_quotes(s):
    # Strip one matching pair of surrounding single or double quotes.
    if s is None or len(s) < 2:
    for quote in ('"', "'", ):
        if s[0] == quote and s[-1] == quote:
def url_basename(url):
    """Return the final component of the URL's path (no slashes)."""
    path = compat_urlparse.urlparse(url).path
    segments = path.strip('/').split('/')
    return segments[-1]
class HEADRequest(compat_urllib_request.Request):
    # urllib Request whose HTTP verb is forced to HEAD.
    def get_method(self):
class PUTRequest(compat_urllib_request.Request):
    # urllib Request whose HTTP verb is forced to PUT.
    def get_method(self):
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    # Lenient int conversion with optional attribute lookup and scaling;
    # returns `default` when conversion is impossible.
            v = getattr(v, get_attr, None)
        return int(v) * invscale // scale
def str_or_none(v, default=None):
    """Coerce v to compat_str, returning `default` when v is None."""
    if v is None:
        return default
    return compat_str(v)
def str_to_int(int_str):
    """ A more relaxed version of int_or_none """
    # Drop thousands separators and a leading '+' before converting.
    int_str = re.sub(r'[,\.\+]', '', int_str)
def float_or_none(v, scale=1, invscale=1, default=None):
    # Lenient float conversion with scaling; `default` on failure.
        return float(v) * invscale / scale
def strip_or_none(v):
    """Strip surrounding whitespace, passing None through unchanged."""
    if v is None:
        return None
    return v.strip()
def parse_duration(s):
    # Parse a duration string (clock-style or verbose) into seconds.
    if not isinstance(s, compat_basestring):

    days, hours, mins, secs, ms = [None] * 5
    # First try clock style: [[[DD:]HH:]MM:]SS[.ms]
    m = re.match(r'(?:(?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?(?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?$', s)
        days, hours, mins, secs, ms = m.groups()
                # Verbose style, e.g. "1d 2h 3min 4.5s" (ISO-8601-ish).
                    (?P<days>[0-9]+)\s*d(?:ays?)?\s*
                    (?P<hours>[0-9]+)\s*h(?:ours?)?\s*
                    (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?\s*
                    (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
            days, hours, mins, secs, ms = m.groups()
            # Last resort: fractional hours or minutes only.
            m = re.match(r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)$', s)
                hours, mins = m.groups()

        duration += float(secs)
        duration += float(mins) * 60
        duration += float(hours) * 60 * 60
        duration += float(days) * 24 * 60 * 60
        duration += float(ms)
def prepend_extension(filename, ext, expected_real_ext=None):
    # Insert `ext` before the real extension ("a.mp4" -> "a.temp.mp4"),
    # or append it when the real extension is not the expected one.
    name, real_ext = os.path.splitext(filename)
        '{0}.{1}{2}'.format(name, ext, real_ext)
        if not expected_real_ext or real_ext[1:] == expected_real_ext
        else '{0}.{1}'.format(filename, ext))
def replace_extension(filename, ext, expected_real_ext=None):
    # Swap the file extension for `ext`, but only drop the old one when it
    # matches `expected_real_ext` (if given).
    name, real_ext = os.path.splitext(filename)
    return '{0}.{1}'.format(
        name if not expected_real_ext or real_ext[1:] == expected_real_ext else filename,
def check_executable(exe, args=[]):
    """ Checks if the given binary is installed somewhere in PATH, and returns its name.
    args can be a list of arguments for a short output (like -version) """
        subprocess.Popen([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
def get_exe_version(exe, args=['--version'],
                    version_re=None, unrecognized='present'):
    """ Returns the version of the specified executable,
    or False if the executable is not present """
        out, _ = subprocess.Popen(
            [encodeArgument(exe)] + args,
            stdout=subprocess.PIPE, stderr=subprocess.STDOUT).communicate()
    if isinstance(out, bytes):  # Python 2.x
        out = out.decode('ascii', 'ignore')
    return detect_exe_version(out, version_re, unrecognized)
def detect_exe_version(output, version_re=None, unrecognized='present'):
    # Extract a version string from `output`; fall back to `unrecognized`.
    assert isinstance(output, compat_str)
    if version_re is None:
        version_re = r'version\s+([-0-9._a-zA-Z]+)'
    m = re.search(version_re, output)
class PagedList(object):
    # Abstract base for lazily-paged result lists.
        # This is only useful for tests
        return len(self.getslice())
class OnDemandPagedList(PagedList):
    # PagedList that fetches pages on demand via `pagefunc`, with an
    # optional per-page cache.
    def __init__(self, pagefunc, pagesize, use_cache=False):
        self._pagefunc = pagefunc
        self._pagesize = pagesize
        self._use_cache = use_cache

    def getslice(self, start=0, end=None):
        for pagenum in itertools.count(start // self._pagesize):
            firstid = pagenum * self._pagesize
            nextfirstid = pagenum * self._pagesize + self._pagesize
            if start >= nextfirstid:

                page_results = self._cache.get(pagenum)
            if page_results is None:
                page_results = list(self._pagefunc(pagenum))
                self._cache[pagenum] = page_results

                # Offset into this page where the requested slice starts.
                start % self._pagesize
                if firstid <= start < nextfirstid

                ((end - 1) % self._pagesize) + 1
                if (end is not None and firstid <= end <= nextfirstid)

            if startv != 0 or endv is not None:
                page_results = page_results[startv:endv]
            res.extend(page_results)

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            if len(page_results) + startv < self._pagesize:

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == nextfirstid:
class InAdvancePagedList(PagedList):
    # PagedList with a known page count, fetched sequentially up front.
    def __init__(self, pagefunc, pagecount, pagesize):
        self._pagefunc = pagefunc
        self._pagecount = pagecount
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        start_page = start // self._pagesize
            # Clamp the last page to the total page count.
            self._pagecount if end is None else (end // self._pagesize + 1))
        skip_elems = start - start_page * self._pagesize
        only_more = None if end is None else end - start
        for pagenum in range(start_page, end_page):
            page = list(self._pagefunc(pagenum))
                page = page[skip_elems:]
            if only_more is not None:
                if len(page) < only_more:
                    only_more -= len(page)
                    page = page[:only_more]
def uppercase_escape(s):
    # Decode \UXXXXXXXX escapes embedded in s.
    unicode_escape = codecs.getdecoder('unicode_escape')
        r'\\U[0-9a-fA-F]{8}',
        lambda m: unicode_escape(m.group(0))[0],
def lowercase_escape(s):
    # Decode \uXXXX escapes embedded in s.
    unicode_escape = codecs.getdecoder('unicode_escape')
        r'\\u[0-9a-fA-F]{4}',
        lambda m: unicode_escape(m.group(0))[0],
def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986"""
    # Python 2's quote() cannot handle unicode; pre-encode to UTF-8 bytes.
    if sys.version_info < (3, 0) and isinstance(s, compat_str):
        s = s.encode('utf-8')
    safe_chars = b"%/;:@&=+$,!~*'()?#[]"
    return compat_urllib_parse.quote(s, safe_chars)
def escape_url(url):
    """Escape URL as suggested by RFC 3986"""
    url_parsed = compat_urllib_parse_urlparse(url)
    return url_parsed._replace(
        # IDNA-encode the host; percent-escape every other component.
        netloc=url_parsed.netloc.encode('idna').decode('ascii'),
        path=escape_rfc3986(url_parsed.path),
        params=escape_rfc3986(url_parsed.params),
        query=escape_rfc3986(url_parsed.query),
        fragment=escape_rfc3986(url_parsed.fragment)
def read_batch_urls(batch_fd):
    # Read a batch file of URLs, decoding bytes, stripping a UTF-8 BOM and
    # skipping comment lines.
        if not isinstance(url, compat_str):
            url = url.decode('utf-8', 'replace')
        BOM_UTF8 = '\xef\xbb\xbf'
        if url.startswith(BOM_UTF8):
            url = url[len(BOM_UTF8):]
        # Lines starting with these characters are comments.
        if url.startswith(('#', ';', ']')):

    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]
def urlencode_postdata(*args, **kargs):
    """URL-encode POST data and return it as ASCII bytes."""
    encoded = compat_urllib_parse_urlencode(*args, **kargs)
    return encoded.encode('ascii')
def update_url_query(url, query):
    # Merge `query` into the URL's existing query string.
    parsed_url = compat_urlparse.urlparse(url)
    qs = compat_parse_qs(parsed_url.query)
    return compat_urlparse.urlunparse(parsed_url._replace(
        query=compat_urllib_parse_urlencode(qs, True)))
def update_Request(req, url=None, data=None, headers={}, query={}):
    # Clone a urllib Request, optionally overriding URL, data, headers and
    # query parameters while preserving the HTTP method.
    req_headers = req.headers.copy()
    req_headers.update(headers)
    req_data = data or req.data
    req_url = update_url_query(url or req.get_full_url(), query)
    req_get_method = req.get_method()
    if req_get_method == 'HEAD':
        req_type = HEADRequest
    elif req_get_method == 'PUT':
        req_type = PUTRequest
        req_type = compat_urllib_request.Request
        req_url, data=req_data, headers=req_headers,
        origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
    if hasattr(req, 'timeout'):
        new_req.timeout = req.timeout
def dict_get(d, key_or_keys, default=None, skip_false_values=True):
    # Look up the first usable value for one key or a sequence of keys.
    if isinstance(key_or_keys, (list, tuple)):
        for key in key_or_keys:
            # Skip missing, None and (optionally) falsy values.
            if key not in d or d[key] is None or skip_false_values and not d[key]:
    return d.get(key_or_keys, default)
def try_get(src, getter, expected_type=None):
    # Apply `getter` to `src`, swallowing common lookup errors and
    # filtering the result by `expected_type`.
    except (AttributeError, KeyError, TypeError, IndexError):
        if expected_type is None or isinstance(v, expected_type):
def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
    """Return `string` as compat_str, decoding byte strings with `encoding`."""
    if isinstance(string, compat_str):
        return string
    return compat_str(string, encoding, errors)
def parse_age_limit(s):
    # Parse "18", "18+" or a US rating string into an age limit.
    m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
    return int(m.group('age')) if m else US_RATINGS.get(s)
def strip_jsonp(code):
    # Unwrap a JSONP callback, keeping only the JSON argument.
        r'(?s)^[a-zA-Z0-9_.$]+\s*\(\s*(.*)\);?\s*?(?://[^\n]*)*$', r'\1', code)
def js_to_json(code):
    # Convert JavaScript-ish object literals into strict JSON.
        if v in ('true', 'false', 'null'):
        # Comments and dangling commas are dropped.
        elif v.startswith('/*') or v == ',':

        if v[0] in ("'", '"'):
            v = re.sub(r'(?s)\\.|"', lambda m: {
            }.get(m.group(0), m.group(0)), v[1:-1])

            # Rewrite hex literals as decimal.
            (r'^0[xX][0-9a-fA-F]+', 16),

        for regex, base in INTEGER_TABLE:
            im = re.match(regex, v)
                i = int(im.group(0), base)
                # Integer object keys must become quoted strings in JSON.
                return '"%d":' % i if v.endswith(':') else '%d' % i

    return re.sub(r'''(?sx)
        "(?:[^"\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^"\\]*"|
        '(?:[^'\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^'\\]*'|
        /\*.*?\*/|,(?=\s*[\]}])|
        [a-zA-Z_][.a-zA-Z_0-9]*|
        \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:\s*:)?|
def qualities(quality_ids):
    """ Get a numeric quality value out of a list of possible values """
            # Rank equals the position in the given preference list.
            return quality_ids.index(qid)
# Default output filename template: "<title>-<id>.<ext>".
DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
def limit_length(s, length):
    """ Add ellipses to overly long strings """
        # Truncate so the result (including the ellipsis) fits in `length`.
        return s[:length - len(ELLIPSES)] + ELLIPSES
def version_tuple(v):
    """Split a dotted/dashed version string into a tuple of ints."""
    parts = re.split(r'[-.]', v)
    return tuple(int(part) for part in parts)
def is_outdated_version(version, limit, assume_new=True):
    # Compare version strings; on parse failure fall back to `assume_new`.
        return not assume_new
        return version_tuple(version) < version_tuple(limit)
        return not assume_new
def ytdl_is_updateable():
    """ Returns if youtube-dl can be updated with -U """
    from zipimport import zipimporter

    loader = globals().get('__loader__')
    return isinstance(loader, zipimporter) or hasattr(sys, 'frozen')
def args_to_str(args):
    # Get a short string representation for a subprocess command
    quoted = [compat_shlex_quote(a) for a in args]
    return ' '.join(quoted)
def error_to_compat_str(err):
    # Stringify an exception safely across Python 2/3.
    # On python 2 error byte string must be decoded with proper
    # encoding rather than ascii
    if sys.version_info[0] < 3:
        err_str = err_str.decode(preferredencoding())
def mimetype2ext(mt):
    # Map a MIME type to a file extension; unknown subtypes pass through.
        # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3. Here use .mp3 as
        # it's the most popular one
        'audio/mpeg': 'mp3',

    # Keep only the subtype (after the slash).
    _, _, res = mt.rpartition('/')

        'smptett+xml': 'tt',
        'x-mp4-fragmented': 'mp4',
        'x-mpegurl': 'm3u8',
        'vnd.apple.mpegurl': 'm3u8',
def urlhandle_detect_ext(url_handle):
    # Guess the file extension from response headers: prefer the
    # Content-Disposition filename, fall back to Content-Type.
    getheader = url_handle.headers.get

    cd = getheader('Content-Disposition')
        m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
            e = determine_ext(m.group('filename'), default_ext=None)

    return mimetype2ext(getheader('Content-Type'))
def encode_data_uri(data, mime_type):
    """Build an RFC 2397 data: URI embedding `data` as base64."""
    payload = base64.b64encode(data).decode('ascii')
    return 'data:{0};base64,{1}'.format(mime_type, payload)
def age_restricted(content_limit, age_limit):
    """ Returns True iff the content should be blocked """

    if age_limit is None:  # No limit set
    if content_limit is None:
        return False  # Content available for everyone
    return age_limit < content_limit
def is_html(first_bytes):
    """ Detect whether a file contains HTML by examining its first bytes. """

        # Known byte-order marks and their encodings.
        (b'\xef\xbb\xbf', 'utf-8'),
        (b'\x00\x00\xfe\xff', 'utf-32-be'),
        (b'\xff\xfe\x00\x00', 'utf-32-le'),
        (b'\xff\xfe', 'utf-16-le'),
        (b'\xfe\xff', 'utf-16-be'),
    for bom, enc in BOMS:
        if first_bytes.startswith(bom):
            s = first_bytes[len(bom):].decode(enc, 'replace')
        # No BOM found: assume UTF-8.
        s = first_bytes.decode('utf-8', 'replace')

    return re.match(r'^\s*<', s)
def determine_protocol(info_dict):
    # Derive the download protocol from explicit metadata, the URL scheme
    # prefix, or the file extension.
    protocol = info_dict.get('protocol')
    if protocol is not None:

    url = info_dict['url']
    if url.startswith('rtmp'):
    elif url.startswith('mms'):
    elif url.startswith('rtsp'):

    ext = determine_ext(url)

    return compat_urllib_parse_urlparse(url).scheme
def render_table(header_row, data):
    """ Render a list of rows, each as a list of values """
    rows = [header_row] + data
    # Width of each column = longest cell in that column.
    widths = []
    for column in zip(*rows):
        widths.append(max(len(compat_str(cell)) for cell in column))
    # Left-align every column but the last, with one space of padding.
    fmt = ' '.join('%-' + compat_str(w + 1) + 's' for w in widths[:-1]) + '%s'
    return '\n'.join(fmt % tuple(row) for row in rows)
def _match_one(filter_part, dct):
    # Evaluate one "key OP value" or unary "!key"/"key" filter against dct.
    COMPARISON_OPERATORS = {
    operator_rex = re.compile(r'''(?x)\s*
        \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
            (?P<intval>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)|
            (?P<strval>(?![0-9.])[a-z0-9A-Z]*)
        ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
    m = operator_rex.search(filter_part)
        op = COMPARISON_OPERATORS[m.group('op')]
        if m.group('strval') is not None:
            # Only (in)equality is defined for string comparisons.
            if m.group('op') not in ('=', '!='):
                    'Operator %s does not support string values!' % m.group('op'))
            comparison_value = m.group('strval')
                comparison_value = int(m.group('intval'))
                # Not a plain int: try a size suffix, with and without 'B'.
                comparison_value = parse_filesize(m.group('intval'))
                if comparison_value is None:
                    comparison_value = parse_filesize(m.group('intval') + 'B')
                if comparison_value is None:
                        'Invalid integer value %r in filter part %r' % (
                            m.group('intval'), filter_part))
        actual_value = dct.get(m.group('key'))
        if actual_value is None:
            # '?' after the operator makes a missing key match.
            return m.group('none_inclusive')
        return op(actual_value, comparison_value)

        '': lambda v: v is not None,
        '!': lambda v: v is None,
    operator_rex = re.compile(r'''(?x)\s*
        (?P<op>%s)\s*(?P<key>[a-z_]+)
        ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
    m = operator_rex.search(filter_part)
        op = UNARY_OPERATORS[m.group('op')]
        actual_value = dct.get(m.group('key'))
        return op(actual_value)

    raise ValueError('Invalid filter part %r' % filter_part)
def match_str(filter_str, dct):
    """ Filter a dictionary with a simple string syntax. Returns True (=passes filter) or false """
        # '&' joins sub-filters; all of them must match.
        _match_one(filter_part, dct) for filter_part in filter_str.split('&'))
def match_filter_func(filter_str):
    # Build a match_filter callable: returns None on match, or a skip
    # message string otherwise.
    def _match_func(info_dict):
        if match_str(filter_str, info_dict):
            video_title = info_dict.get('title', info_dict.get('id', 'video'))
            return '%s does not pass filter %s, skipping ..' % (video_title, filter_str)
def parse_dfxp_time_expr(time_expr):
    # Parse a TTML time expression ("12.3s" or "HH:MM:SS[.f]") to seconds.
    mobj = re.match(r'^(?P<time_offset>\d+(?:\.\d+)?)s?$', time_expr)
        return float(mobj.group('time_offset'))

    mobj = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
        # Some feeds use ':' instead of '.' before the fraction.
        return 3600 * int(mobj.group(1)) + 60 * int(mobj.group(2)) + float(mobj.group(3).replace(':', '.'))
def srt_subtitles_timecode(seconds):
    """Format a second count as an SRT timecode (HH:MM:SS,mmm)."""
    hrs = seconds / 3600
    mins = (seconds % 3600) / 60
    secs = seconds % 60
    msecs = (seconds % 1) * 1000
    return '%02d:%02d:%02d,%03d' % (hrs, mins, secs, msecs)
def dfxp2srt(dfxp_data):
    # Convert DFXP/TTML subtitles to SRT format.
    _x = functools.partial(xpath_with_ns, ns_map={
        'ttml': 'http://www.w3.org/ns/ttml',
        'ttaf1': 'http://www.w3.org/2006/10/ttaf1',
        'ttaf1_0604': 'http://www.w3.org/2006/04/ttaf1',

    class TTMLPElementParser(object):
        # Streaming parser target that flattens a <p> element into text.

        def start(self, tag, attrib):
            if tag in (_x('ttml:br'), _x('ttaf1:br'), 'br'):

        def data(self, data):

            return self.out.strip()

    def parse_node(node):
        target = TTMLPElementParser()
        parser = xml.etree.ElementTree.XMLParser(target=target)
        parser.feed(xml.etree.ElementTree.tostring(node))
        return parser.close()

    dfxp = compat_etree_fromstring(dfxp_data.encode('utf-8'))
    # Try each known TTML namespace before falling back to no namespace.
    paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall(_x('.//ttaf1:p')) or dfxp.findall(_x('.//ttaf1_0604:p')) or dfxp.findall('.//p')
        raise ValueError('Invalid dfxp/TTML subtitle')

    for para, index in zip(paras, itertools.count(1)):
        begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
        end_time = parse_dfxp_time_expr(para.attrib.get('end'))
        dur = parse_dfxp_time_expr(para.attrib.get('dur'))
        if begin_time is None:
                # No explicit end: derive it from the duration.
                end_time = begin_time + dur
        out.append('%d\n%s --> %s\n%s\n\n' % (
            srt_subtitles_timecode(begin_time),
            srt_subtitles_timecode(end_time),
def cli_option(params, command_option, param):
    """Translate params[param] into ``[command_option, value]`` (or ``[]``)."""
    value = params.get(param)
    if value is None:
        return []
    return [command_option, value]
def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
    # Render a boolean param as either "--opt=value" (with separator) or
    # ["--opt", "value"].
    param = params.get(param)
    assert isinstance(param, bool)
        return [command_option + separator + (true_value if param else false_value)]
    return [command_option, true_value if param else false_value]
def cli_valueless_option(params, command_option, param, expected_value=True):
    """Emit ``[command_option]`` when params[param] equals expected_value."""
    if params.get(param) == expected_value:
        return [command_option]
    return []
def cli_configuration_args(params, param, default=[]):
    # Fetch extra command-line args from params; must be a list if present.
    ex_args = params.get(param)
    assert isinstance(ex_args, list)
class ISO639Utils(object):
    # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
    def short2long(cls, code):
        """Convert language code from ISO 639-1 to ISO 639-2/T"""
        return cls._lang_map.get(code[:2])

    def long2short(cls, code):
        """Convert language code from ISO 639-2/T to ISO 639-1"""
        # Reverse lookup over the two-to-three letter mapping.
        for short_name, long_name in cls._lang_map.items():
            if long_name == code:
class ISO3166Utils(object):
    # From http://data.okfn.org/data/core/country-list
        'AF': 'Afghanistan',
        'AX': 'Åland Islands',
        'AS': 'American Samoa',
        'AG': 'Antigua and Barbuda',
        'BO': 'Bolivia, Plurinational State of',
        'BQ': 'Bonaire, Sint Eustatius and Saba',
        'BA': 'Bosnia and Herzegovina',
        'BV': 'Bouvet Island',
        'IO': 'British Indian Ocean Territory',
        'BN': 'Brunei Darussalam',
        'BF': 'Burkina Faso',
        'KY': 'Cayman Islands',
        'CF': 'Central African Republic',
        'CX': 'Christmas Island',
        'CC': 'Cocos (Keeling) Islands',
        'CD': 'Congo, the Democratic Republic of the',
        'CK': 'Cook Islands',
        'CI': 'Côte d\'Ivoire',
        'CZ': 'Czech Republic',
        'DO': 'Dominican Republic',
        'SV': 'El Salvador',
        'GQ': 'Equatorial Guinea',
        'FK': 'Falkland Islands (Malvinas)',
        'FO': 'Faroe Islands',
        'GF': 'French Guiana',
        'PF': 'French Polynesia',
        'TF': 'French Southern Territories',
        'GW': 'Guinea-Bissau',
        'HM': 'Heard Island and McDonald Islands',
        'VA': 'Holy See (Vatican City State)',
        'IR': 'Iran, Islamic Republic of',
        'IM': 'Isle of Man',
        'KP': 'Korea, Democratic People\'s Republic of',
        'KR': 'Korea, Republic of',
        'LA': 'Lao People\'s Democratic Republic',
        'LI': 'Liechtenstein',
        'MK': 'Macedonia, the Former Yugoslav Republic of',
        'MH': 'Marshall Islands',
        'FM': 'Micronesia, Federated States of',
        'MD': 'Moldova, Republic of',
        'NL': 'Netherlands',
        'NC': 'New Caledonia',
        'NZ': 'New Zealand',
        'NF': 'Norfolk Island',
        'MP': 'Northern Mariana Islands',
        'PS': 'Palestine, State of',
        'PG': 'Papua New Guinea',
        'PH': 'Philippines',
        'PR': 'Puerto Rico',
        'RU': 'Russian Federation',
        'BL': 'Saint Barthélemy',
        'SH': 'Saint Helena, Ascension and Tristan da Cunha',
        'KN': 'Saint Kitts and Nevis',
        'LC': 'Saint Lucia',
        'MF': 'Saint Martin (French part)',
        'PM': 'Saint Pierre and Miquelon',
        'VC': 'Saint Vincent and the Grenadines',
        'ST': 'Sao Tome and Principe',
        'SA': 'Saudi Arabia',
        'SL': 'Sierra Leone',
        'SX': 'Sint Maarten (Dutch part)',
        'SB': 'Solomon Islands',
        'ZA': 'South Africa',
        'GS': 'South Georgia and the South Sandwich Islands',
        'SS': 'South Sudan',
        'SJ': 'Svalbard and Jan Mayen',
        'CH': 'Switzerland',
        'SY': 'Syrian Arab Republic',
        'TW': 'Taiwan, Province of China',
        'TZ': 'Tanzania, United Republic of',
        'TL': 'Timor-Leste',
        'TT': 'Trinidad and Tobago',
        'TM': 'Turkmenistan',
        'TC': 'Turks and Caicos Islands',
        'AE': 'United Arab Emirates',
        'GB': 'United Kingdom',
        'US': 'United States',
        'UM': 'United States Minor Outlying Islands',
        'VE': 'Venezuela, Bolivarian Republic of',
        'VG': 'Virgin Islands, British',
        'VI': 'Virgin Islands, U.S.',
        'WF': 'Wallis and Futuna',
        'EH': 'Western Sahara',

    def short2full(cls, code):
        """Convert an ISO 3166-2 country code to the corresponding full name"""
        return cls._country_map.get(code.upper())
class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
    # ProxyHandler that honors a per-request 'Ytdl-request-proxy' header.
    def __init__(self, proxies=None):
        # Set default handlers
        for type in ('http', 'https'):
            setattr(self, '%s_open' % type,
                    lambda r, proxy='__noproxy__', type=type, meth=self.proxy_open:
                        meth(r, proxy, type))
        return compat_urllib_request.ProxyHandler.__init__(self, proxies)

    def proxy_open(self, req, proxy, type):
        req_proxy = req.headers.get('Ytdl-request-proxy')
        if req_proxy is not None:
            del req.headers['Ytdl-request-proxy']

        if proxy == '__noproxy__':
            return None  # No Proxy
        if compat_urlparse.urlparse(proxy).scheme.lower() in ('socks', 'socks4', 'socks4a', 'socks5'):
            req.add_header('Ytdl-socks-proxy', proxy)
            # youtube-dl's http/https handlers do wrapping the socket with socks
        return compat_urllib_request.ProxyHandler.proxy_open(
            self, req, proxy, type)
def ohdave_rsa_encrypt(data, exponent, modulus):
    """
    Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/

    Input:
        data: data to encrypt, bytes-like object
        exponent, modulus: parameter e and N of RSA algorithm, both integer
    Output: hex string of encrypted data

    Limitation: supports one block encryption only
    """
    # Interpret the reversed bytes as a little-endian big integer.
    payload = int(binascii.hexlify(data[::-1]), 16)
    ciphertext = pow(payload, exponent, modulus)
    return '%x' % ciphertext
def encode_base_n(num, n, table=None):
    # Encode an integer in base `n` using `table` as the digit alphabet.
    FULL_TABLE = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
        table = FULL_TABLE[:n]

        raise ValueError('base %d exceeds table length %d' % (n, len(table)))

        # Prepend digits from least to most significant.
        ret = table[num % n] + ret
def decode_packed_codes(code):
    # Unpack JavaScript obfuscated with the p.a.c.k.e.r. scheme.
        r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)",
    obfucasted_code, base, count, symbols = mobj.groups()
    symbols = symbols.split('|')

        # Map each base-n token back to its original symbol.
        base_n_count = encode_base_n(count, base)
        symbol_table[base_n_count] = symbols[count] or base_n_count

        r'\b(\w+)\b', lambda mobj: symbol_table[mobj.group(0)],
def parse_m3u8_attributes(attrib):
    # Parse an M3U8 attribute list (KEY=value pairs, values may be quoted).
    for (key, val) in re.findall(r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)', attrib):
        if val.startswith('"'):
def urshift(val, n):
    """Unsigned 32-bit right shift of `val` by `n` bits."""
    if val < 0:
        # Reinterpret the negative value as its 32-bit two's complement.
        val += 0x100000000
    return val >> n