youtube_dl/utils.py

   1 #!/usr/bin/env python
   2 # -*- coding: utf-8 -*-
   3
   4 from __future__ import unicode_literals
   5
   6 import base64
   7 import binascii
   8 import calendar
   9 import codecs
  10 import contextlib
  11 import ctypes
  12 import datetime
  13 import email.utils
  14 import errno
  15 import functools
  16 import gzip
  17 import itertools
  18 import io
  19 import json
  20 import locale
  21 import math
  22 import operator
  23 import os
  24 import pipes
  25 import platform
  26 import re
  27 import ssl
  28 import socket
  29 import struct
  30 import subprocess
  31 import sys
  32 import tempfile
  33 import traceback
  34 import xml.etree.ElementTree
  35 import zlib
  36
  37 from .compat import (
  38     compat_HTMLParser,
  39     compat_basestring,
  40     compat_chr,
  41     compat_etree_fromstring,
  42     compat_html_entities,
  43     compat_http_client,
  44     compat_kwargs,
  45     compat_parse_qs,
  46     compat_socket_create_connection,
  47     compat_str,
  48     compat_urllib_error,
  49     compat_urllib_parse,
  50     compat_urllib_parse_urlencode,
  51     compat_urllib_parse_urlparse,
  52     compat_urllib_request,
  53     compat_urlparse,
  54     compat_xpath,
  55     shlex_quote,
  56 )
  57
  58
  59 # This is not clearly defined otherwise
  60 compiled_regex_type = type(re.compile(''))
  61
  62 std_headers = {
  63     'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20150101 Firefox/44.0 (Chrome)',
  64     'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
  65     'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
  66     'Accept-Encoding': 'gzip, deflate',
  67     'Accept-Language': 'en-us,en;q=0.5',
  68 }
  69
  70
  71 NO_DEFAULT = object()
  72
  73 ENGLISH_MONTH_NAMES = [
  74     'January', 'February', 'March', 'April', 'May', 'June',
  75     'July', 'August', 'September', 'October', 'November', 'December']
  76
  77 KNOWN_EXTENSIONS = (
  78     'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'aac',
  79     'flv', 'f4v', 'f4a', 'f4b',
  80     'webm', 'ogg', 'ogv', 'oga', 'ogx', 'spx', 'opus',
  81     'mkv', 'mka', 'mk3d',
  82     'avi', 'divx',
  83     'mov',
  84     'asf', 'wmv', 'wma',
  85     '3gp', '3g2',
  86     'mp3',
  87     'flac',
  88     'ape',
  89     'wav',
  90     'f4f', 'f4m', 'm3u8', 'smil')
  91
  92
  93 def preferredencoding():
  94     """Get preferred encoding.
  95
  96     Returns the best encoding scheme for the system, based on
  97     locale.getpreferredencoding() and some further tweaks.
  98     """
  99     try:
 100         pref = locale.getpreferredencoding()
 101         'TEST'.encode(pref)
 102     except Exception:
 103         pref = 'UTF-8'
 104
 105     return pref
 106
 107
 108 def write_json_file(obj, fn):
 109     """ Encode obj as JSON and write it to fn, atomically if possible """
 110
 111     fn = encodeFilename(fn)
 112     if sys.version_info < (3, 0) and sys.platform != 'win32':
 113         encoding = get_filesystem_encoding()
 114         # os.path.basename returns a bytes object, but NamedTemporaryFile
 115         # will fail if the filename contains non ascii characters unless we
 116         # use a unicode object
 117         path_basename = lambda f: os.path.basename(fn).decode(encoding)
 118         # the same for os.path.dirname
 119         path_dirname = lambda f: os.path.dirname(fn).decode(encoding)
 120     else:
 121         path_basename = os.path.basename
 122         path_dirname = os.path.dirname
 123
 124     args = {
 125         'suffix': '.tmp',
 126         'prefix': path_basename(fn) + '.',
 127         'dir': path_dirname(fn),
 128         'delete': False,
 129     }
 130
 131     # In Python 2.x, json.dump expects a bytestream.
 132     # In Python 3.x, it writes to a character stream
 133     if sys.version_info < (3, 0):
 134         args['mode'] = 'wb'
 135     else:
 136         args.update({
 137             'mode': 'w',
 138             'encoding': 'utf-8',
 139         })
 140
 141     tf = tempfile.NamedTemporaryFile(**compat_kwargs(args))
 142
 143     try:
 144         with tf:
 145             json.dump(obj, tf)
 146         if sys.platform == 'win32':
 147             # Need to remove existing file on Windows, else os.rename raises
 148             # WindowsError or FileExistsError.
 149             try:
 150                 os.unlink(fn)
 151             except OSError:
 152                 pass
 153         os.rename(tf.name, fn)
 154     except Exception:
 155         try:
 156             os.remove(tf.name)
 157         except OSError:
 158             pass
 159         raise
 160
 161
 162 if sys.version_info >= (2, 7):
 163     def find_xpath_attr(node, xpath, key, val=None):
 164         """ Find the xpath xpath[@key=val] """
 165         assert re.match(r'^[a-zA-Z_-]+$', key)
 166         expr = xpath + ('[@%s]' % key if val is None else "[@%s='%s']" % (key, val))
 167         return node.find(expr)
 168 else:
 169     def find_xpath_attr(node, xpath, key, val=None):
 170         for f in node.findall(compat_xpath(xpath)):
 171             if key not in f.attrib:
 172                 continue
 173             if val is None or f.attrib.get(key) == val:
 174                 return f
 175         return None
 176
 177 # On python2.6 the xml.etree.ElementTree.Element methods don't support
 178 # the namespace parameter
 179
 180
 181 def xpath_with_ns(path, ns_map):
 182     components = [c.split(':') for c in path.split('/')]
 183     replaced = []
 184     for c in components:
 185         if len(c) == 1:
 186             replaced.append(c[0])
 187         else:
 188             ns, tag = c
 189             replaced.append('{%s}%s' % (ns_map[ns], tag))
 190     return '/'.join(replaced)
 191
 192
 193 def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
 194     def _find_xpath(xpath):
 195         return node.find(compat_xpath(xpath))
 196
 197     if isinstance(xpath, (str, compat_str)):
 198         n = _find_xpath(xpath)
 199     else:
 200         for xp in xpath:
 201             n = _find_xpath(xp)
 202             if n is not None:
 203                 break
 204
 205     if n is None:
 206         if default is not NO_DEFAULT:
 207             return default
 208         elif fatal:
 209             name = xpath if name is None else name
 210             raise ExtractorError('Could not find XML element %s' % name)
 211         else:
 212             return None
 213     return n
 214
 215
 216 def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
 217     n = xpath_element(node, xpath, name, fatal=fatal, default=default)
 218     if n is None or n == default:
 219         return n
 220     if n.text is None:
 221         if default is not NO_DEFAULT:
 222             return default
 223         elif fatal:
 224             name = xpath if name is None else name
 225             raise ExtractorError('Could not find XML element\'s text %s' % name)
 226         else:
 227             return None
 228     return n.text
 229
 230
 231 def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
 232     n = find_xpath_attr(node, xpath, key)
 233     if n is None:
 234         if default is not NO_DEFAULT:
 235             return default
 236         elif fatal:
 237             name = '%s[@%s]' % (xpath, key) if name is None else name
 238             raise ExtractorError('Could not find XML attribute %s' % name)
 239         else:
 240             return None
 241     return n.attrib[key]
 242
 243
def get_element_by_id(id, html):
    """Return the content of the tag with the specified ID in the passed HTML document

    Shorthand for get_element_by_attribute('id', id, html); None is returned
    when no matching tag is found.
    """
    return get_element_by_attribute('id', id, html)
 247
 248
 249 def get_element_by_attribute(attribute, value, html):
 250     """Return the content of the tag with the specified attribute in the passed HTML document"""
 251
 252     m = re.search(r'''(?xs)
 253         <([a-zA-Z0-9:._-]+)
 254          (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
 255          \s+%s=['"]?%s['"]?
 256          (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
 257         \s*>
 258         (?P<content>.*?)
 259         </\1>
 260     ''' % (re.escape(attribute), re.escape(value)), html)
 261
 262     if not m:
 263         return None
 264     res = m.group('content')
 265
 266     if res.startswith('"') or res.startswith("'"):
 267         res = res[1:-1]
 268
 269     return unescapeHTML(res)
 270
 271
class HTMLAttributeParser(compat_HTMLParser):
    """Trivial HTML parser to gather the attributes for a single element"""
    def __init__(self):
        # Attributes of the most recently seen start tag; empty until one
        # has been parsed
        self.attrs = {}
        compat_HTMLParser.__init__(self)

    def handle_starttag(self, tag, attrs):
        # Replaces any earlier result, so only the last start tag is kept
        self.attrs = dict(attrs)
 280
 281
 282 def extract_attributes(html_element):
 283     """Given a string for an HTML element such as
 284     <el
 285          a="foo" B="bar" c="&98;az" d=boz
 286          empty= noval entity="&amp;"
 287          sq='"' dq="'"
 288     >
 289     Decode and return a dictionary of attributes.
 290     {
 291         'a': 'foo', 'b': 'bar', c: 'baz', d: 'boz',
 292         'empty': '', 'noval': None, 'entity': '&',
 293         'sq': '"', 'dq': '\''
 294     }.
 295     NB HTMLParser is stricter in Python 2.6 & 3.2 than in later versions,
 296     but the cases in the unit test will work for all of 2.6, 2.7, 3.2-3.5.
 297     """
 298     parser = HTMLAttributeParser()
 299     parser.feed(html_element)
 300     parser.close()
 301     return parser.attrs
 302
 303
 304 def clean_html(html):
 305     """Clean an HTML snippet into a readable string"""
 306
 307     if html is None:  # Convenience for sanitizing descriptions etc.
 308         return html
 309
 310     # Newline vs <br />
 311     html = html.replace('\n', ' ')
 312     html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
 313     html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
 314     # Strip html tags
 315     html = re.sub('<.*?>', '', html)
 316     # Replace html entities
 317     html = unescapeHTML(html)
 318     return html.strip()
 319
 320
 321 def sanitize_open(filename, open_mode):
 322     """Try to open the given filename, and slightly tweak it if this fails.
 323
 324     Attempts to open the given filename. If this fails, it tries to change
 325     the filename slightly, step by step, until it's either able to open it
 326     or it fails and raises a final exception, like the standard open()
 327     function.
 328
 329     It returns the tuple (stream, definitive_file_name).
 330     """
 331     try:
 332         if filename == '-':
 333             if sys.platform == 'win32':
 334                 import msvcrt
 335                 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
 336             return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
 337         stream = open(encodeFilename(filename), open_mode)
 338         return (stream, filename)
 339     except (IOError, OSError) as err:
 340         if err.errno in (errno.EACCES,):
 341             raise
 342
 343         # In case of error, try to remove win32 forbidden chars
 344         alt_filename = sanitize_path(filename)
 345         if alt_filename == filename:
 346             raise
 347         else:
 348             # An exception here should be caught in the caller
 349             stream = open(encodeFilename(alt_filename), open_mode)
 350             return (stream, alt_filename)
 351
 352
 353 def timeconvert(timestr):
 354     """Convert RFC 2822 defined time string into system timestamp"""
 355     timestamp = None
 356     timetuple = email.utils.parsedate_tz(timestr)
 357     if timetuple is not None:
 358         timestamp = email.utils.mktime_tz(timetuple)
 359     return timestamp
 360
 361
 362 def sanitize_filename(s, restricted=False, is_id=False):
 363     """Sanitizes a string so it could be used as part of a filename.
 364     If restricted is set, use a stricter subset of allowed characters.
 365     Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
 366     """
 367     def replace_insane(char):
 368         if char == '?' or ord(char) < 32 or ord(char) == 127:
 369             return ''
 370         elif char == '"':
 371             return '' if restricted else '\''
 372         elif char == ':':
 373             return '_-' if restricted else ' -'
 374         elif char in '\\/|*<>':
 375             return '_'
 376         if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
 377             return '_'
 378         if restricted and ord(char) > 127:
 379             return '_'
 380         return char
 381
 382     # Handle timestamps
 383     s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)
 384     result = ''.join(map(replace_insane, s))
 385     if not is_id:
 386         while '__' in result:
 387             result = result.replace('__', '_')
 388         result = result.strip('_')
 389         # Common case of "Foreign band name - English song title"
 390         if restricted and result.startswith('-_'):
 391             result = result[2:]
 392         if result.startswith('-'):
 393             result = '_' + result[len('-'):]
 394         result = result.lstrip('.')
 395         if not result:
 396             result = '_'
 397     return result
 398
 399
 400 def sanitize_path(s):
 401     """Sanitizes and normalizes path on Windows"""
 402     if sys.platform != 'win32':
 403         return s
 404     drive_or_unc, _ = os.path.splitdrive(s)
 405     if sys.version_info < (2, 7) and not drive_or_unc:
 406         drive_or_unc, _ = os.path.splitunc(s)
 407     norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
 408     if drive_or_unc:
 409         norm_path.pop(0)
 410     sanitized_path = [
 411         path_part if path_part in ['.', '..'] else re.sub('(?:[/<>:"\\|\\\\?\\*]|[\s.]$)', '#', path_part)
 412         for path_part in norm_path]
 413     if drive_or_unc:
 414         sanitized_path.insert(0, drive_or_unc + os.path.sep)
 415     return os.path.join(*sanitized_path)
 416
 417
 418 # Prepend protocol-less URLs with `http:` scheme in order to mitigate the number of
 419 # unwanted failures due to missing protocol
 420 def sanitize_url(url):
 421     return 'http:%s' % url if url.startswith('//') else url
 422
 423
def sanitized_Request(url, *args, **kwargs):
    """Build a compat_urllib_request.Request for url after passing it through sanitize_url().

    All further positional and keyword arguments are forwarded unchanged.
    """
    return compat_urllib_request.Request(sanitize_url(url), *args, **kwargs)
 426
 427
 428 def orderedSet(iterable):
 429     """ Remove all duplicates from the input iterable """
 430     res = []
 431     for el in iterable:
 432         if el not in res:
 433             res.append(el)
 434     return res
 435
 436
 437 def _htmlentity_transform(entity):
 438     """Transforms an HTML entity to a character."""
 439     # Known non-numeric HTML entity
 440     if entity in compat_html_entities.name2codepoint:
 441         return compat_chr(compat_html_entities.name2codepoint[entity])
 442
 443     mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
 444     if mobj is not None:
 445         numstr = mobj.group(1)
 446         if numstr.startswith('x'):
 447             base = 16
 448             numstr = '0%s' % numstr
 449         else:
 450             base = 10
 451         # See https://github.com/rg3/youtube-dl/issues/7518
 452         try:
 453             return compat_chr(int(numstr, base))
 454         except ValueError:
 455             pass
 456
 457     # Unknown entity in name, return its literal representation
 458     return '&%s;' % entity
 459
 460
 461 def unescapeHTML(s):
 462     if s is None:
 463         return None
 464     assert type(s) == compat_str
 465
 466     return re.sub(
 467         r'&([^;]+);', lambda m: _htmlentity_transform(m.group(1)), s)
 468
 469
 470 def get_subprocess_encoding():
 471     if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
 472         # For subprocess calls, encode with locale encoding
 473         # Refer to http://stackoverflow.com/a/9951851/35070
 474         encoding = preferredencoding()
 475     else:
 476         encoding = sys.getfilesystemencoding()
 477     if encoding is None:
 478         encoding = 'utf-8'
 479     return encoding
 480
 481
 482 def encodeFilename(s, for_subprocess=False):
 483     """
 484     @param s The name of the file
 485     """
 486
 487     assert type(s) == compat_str
 488
 489     # Python 3 has a Unicode API
 490     if sys.version_info >= (3, 0):
 491         return s
 492
 493     # Pass '' directly to use Unicode APIs on Windows 2000 and up
 494     # (Detecting Windows NT 4 is tricky because 'major >= 4' would
 495     # match Windows 9x series as well. Besides, NT 4 is obsolete.)
 496     if not for_subprocess and sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
 497         return s
 498
 499     # Jython assumes filenames are Unicode strings though reported as Python 2.x compatible
 500     if sys.platform.startswith('java'):
 501         return s
 502
 503     return s.encode(get_subprocess_encoding(), 'ignore')
 504
 505
 506 def decodeFilename(b, for_subprocess=False):
 507
 508     if sys.version_info >= (3, 0):
 509         return b
 510
 511     if not isinstance(b, bytes):
 512         return b
 513
 514     return b.decode(get_subprocess_encoding(), 'ignore')
 515
 516
 517 def encodeArgument(s):
 518     if not isinstance(s, compat_str):
 519         # Legacy code that uses byte strings
 520         # Uncomment the following line after fixing all post processors
 521         # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
 522         s = s.decode('ascii')
 523     return encodeFilename(s, True)
 524
 525
def decodeArgument(b):
    """Decode a command line argument; shorthand for decodeFilename(b, True)."""
    return decodeFilename(b, True)
 528
 529
 530 def decodeOption(optval):
 531     if optval is None:
 532         return optval
 533     if isinstance(optval, bytes):
 534         optval = optval.decode(preferredencoding())
 535
 536     assert isinstance(optval, compat_str)
 537     return optval
 538
 539
 540 def formatSeconds(secs):
 541     if secs > 3600:
 542         return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
 543     elif secs > 60:
 544         return '%d:%02d' % (secs // 60, secs % 60)
 545     else:
 546         return '%d' % secs
 547
 548
 549 def make_HTTPS_handler(params, **kwargs):
 550     opts_no_check_certificate = params.get('nocheckcertificate', False)
 551     if hasattr(ssl, 'create_default_context'):  # Python >= 3.4 or 2.7.9
 552         context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH)
 553         if opts_no_check_certificate:
 554             context.check_hostname = False
 555             context.verify_mode = ssl.CERT_NONE
 556         try:
 557             return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
 558         except TypeError:
 559             # Python 2.7.8
 560             # (create_default_context present but HTTPSHandler has no context=)
 561             pass
 562
 563     if sys.version_info < (3, 2):
 564         return YoutubeDLHTTPSHandler(params, **kwargs)
 565     else:  # Python < 3.4
 566         context = ssl.SSLContext(ssl.PROTOCOL_TLSv1)
 567         context.verify_mode = (ssl.CERT_NONE
 568                                if opts_no_check_certificate
 569                                else ssl.CERT_REQUIRED)
 570         context.set_default_verify_paths()
 571         return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
 572
 573
 574 def bug_reports_message():
 575     if ytdl_is_updateable():
 576         update_cmd = 'type  youtube-dl -U  to update'
 577     else:
 578         update_cmd = 'see  https://yt-dl.org/update  on how to update'
 579     msg = '; please report this issue on https://yt-dl.org/bug .'
 580     msg += ' Make sure you are using the latest version; %s.' % update_cmd
 581     msg += ' Be sure to call youtube-dl with the --verbose flag and include its complete output.'
 582     return msg
 583
 584
 585 class ExtractorError(Exception):
 586     """Error during info extraction."""
 587
 588     def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
 589         """ tb, if given, is the original traceback (so that it can be printed out).
 590         If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
 591         """
 592
 593         if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
 594             expected = True
 595         if video_id is not None:
 596             msg = video_id + ': ' + msg
 597         if cause:
 598             msg += ' (caused by %r)' % cause
 599         if not expected:
 600             msg += bug_reports_message()
 601         super(ExtractorError, self).__init__(msg)
 602
 603         self.traceback = tb
 604         self.exc_info = sys.exc_info()  # preserve original exception
 605         self.cause = cause
 606         self.video_id = video_id
 607
 608     def format_traceback(self):
 609         if self.traceback is None:
 610             return None
 611         return ''.join(traceback.format_tb(self.traceback))
 612
 613
 614 class UnsupportedError(ExtractorError):
 615     def __init__(self, url):
 616         super(UnsupportedError, self).__init__(
 617             'Unsupported URL: %s' % url, expected=True)
 618         self.url = url
 619
 620
class RegexNotFoundError(ExtractorError):
    """Error when a regex didn't match

    Adds nothing to ExtractorError; it exists so that this case can be
    told apart by its type.
    """
    pass
 624
 625
class DownloadError(Exception):
    """Download Error exception.

    This exception may be raised by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """

    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        super(DownloadError, self).__init__(msg)
        # Kept on the instance so that handlers can inspect the original error
        self.exc_info = exc_info
 638
 639
class SameFileError(Exception):
    """Same File exception.

    This exception will be raised by FileDownloader objects if they detect
    that multiple files would have to be downloaded to the same file on disk.
    """
    pass
 647
 648
class PostProcessingError(Exception):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """

    def __init__(self, msg):
        # The message is exposed as the .msg attribute
        self.msg = msg
 658
 659
class MaxDownloadsReached(Exception):
    """ The --max-downloads limit has been reached. """
    pass
 663
 664
class UnavailableVideoError(Exception):
    """Unavailable Format exception.

    This exception will be raised when a video is requested
    in a format that is not available for that video.
    """
    pass
 672
 673
 674 class ContentTooShortError(Exception):
 675     """Content Too Short exception.
 676
 677     This exception may be raised by FileDownloader objects when a file they
 678     download is too small for what the server announced first, indicating
 679     the connection was probably interrupted.
 680     """
 681
 682     def __init__(self, downloaded, expected):
 683         # Both in bytes
 684         self.downloaded = downloaded
 685         self.expected = expected
 686
 687
 688 def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
 689     # Working around python 2 bug (see http://bugs.python.org/issue17849) by limiting
 690     # expected HTTP responses to meet HTTP/1.0 or later (see also
 691     # https://github.com/rg3/youtube-dl/issues/6727)
 692     if sys.version_info < (3, 0):
 693         kwargs[b'strict'] = True
 694     hc = http_class(*args, **kwargs)
 695     source_address = ydl_handler._params.get('source_address')
 696     if source_address is not None:
 697         sa = (source_address, 0)
 698         if hasattr(hc, 'source_address'):  # Python 2.7+
 699             hc.source_address = sa
 700         else:  # Python 2.6
 701             def _hc_connect(self, *args, **kwargs):
 702                 sock = compat_socket_create_connection(
 703                     (self.host, self.port), self.timeout, sa)
 704                 if is_https:
 705                     self.sock = ssl.wrap_socket(
 706                         sock, self.key_file, self.cert_file,
 707                         ssl_version=ssl.PROTOCOL_TLSv1)
 708                 else:
 709                     self.sock = sock
 710             hc.connect = functools.partial(_hc_connect, hc)
 711
 712     return hc
 713
 714
 715 def handle_youtubedl_headers(headers):
 716     filtered_headers = headers
 717
 718     if 'Youtubedl-no-compression' in filtered_headers:
 719         filtered_headers = dict((k, v) for k, v in filtered_headers.items() if k.lower() != 'accept-encoding')
 720         del filtered_headers['Youtubedl-no-compression']
 721
 722     return filtered_headers
 723
 724
 725 class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
 726     """Handler for HTTP requests and responses.
 727
 728     This class, when installed with an OpenerDirector, automatically adds
 729     the standard headers to every HTTP request and handles gzipped and
 730     deflated responses from web servers. If compression is to be avoided in
 731     a particular request, the original request in the program code only has
 732     to include the HTTP header "Youtubedl-no-compression", which will be
 733     removed before making the real request.
 734
 735     Part of this code was copied from:
 736
 737     http://techknack.net/python-urllib2-handlers/
 738
 739     Andrew Rowls, the author of that code, agreed to release it to the
 740     public domain.
 741     """
 742
 743     def __init__(self, params, *args, **kwargs):
 744         compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
 745         self._params = params
 746
 747     def http_open(self, req):
 748         return self.do_open(functools.partial(
 749             _create_http_connection, self, compat_http_client.HTTPConnection, False),
 750             req)
 751
 752     @staticmethod
 753     def deflate(data):
 754         try:
 755             return zlib.decompress(data, -zlib.MAX_WBITS)
 756         except zlib.error:
 757             return zlib.decompress(data)
 758
 759     @staticmethod
 760     def addinfourl_wrapper(stream, headers, url, code):
 761         if hasattr(compat_urllib_request.addinfourl, 'getcode'):
 762             return compat_urllib_request.addinfourl(stream, headers, url, code)
 763         ret = compat_urllib_request.addinfourl(stream, headers, url)
 764         ret.code = code
 765         return ret
 766
 767     def http_request(self, req):
 768         # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
 769         # always respected by websites, some tend to give out URLs with non percent-encoded
 770         # non-ASCII characters (see telemb.py, ard.py [#3412])
 771         # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
 772         # To work around aforementioned issue we will replace request's original URL with
 773         # percent-encoded one
 774         # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
 775         # the code of this workaround has been moved here from YoutubeDL.urlopen()
 776         url = req.get_full_url()
 777         url_escaped = escape_url(url)
 778
 779         # Substitute URL if any change after escaping
 780         if url != url_escaped:
 781             req = update_Request(req, url=url_escaped)
 782
 783         for h, v in std_headers.items():
 784             # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
 785             # The dict keys are capitalized because of this bug by urllib
 786             if h.capitalize() not in req.headers:
 787                 req.add_header(h, v)
 788
 789         req.headers = handle_youtubedl_headers(req.headers)
 790
 791         if sys.version_info < (2, 7) and '#' in req.get_full_url():
 792             # Python 2.6 is brain-dead when it comes to fragments
 793             req._Request__original = req._Request__original.partition('#')[0]
 794             req._Request__r_type = req._Request__r_type.partition('#')[0]
 795
 796         return req
 797
 798     def http_response(self, req, resp):
 799         old_resp = resp
 800         # gzip
 801         if resp.headers.get('Content-encoding', '') == 'gzip':
 802             content = resp.read()
 803             gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
 804             try:
 805                 uncompressed = io.BytesIO(gz.read())
 806             except IOError as original_ioerror:
 807                 # There may be junk add the end of the file
 808                 # See http://stackoverflow.com/q/4928560/35070 for details
 809                 for i in range(1, 1024):
 810                     try:
 811                         gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
 812                         uncompressed = io.BytesIO(gz.read())
 813                     except IOError:
 814                         continue
 815                     break
 816                 else:
 817                     raise original_ioerror
 818             resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
 819             resp.msg = old_resp.msg
 820             del resp.headers['Content-encoding']
 821         # deflate
 822         if resp.headers.get('Content-encoding', '') == 'deflate':
 823             gz = io.BytesIO(self.deflate(resp.read()))
 824             resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
 825             resp.msg = old_resp.msg
 826             del resp.headers['Content-encoding']
 827         # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
 828         # https://github.com/rg3/youtube-dl/issues/6457).
 829         if 300 <= resp.code < 400:
 830             location = resp.headers.get('Location')
 831             if location:
 832                 # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
 833                 if sys.version_info >= (3, 0):
 834                     location = location.encode('iso-8859-1').decode('utf-8')
 835                 location_escaped = escape_url(location)
 836                 if location != location_escaped:
 837                     del resp.headers['Location']
 838                     resp.headers['Location'] = location_escaped
 839         return resp
 840
 841     https_request = http_request
 842     https_response = http_response
 843
 844
 845 class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
 846     def __init__(self, params, https_conn_class=None, *args, **kwargs):
 847         compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
 848         self._https_conn_class = https_conn_class or compat_http_client.HTTPSConnection
 849         self._params = params
 850
 851     def https_open(self, req):
 852         kwargs = {}
 853         if hasattr(self, '_context'):  # python > 2.6
 854             kwargs['context'] = self._context
 855         if hasattr(self, '_check_hostname'):  # python 3.x
 856             kwargs['check_hostname'] = self._check_hostname
 857         return self.do_open(functools.partial(
 858             _create_http_connection, self, self._https_conn_class, True),
 859             req, **kwargs)
 860
 861
class YoutubeDLCookieProcessor(compat_urllib_request.HTTPCookieProcessor):
    """HTTPCookieProcessor whose request/response hooks are also registered
    for HTTPS (see the aliases at the bottom of the class)."""

    def __init__(self, cookiejar=None):
        compat_urllib_request.HTTPCookieProcessor.__init__(self, cookiejar)

    def http_response(self, request, response):
        """Hand the response to the base HTTPCookieProcessor.

        The Set-Cookie escaping workaround described below is currently
        disabled (commented out), so this is a plain pass-through.
        """
        # Python 2 will choke on next HTTP request in row if there are non-ASCII
        # characters in Set-Cookie HTTP header of last response (see
        # https://github.com/rg3/youtube-dl/issues/6769).
        # In order to at least prevent crashing we will percent encode Set-Cookie
        # header before HTTPCookieProcessor starts processing it.
        # if sys.version_info < (3, 0) and response.headers:
        #     for set_cookie_header in ('Set-Cookie', 'Set-Cookie2'):
        #         set_cookie = response.headers.get(set_cookie_header)
        #         if set_cookie:
        #             set_cookie_escaped = compat_urllib_parse.quote(set_cookie, b"%/;:@&=+$,!~*'()?#[] ")
        #             if set_cookie != set_cookie_escaped:
        #                 del response.headers[set_cookie_header]
        #                 response.headers[set_cookie_header] = set_cookie_escaped
        return compat_urllib_request.HTTPCookieProcessor.http_response(self, request, response)

    # Use the same hooks for HTTPS traffic as for HTTP
    https_request = compat_urllib_request.HTTPCookieProcessor.http_request
    https_response = http_response
 884
 885
 886 def parse_iso8601(date_str, delimiter='T', timezone=None):
 887     """ Return a UNIX timestamp from the given date """
 888
 889     if date_str is None:
 890         return None
 891
 892     date_str = re.sub(r'\.[0-9]+', '', date_str)
 893
 894     if timezone is None:
 895         m = re.search(
 896             r'(?:Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
 897             date_str)
 898         if not m:
 899             timezone = datetime.timedelta()
 900         else:
 901             date_str = date_str[:-len(m.group(0))]
 902             if not m.group('sign'):
 903                 timezone = datetime.timedelta()
 904             else:
 905                 sign = 1 if m.group('sign') == '+' else -1
 906                 timezone = datetime.timedelta(
 907                     hours=sign * int(m.group('hours')),
 908                     minutes=sign * int(m.group('minutes')))
 909     try:
 910         date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
 911         dt = datetime.datetime.strptime(date_str, date_format) - timezone
 912         return calendar.timegm(dt.timetuple())
 913     except ValueError:
 914         pass
 915
 916
 917 def unified_strdate(date_str, day_first=True):
 918     """Return a string with the date in the format YYYYMMDD"""
 919
 920     if date_str is None:
 921         return None
 922     upload_date = None
 923     # Replace commas
 924     date_str = date_str.replace(',', ' ')
 925     # %z (UTC offset) is only supported in python>=3.2
 926     if not re.match(r'^[0-9]{1,2}-[0-9]{1,2}-[0-9]{4}$', date_str):
 927         date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str)
 928     # Remove AM/PM + timezone
 929     date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
 930
 931     format_expressions = [
 932         '%d %B %Y',
 933         '%d %b %Y',
 934         '%B %d %Y',
 935         '%b %d %Y',
 936         '%b %dst %Y %I:%M',
 937         '%b %dnd %Y %I:%M',
 938         '%b %dth %Y %I:%M',
 939         '%Y %m %d',
 940         '%Y-%m-%d',
 941         '%Y/%m/%d',
 942         '%Y/%m/%d %H:%M:%S',
 943         '%Y-%m-%d %H:%M:%S',
 944         '%Y-%m-%d %H:%M:%S.%f',
 945         '%d.%m.%Y %H:%M',
 946         '%d.%m.%Y %H.%M',
 947         '%Y-%m-%dT%H:%M:%SZ',
 948         '%Y-%m-%dT%H:%M:%S.%fZ',
 949         '%Y-%m-%dT%H:%M:%S.%f0Z',
 950         '%Y-%m-%dT%H:%M:%S',
 951         '%Y-%m-%dT%H:%M:%S.%f',
 952         '%Y-%m-%dT%H:%M',
 953     ]
 954     if day_first:
 955         format_expressions.extend([
 956             '%d-%m-%Y',
 957             '%d.%m.%Y',
 958             '%d/%m/%Y',
 959             '%d/%m/%y',
 960             '%d/%m/%Y %H:%M:%S',
 961         ])
 962     else:
 963         format_expressions.extend([
 964             '%m-%d-%Y',
 965             '%m.%d.%Y',
 966             '%m/%d/%Y',
 967             '%m/%d/%y',
 968             '%m/%d/%Y %H:%M:%S',
 969         ])
 970     for expression in format_expressions:
 971         try:
 972             upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
 973         except ValueError:
 974             pass
 975     if upload_date is None:
 976         timetuple = email.utils.parsedate_tz(date_str)
 977         if timetuple:
 978             upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
 979     if upload_date is not None:
 980         return compat_str(upload_date)
 981
 982
 983 def determine_ext(url, default_ext='unknown_video'):
 984     if url is None:
 985         return default_ext
 986     guess = url.partition('?')[0].rpartition('.')[2]
 987     if re.match(r'^[A-Za-z0-9]+$', guess):
 988         return guess
 989     # Try extract ext from URLs like http://example.com/foo/bar.mp4/?download
 990     elif guess.rstrip('/') in KNOWN_EXTENSIONS:
 991         return guess.rstrip('/')
 992     else:
 993         return default_ext
 994
 995
 996 def subtitles_filename(filename, sub_lang, sub_format):
 997     return filename.rsplit('.', 1)[0] + '.' + sub_lang + '.' + sub_format
 998
 999
def date_from_str(date_str):
    """
    Return a datetime.date object from a string in the format YYYYMMDD or
    (now|today)[+-][0-9](day|week|month|year)(s)?

    'now', 'today' and 'yesterday' are accepted on their own as well.
    Months and years are approximated as 30 and 365 days respectively.
    Raises ValueError if date_str matches none of these forms.
    """
    today = datetime.date.today()
    if date_str in ('now', 'today'):
        return today
    if date_str == 'yesterday':
        return today - datetime.timedelta(days=1)
    # Raw string literal: '\d' in a plain literal is an invalid escape sequence
    match = re.match(r'(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
    if match is not None:
        sign = match.group('sign')
        time = int(match.group('time'))
        if sign == '-':
            time = -time
        unit = match.group('unit')
        # timedelta knows neither months nor years; approximate them in days
        if unit == 'month':
            unit = 'day'
            time *= 30
        elif unit == 'year':
            unit = 'day'
            time *= 365
        # timedelta keyword arguments are plural ('days', 'weeks')
        unit += 's'
        delta = datetime.timedelta(**{unit: time})
        return today + delta
    return datetime.datetime.strptime(date_str, '%Y%m%d').date()
1027
1028
def hyphenate_date(date_str):
    """
    Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format

    Strings that are not made of exactly eight digits are returned as is."""
    parts = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
    if parts is None:
        return date_str
    return '-'.join(parts.groups())
1037
1038
class DateRange(object):
    """Represents a time interval between two dates"""

    def __init__(self, start=None, end=None):
        """start and end must be strings in the format accepted by
        date_from_str; a missing bound leaves that side of the range open.

        Raises ValueError if start lies after end."""
        if start is None:
            self.start = datetime.datetime.min.date()
        else:
            self.start = date_from_str(start)
        if end is None:
            self.end = datetime.datetime.max.date()
        else:
            self.end = date_from_str(end)
        if self.start > self.end:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)

    @classmethod
    def day(cls, day):
        """Returns a range that only contains the given day"""
        return cls(day, day)

    def __contains__(self, date):
        """Check if the date is in the range"""
        # Anything that is not already a date is parsed as a date string
        candidate = date if isinstance(date, datetime.date) else date_from_str(date)
        return self.start <= candidate <= self.end

    def __str__(self):
        return '%s - %s' % (self.start.isoformat(), self.end.isoformat())
1068
1069
def platform_name():
    """ Returns the platform name as a compat_str """
    name = platform.platform()
    # A byte string result is decoded with the preferred encoding
    if isinstance(name, bytes):
        name = name.decode(preferredencoding())

    assert isinstance(name, compat_str)
    return name
1078
1079
def _windows_write_string(s, out):
    """ Returns True if the string was written using special methods,
    False if it has yet to be written out.

    s is the text to write and out the target stream. The special method is
    the Windows WriteConsoleW API, which is only used when out's file
    descriptor is 1 or 2 and the corresponding standard handle refers to a
    console. Raises OSError if WriteConsoleW reports a failure."""
    # Adapted from http://stackoverflow.com/a/3259271/35070

    import ctypes
    import ctypes.wintypes

    # File descriptor -> value passed to GetStdHandle
    # (-11 / -12 are STD_OUTPUT_HANDLE / STD_ERROR_HANDLE in the Windows API)
    WIN_OUTPUT_IDS = {
        1: -11,
        2: -12,
    }

    try:
        fileno = out.fileno()
    except AttributeError:
        # If the output stream doesn't have a fileno, it's virtual
        return False
    except io.UnsupportedOperation:
        # Some strange Windows pseudo files?
        return False
    if fileno not in WIN_OUTPUT_IDS:
        return False

    # Bind the needed kernel32 functions with explicit prototypes
    GetStdHandle = ctypes.WINFUNCTYPE(
        ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
        (b'GetStdHandle', ctypes.windll.kernel32))
    h = GetStdHandle(WIN_OUTPUT_IDS[fileno])

    WriteConsoleW = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
        ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
        ctypes.wintypes.LPVOID)((b'WriteConsoleW', ctypes.windll.kernel32))
    # Receives the number of characters written by each WriteConsoleW call
    written = ctypes.wintypes.DWORD(0)

    GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)((b'GetFileType', ctypes.windll.kernel32))
    FILE_TYPE_CHAR = 0x0002
    FILE_TYPE_REMOTE = 0x8000
    GetConsoleMode = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
        ctypes.POINTER(ctypes.wintypes.DWORD))(
        (b'GetConsoleMode', ctypes.windll.kernel32))
    INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value

    def not_a_console(handle):
        # True for a missing/invalid handle, for a handle whose file type
        # (ignoring the REMOTE flag) is not a character device, or when
        # GetConsoleMode fails for it
        if handle == INVALID_HANDLE_VALUE or handle is None:
            return True
        return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR or
                GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)

    if not_a_console(h):
        return False

    def next_nonbmp_pos(s):
        # Index of the first character above U+FFFF, or len(s) if there is none
        try:
            return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
        except StopIteration:
            return len(s)

    # Write in chunks of at most 1024 characters that contain no non-BMP
    # character; a leading non-BMP character is written on its own
    while s:
        count = min(next_nonbmp_pos(s), 1024)

        # count == 0 means s starts with a non-BMP character, for which a
        # length of 2 is passed
        # NOTE(review): presumably its two UTF-16 code units - confirm
        ret = WriteConsoleW(
            h, s, count if count else 2, ctypes.byref(written), None)
        if ret == 0:
            raise OSError('Failed to write string')
        if not count:  # We just wrote a non-BMP character
            assert written.value == 2
            s = s[1:]
        else:
            # Continue after whatever part of the chunk was actually written
            assert written.value > 0
            s = s[written.value:]
    return True
1153
1154
def write_string(s, out=None, encoding=None):
    """Write the text s to out (sys.stderr by default) and flush it.

    On Windows, when no encoding is given, writing is first attempted via
    _windows_write_string. Otherwise s is encoded (ignoring unencodable
    characters) for byte-oriented streams, or written as text.
    """
    if out is None:
        out = sys.stderr
    assert type(s) == compat_str

    on_windows_console_path = (
        sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'))
    if on_windows_console_path and _windows_write_string(s, out):
        return

    opened_binary = 'b' in getattr(out, 'mode', '')
    # Python 2 lies about mode of sys.stderr
    if opened_binary or sys.version_info[0] < 3:
        out.write(s.encode(encoding or preferredencoding(), 'ignore'))
    elif hasattr(out, 'buffer'):
        # Text stream with an underlying byte buffer: encode ourselves so
        # that unencodable characters are dropped instead of raising
        target_encoding = encoding or getattr(out, 'encoding', None) or preferredencoding()
        out.buffer.write(s.encode(target_encoding, 'ignore'))
    else:
        out.write(s)
    out.flush()
1175
1176
def bytes_to_intlist(bs):
    """Convert a byte string to a list of integer byte values"""
    if not bs:
        return []
    # Indexing yields ints on Python 3 but 1-character strings on Python 2
    if isinstance(bs[0], int):
        return list(bs)
    return [ord(char) for char in bs]
1184
1185
def intlist_to_bytes(xs):
    """Pack a sequence of integer byte values into a byte string"""
    if xs:
        # One unsigned char ('B') per value
        return struct_pack('%dB' % len(xs), *xs)
    return b''
1190
1191
# Cross-platform file locking
# Defines _lock_file(f, exclusive) and _unlock_file(f) for the running
# platform: LockFileEx/UnlockFileEx on Windows, fcntl.flock elsewhere, and
# stubs raising IOError where fcntl cannot be imported.
if sys.platform == 'win32':
    import ctypes.wintypes
    import msvcrt

    # Structure handed to LockFileEx/UnlockFileEx; its Offset fields give
    # the start of the byte range to lock
    class OVERLAPPED(ctypes.Structure):
        _fields_ = [
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),
        ]

    kernel32 = ctypes.windll.kernel32
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    # Low/high halves of the length of the locked range (starting at
    # offset 0); chosen large enough to span the whole file
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive):
        """Lock the file object f, exclusively if exclusive is true.

        Raises OSError if LockFileEx fails."""
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        # Kept on the file object so that _unlock_file can pass the same
        # structure to UnlockFileEx
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)
        handle = msvcrt.get_osfhandle(f.fileno())
        # dwFlags 0x2 is LOCKFILE_EXCLUSIVE_LOCK; 0x0 requests a shared lock
        if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
                          whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Locking file failed: %r' % ctypes.FormatError())

    def _unlock_file(f):
        """Release the lock taken on f by _lock_file.

        Raises OSError if UnlockFileEx fails."""
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        if not UnlockFileEx(handle, 0,
                            whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())

else:
    # Some platforms, such as Jython, are missing fcntl
    try:
        import fcntl

        def _lock_file(f, exclusive):
            """Take an exclusive or shared flock on f."""
            fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)

        def _unlock_file(f):
            """Release the flock held on f."""
            fcntl.flock(f, fcntl.LOCK_UN)
    except ImportError:
        UNSUPPORTED_MSG = 'file locking is not supported on this platform'

        def _lock_file(f, exclusive):
            """Always raise IOError: locking is unavailable here."""
            raise IOError(UNSUPPORTED_MSG)

        def _unlock_file(f):
            """Always raise IOError: locking is unavailable here."""
            raise IOError(UNSUPPORTED_MSG)
1265
1266
class locked_file(object):
    """Context manager around a file that is locked while the block runs.

    The file is opened on construction; the lock is taken on entering the
    'with' block and released, with the file closed, on leaving it.
    """

    def __init__(self, filename, mode, encoding=None):
        assert mode in ('r', 'a', 'w')
        self.f = io.open(filename, mode, encoding=encoding)
        self.mode = mode

    def __enter__(self):
        # Only plain reading gets a shared lock
        wants_exclusive = self.mode != 'r'
        try:
            _lock_file(self.f, wants_exclusive)
        except IOError:
            # Do not leak the open file if locking is impossible
            self.f.close()
            raise
        return self

    def __exit__(self, etype, value, traceback):
        try:
            _unlock_file(self.f)
        finally:
            self.f.close()

    def __iter__(self):
        return iter(self.f)

    def write(self, *args):
        return self.f.write(*args)

    def read(self, *args):
        return self.f.read(*args)
1296
1297
def get_filesystem_encoding():
    """Return the filesystem encoding, or 'utf-8' if Python reports none"""
    reported = sys.getfilesystemencoding()
    if reported is None:
        return 'utf-8'
    return reported
1301
1302
def shell_quote(args):
    """Return args as one string of shell-quoted, space-separated words"""
    encoding = get_filesystem_encoding()

    def as_text(arg):
        # We may get a filename encoded with 'encodeFilename'
        return arg.decode(encoding) if isinstance(arg, bytes) else arg

    return ' '.join([pipes.quote(as_text(arg)) for arg in args])
1312
1313
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use. """
    # The JSON-encoded data travels in the URL fragment
    payload = {'__youtubedl_smuggle': json.dumps(data)}
    return url + '#' + compat_urllib_parse_urlencode(payload)
1320
1321
def unsmuggle_url(smug_url, default=None):
    """Split a URL produced by smuggle_url into (url, data).

    URLs without smuggled data are returned unchanged along with default.
    """
    if '#__youtubedl_smuggle' in smug_url:
        url, _, fragment = smug_url.rpartition('#')
        encoded = compat_parse_qs(fragment)['__youtubedl_smuggle'][0]
        return url, json.loads(encoded)
    return smug_url, default
1329
1330
def format_bytes(bytes):
    """Format a byte count as a human-readable string with a binary suffix.

    bytes may be a number, a numeric string, or None ('N/A' is returned for
    None). The result has two decimals, e.g. '1.50KiB'. Values smaller than
    one byte are reported in 'B', values beyond the largest known suffix in
    'YiB', and negative values keep their sign.
    """
    if bytes is None:
        return 'N/A'
    if isinstance(bytes, str):
        bytes = float(bytes)
    suffixes = ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB']
    magnitude = abs(bytes)
    if magnitude == 0.0:
        exponent = 0
    else:
        exponent = int(math.log(magnitude, 1024.0))
        # Clamp to the suffix table: a negative exponent (tiny values) would
        # otherwise index the list from its end, and a too large one would
        # raise IndexError
        exponent = max(0, min(exponent, len(suffixes) - 1))
    suffix = suffixes[exponent]
    converted = float(bytes) / float(1024 ** exponent)
    return '%.2f%s' % (converted, suffix)
1343
1344
def lookup_unit_table(unit_table, s):
    """Parse '<number><unit>' at the start of s and scale it via unit_table.

    unit_table maps unit strings to multipliers. The number may use ',' or
    '.' as decimal separator. Returns an int, or None if s does not start
    with a number followed by a known unit.
    """
    unit_alternatives = '|'.join([re.escape(unit) for unit in unit_table])
    found = re.match(
        r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)\b' % unit_alternatives, s)
    if found is None:
        return None
    number = found.group('num').replace(',', '.')
    multiplier = unit_table[found.group('unit')]
    return int(float(number) * multiplier)
1354
1355
def parse_filesize(s):
    """Parse a file size such as '1.5 MiB' into a number of bytes.

    Returns None if s is None or is not a number followed by a known unit.
    """
    if s is None:
        return None

    # The lower-case forms are of course incorrect and unofficial,
    # but we support those too
    _UNIT_TABLE = {
        'B': 1,
        'b': 1,
    }
    for power, prefix in enumerate('KMGTPEZY', 1):
        binary = 1024 ** power
        decimal = 1000 ** power
        # e.g. for 'K': KiB and kB are binary, KB and Kb are decimal
        _UNIT_TABLE[prefix + 'iB'] = binary
        _UNIT_TABLE[prefix + 'B'] = decimal
        _UNIT_TABLE[prefix.lower() + 'B'] = binary
        _UNIT_TABLE[prefix + 'b'] = decimal

    return lookup_unit_table(_UNIT_TABLE, s)
1400
1401
def parse_count(s):
    """Parse a count such as '1,234' or '1.2M' into an int.

    Returns None if s is None or is neither a plain number nor a number
    followed by a known multiplier suffix.
    """
    if s is None:
        return None

    s = s.strip()

    # Digits with separators only: no multiplier suffix to resolve
    if re.match(r'^[\d,.]+$', s):
        return str_to_int(s)

    thousand = 1000
    million = 1000 ** 2
    _UNIT_TABLE = {
        'k': thousand,
        'K': thousand,
        'm': million,
        'M': million,
        'kk': million,
        'KK': million,
    }

    return lookup_unit_table(_UNIT_TABLE, s)
1421
1422
def month_by_name(name):
    """ Return the number of a month by (locale-independently) English name """

    for number, month_name in enumerate(ENGLISH_MONTH_NAMES, 1):
        if month_name == name:
            return number
    return None
1430
1431
def month_by_abbreviation(abbrev):
    """ Return the number of a month by (locale-independently) English
        abbreviations """

    # The abbreviation is the first three letters of the full month name
    for number, month_name in enumerate(ENGLISH_MONTH_NAMES, 1):
        if month_name[:3] == abbrev:
            return number
    return None
1440
1441
def fix_xml_ampersands(xml_str):
    """Replace all the '&' by '&amp;' in XML

    Ampersands that already start a predefined or numeric entity are kept."""
    bare_ampersand = re.compile(
        r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)')
    return bare_ampersand.sub('&amp;', xml_str)
1448
1449
def setproctitle(title):
    """Try to set the name of the current process to title.

    This is best-effort: nothing happens on Jython, when libc.so.6 cannot
    be loaded, or when the loaded libc has no prctl.
    """
    assert isinstance(title, compat_str)

    # ctypes in Jython is not complete
    # http://bugs.jython.org/issue2148
    if sys.platform.startswith('java'):
        return

    try:
        libc = ctypes.cdll.LoadLibrary('libc.so.6')
    except OSError:
        return
    title_bytes = title.encode('utf-8')
    # Initialising the buffer from the bytes makes it one byte larger than
    # the title, so the string handed to prctl is NUL-terminated
    buf = ctypes.create_string_buffer(title_bytes)
    try:
        # 15 is PR_SET_NAME (see linux/prctl.h)
        libc.prctl(15, buf, 0, 0, 0)
    except AttributeError:
        return  # Strange libc, just skip this
1469
1470
def remove_start(s, start):
    """Return s without the leading start, or s itself if it does not begin with it"""
    return s[len(start):] if s.startswith(start) else s
1475
1476
def remove_end(s, end):
    """Return s without the trailing end, or s itself if it does not end with it"""
    # An empty suffix must not reach the slice below: s[:-0] is s[:0],
    # which would wipe the whole string
    if end and s.endswith(end):
        return s[:-len(end)]
    return s
1481
1482
def remove_quotes(s):
    """Strip one pair of matching single or double quotes enclosing s"""
    if s is None or len(s) < 2:
        return s
    first, last = s[0], s[-1]
    for quote in ('"', "'"):
        if first == quote and last == quote:
            return s[1:-1]
    return s
1490
1491
def url_basename(url):
    """Return the last component of the path of url"""
    url_path = compat_urlparse.urlparse(url).path
    trimmed = url_path.strip('/')
    return trimmed.split('/')[-1]
1495
1496
class HEADRequest(compat_urllib_request.Request):
    """Request whose HTTP method is always HEAD."""

    def get_method(self):
        return 'HEAD'
1500
1501
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    """Convert v to an int, or return default if that is not possible.

    If get_attr is given, the attribute of that name is read from v first
    (a missing attribute counts as None). None and the empty string yield
    default. The converted value is multiplied by invscale and then
    floor-divided by scale.
    """
    if get_attr and v is not None:
        v = getattr(v, get_attr, None)
    if v == '':
        v = None
    if v is None:
        return default
    try:
        return int(v) * invscale // scale
    # TypeError: unconvertible types such as lists; OverflowError: infinity
    except (ValueError, TypeError, OverflowError):
        return default
1514
1515
def str_or_none(v, default=None):
    """Return v converted to compat_str, or default if v is None"""
    if v is None:
        return default
    return compat_str(v)
1518
1519
def str_to_int(int_str):
    """ A more relaxed version of int_or_none """
    if int_str is not None:
        # Thousands separators and plus signs are simply dropped
        return int(re.sub(r'[,\.\+]', '', int_str))
    return None
1526
1527
def float_or_none(v, scale=1, invscale=1, default=None):
    """Convert v to a float, or return default if that is not possible.

    The converted value is multiplied by invscale and divided by scale.
    """
    if v is None:
        return default
    try:
        return float(v) * invscale / scale
    # TypeError covers unconvertible types such as lists or dicts
    except (ValueError, TypeError):
        return default
1535
1536
def parse_duration(s):
    """Parse a duration string into a number of seconds.

    Three notations are tried in turn: colon-separated fields such as
    '1:02:03.5', fields with unit letters or words such as 'PT1H2M3S' or
    '3 min 5 s', and a single possibly fractional amount of hours or
    minutes such as '1.5 hours'. Returns None for non-string input or when
    no notation matches.
    """
    if not isinstance(s, compat_basestring):
        return None

    s = s.strip()

    days = hours = mins = secs = ms = None
    colon_form = re.match(r'(?:(?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?(?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?$', s)
    if colon_form:
        days, hours, mins, secs, ms = colon_form.groups()
    else:
        unit_form = re.match(
            r'''(?ix)(?:P?T)?
                (?:
                    (?P<days>[0-9]+)\s*d(?:ays?)?\s*
                )?
                (?:
                    (?P<hours>[0-9]+)\s*h(?:ours?)?\s*
                )?
                (?:
                    (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?\s*
                )?
                (?:
                    (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
                )?$''', s)
        if unit_form:
            days, hours, mins, secs, ms = unit_form.groups()
        else:
            single_unit = re.match(r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)$', s)
            if not single_unit:
                return None
            hours, mins = single_unit.groups()

    # Each field paired with the factors that turn it into seconds; ms is
    # the fractional part of the seconds including its leading dot
    fields = (
        (secs, ()),
        (mins, (60,)),
        (hours, (60, 60)),
        (days, (24, 60, 60)),
        (ms, ()),
    )
    duration = 0
    for text, factors in fields:
        if not text:
            continue
        amount = float(text)
        for factor in factors:
            amount = amount * factor
        duration += amount
    return duration
1583
1584
def prepend_extension(filename, ext, expected_real_ext=None):
    """Insert ext in front of the extension of filename.

    If expected_real_ext is given and filename has a different extension,
    ext is appended to the whole filename instead.
    """
    name, real_ext = os.path.splitext(filename)
    if expected_real_ext and real_ext[1:] != expected_real_ext:
        return '{0}.{1}'.format(filename, ext)
    return '{0}.{1}{2}'.format(name, ext, real_ext)
1591
1592
def replace_extension(filename, ext, expected_real_ext=None):
    """Replace the extension of filename with ext.

    If expected_real_ext is given and filename has a different extension,
    ext is appended to the whole filename instead.
    """
    name, real_ext = os.path.splitext(filename)
    if expected_real_ext and real_ext[1:] != expected_real_ext:
        stem = filename
    else:
        stem = name
    return '{0}.{1}'.format(stem, ext)
1598
1599
def check_executable(exe, args=[]):
    """ Checks if the given binary is installed somewhere in PATH, and returns its name.
    args can be a list of arguments for a short output (like -version) """
    command = [exe] + args
    try:
        # Being able to start the process at all is the whole check; its
        # output is read and discarded
        process = subprocess.Popen(
            command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        process.communicate()
    except OSError:
        return False
    return exe
1608
1609
def get_exe_version(exe, args=['--version'],
                    version_re=None, unrecognized='present'):
    """ Returns the version of the specified executable,
    or False if the executable is not present """
    try:
        # stderr is merged into stdout
        process = subprocess.Popen(
            [encodeArgument(exe)] + args,
            stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
        out = process.communicate()[0]
    except OSError:
        return False
    if isinstance(out, bytes):  # Python 2.x
        out = out.decode('ascii', 'ignore')
    return detect_exe_version(out, version_re, unrecognized)
1623
1624
def detect_exe_version(output, version_re=None, unrecognized='present'):
    """Extract a version string from the output of an executable.

    version_re must capture the version in its first group; by default the
    token following the word 'version' is used. Returns unrecognized if the
    pattern is not found in output.
    """
    assert isinstance(output, compat_str)
    pattern = r'version\s+([-0-9._a-zA-Z]+)' if version_re is None else version_re
    found = re.search(pattern, output)
    if not found:
        return unrecognized
    return found.group(1)
1634
1635
class PagedList(object):
    """Base of the paged lists; __len__ relies on a getslice() method that
    this class itself does not define."""

    def __len__(self):
        # This is only useful for tests
        everything = self.getslice()
        return len(everything)
1640
1641
class OnDemandPagedList(PagedList):
    """Paged list that requests its pages one by one while slicing.

    pagefunc is called with a page number and its result is turned into a
    list; pagesize is the number of entries on a full page. With use_cache
    set, fetched pages are remembered on the instance.
    """

    def __init__(self, pagefunc, pagesize, use_cache=False):
        self._pagefunc = pagefunc
        self._pagesize = pagesize
        self._use_cache = use_cache
        if use_cache:
            self._cache = {}

    def getslice(self, start=0, end=None):
        """Collect entries from position start on, stopping at end or, with
        end=None, once a page that is not full has been seen."""
        collected = []
        for pagenum in itertools.count(start // self._pagesize):
            page_first = pagenum * self._pagesize
            page_limit = pagenum * self._pagesize + self._pagesize
            if start >= page_limit:
                continue

            if self._use_cache:
                entries = self._cache.get(pagenum)
            else:
                entries = None
            if entries is None:
                entries = list(self._pagefunc(pagenum))
            if self._use_cache:
                self._cache[pagenum] = entries

            # Offsets within this page; only the pages holding start or end
            # get trimmed
            if page_first <= start < page_limit:
                lower = start % self._pagesize
            else:
                lower = 0
            if end is not None and page_first <= end <= page_limit:
                upper = ((end - 1) % self._pagesize) + 1
            else:
                upper = None

            if lower != 0 or upper is not None:
                entries = entries[lower:upper]
            collected.extend(entries)

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            # Likewise, if we got the whole page but the next page is not
            # interesting, stop early as well.
            if len(entries) + lower < self._pagesize or end == page_limit:
                break
        return collected
1692
1693
class InAdvancePagedList(PagedList):
    """Paged list for which the number of pages is known beforehand.

    pagefunc is called with a page number and its result is turned into a
    list; pagecount is the total number of pages and pagesize the number of
    entries per page.
    """

    def __init__(self, pagefunc, pagecount, pagesize):
        self._pagefunc = pagefunc
        self._pagecount = pagecount
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        """Collect the entries from position start up to end (all remaining
        pages if end is None)."""
        collected = []
        first_page = start // self._pagesize
        if end is None:
            stop_page = self._pagecount
            still_wanted = None
        else:
            stop_page = end // self._pagesize + 1
            still_wanted = end - start
        # Entries to drop from the beginning of the first fetched page
        to_skip = start - first_page * self._pagesize
        for pagenum in range(first_page, stop_page):
            page = list(self._pagefunc(pagenum))
            if to_skip:
                page = page[to_skip:]
                to_skip = None
            if still_wanted is not None:
                if len(page) >= still_wanted:
                    # This page completes the requested amount
                    collected.extend(page[:still_wanted])
                    break
                still_wanted -= len(page)
            collected.extend(page)
        return collected
1721
1722
def uppercase_escape(s):
    r"""Replace every \UXXXXXXXX escape (8 hex digits) in s by its character"""
    decode_escape = codecs.getdecoder('unicode_escape')

    def expand(match):
        return decode_escape(match.group(0))[0]

    return re.sub(r'\\U[0-9a-fA-F]{8}', expand, s)
1729
1730
def lowercase_escape(s):
    r"""Replace every \uXXXX escape (4 hex digits) in s by its character"""
    decode_escape = codecs.getdecoder('unicode_escape')

    def expand(match):
        return decode_escape(match.group(0))[0]

    return re.sub(r'\\u[0-9a-fA-F]{4}', expand, s)
1737
1738
def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986"""
    # On Python 2 text is turned into UTF-8 bytes before quoting
    is_py2_text = sys.version_info < (3, 0) and isinstance(s, compat_str)
    if is_py2_text:
        s = s.encode('utf-8')
    return compat_urllib_parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")
1744
1745
def escape_url(url):
    """Escape URL as suggested by RFC 3986"""
    parts = compat_urllib_parse_urlparse(url)
    # The host part is IDNA-encoded, every other component percent-escaped
    idna_netloc = parts.netloc.encode('idna').decode('ascii')
    escaped_path = escape_rfc3986(parts.path)
    escaped_params = escape_rfc3986(parts.params)
    escaped_query = escape_rfc3986(parts.query)
    escaped_fragment = escape_rfc3986(parts.fragment)
    escaped = parts._replace(
        netloc=idna_netloc,
        path=escaped_path,
        params=escaped_params,
        query=escaped_query,
        fragment=escaped_fragment)
    return escaped.geturl()
1756
try:
    struct.pack('!I', 0)
except TypeError:
    # In Python 2.6 and 2.7.x < 2.7.7, struct requires a bytes argument
    # See https://bugs.python.org/issue19099
    # The wrappers below therefore encode text format specs to ASCII first
    def struct_pack(spec, *args):
        return struct.pack(
            spec.encode('ascii') if isinstance(spec, compat_str) else spec,
            *args)

    def struct_unpack(spec, *args):
        return struct.unpack(
            spec.encode('ascii') if isinstance(spec, compat_str) else spec,
            *args)
else:
    # struct accepts text format specs; use its functions directly
    struct_pack = struct.pack
    struct_unpack = struct.unpack
1774
1775
def read_batch_urls(batch_fd):
    """Read the URLs listed in a batch file object, one per line.

    batch_fd is iterated line by line and closed afterwards. Byte lines are
    decoded as UTF-8 (undecodable bytes are replaced). A leading byte order
    mark and surrounding whitespace are removed. Empty lines and lines that
    start with '#', ';' or ']' are dropped.

    Returns the list of remaining URLs in file order.
    """
    # A UTF-8 BOM appears as '\ufeff' when the bytes were decoded as UTF-8
    # and as '\xef\xbb\xbf' when they were decoded as Latin-1; strip both.
    BOMS = ('\ufeff', '\xef\xbb\xbf')

    def fixup(url):
        if isinstance(url, bytes):
            url = url.decode('utf-8', 'replace')
        for bom in BOMS:
            if url.startswith(bom):
                url = url[len(bom):]
        url = url.strip()
        if url.startswith(('#', ';', ']')):
            # Comment line: a falsy result is filtered out below.
            return False
        return url

    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]
1790
1791
def urlencode_postdata(*args, **kargs):
    """URL-encode the given arguments and return the result as ASCII bytes."""
    encoded = compat_urllib_parse_urlencode(*args, **kargs)
    return encoded.encode('ascii')
1794
1795
def update_url_query(url, query):
    """Return url with the entries of query merged into its query string.

    A falsy query leaves url untouched. Keys in query replace keys that are
    already present in the URL.
    """
    if query:
        parts = compat_urlparse.urlparse(url)
        merged = compat_parse_qs(parts.query)
        merged.update(query)
        # Second argument True: sequence values become repeated parameters.
        new_query = compat_urllib_parse_urlencode(merged, True)
        return compat_urlparse.urlunparse(parts._replace(query=new_query))
    return url
1804
1805
def update_Request(req, url=None, data=None, headers={}, query={}):
    """Build a new request based on req with selected parts overridden.

    headers are merged on top of the original headers, data and url replace
    the original values when truthy, and query is merged into the URL's query
    string. A HEAD request stays a HEADRequest; anything else becomes a plain
    Request. The timeout attribute is carried over when req has one.
    """
    merged_headers = req.headers.copy()
    merged_headers.update(headers)
    payload = data or req.data
    target_url = update_url_query(url or req.get_full_url(), query)
    if req.get_method() == 'HEAD':
        request_cls = HEADRequest
    else:
        request_cls = compat_urllib_request.Request
    clone = request_cls(
        target_url, data=payload, headers=merged_headers,
        origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
    if hasattr(req, 'timeout'):
        clone.timeout = req.timeout
    return clone
1818
1819
def dict_get(d, key_or_keys, default=None, skip_false_values=True):
    """Look up one key, or the first usable key of a list/tuple, in d.

    With a list or tuple of keys, the value of the first key that is present
    and not None (and, when skip_false_values is set, not falsy) is returned;
    default is returned if no key qualifies. A single key is a plain
    d.get(key, default).
    """
    if not isinstance(key_or_keys, (list, tuple)):
        return d.get(key_or_keys, default)
    for candidate in key_or_keys:
        if candidate not in d:
            continue
        value = d[candidate]
        if value is None:
            continue
        if skip_false_values and not value:
            continue
        return value
    return default
1828
1829
def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
    """Return string as text, decoding it with encoding when necessary."""
    if isinstance(string, compat_str):
        return string
    return compat_str(string, encoding, errors)
1832
1833
# Numeric value associated with each US rating label; parse_age_limit()
# returns these values for input that is not a plain number.
US_RATINGS = {
    'G': 0,
    'PG': 10,
    'PG-13': 13,
    'R': 16,
    'NC': 18,
}
1841
1842
def parse_age_limit(s):
    """Turn an age-limit string into an integer.

    Accepts one or two digits with an optional trailing '+', or a label found
    in US_RATINGS. Returns None for None and for anything unrecognised.
    """
    if s is None:
        return None
    numeric = re.match(r'^(?P<age>\d{1,2})\+?$', s)
    if numeric:
        return int(numeric.group('age'))
    return US_RATINGS.get(s)
1848
1849
def strip_jsonp(code):
    """Remove a JSONP wrapper 'callback(...)' and return the payload.

    Input that does not look like a JSONP call is returned unchanged.
    """
    jsonp_call = re.compile(
        r'(?s)^[a-zA-Z0-9_.]+\s*\(\s*(.*)\);?\s*?(?://[^\n]*)*$')
    return jsonp_call.sub(r'\1', code)
1853
1854
def js_to_json(code):
    """Rewrite a JavaScript object literal so that it reads as JSON.

    String literals and bare identifiers are turned into double-quoted
    strings (true, false and null are kept), and a comma directly before a
    closing bracket or brace is dropped.
    """
    # Replacements applied inside single-quoted literals.
    single_quote_fixes = {
        '\\\\': '\\\\',
        "\\'": "'",
        '"': '\\"',
    }

    def fix_kv(m):
        token = m.group(0)
        if token in ('true', 'false', 'null'):
            return token
        if token.startswith('"'):
            # Escaped single quotes need no escaping in JSON.
            body = re.sub(r"\\'", "'", token[1:-1])
        elif token.startswith("'"):
            body = re.sub(
                r"\\\\|\\'|\"",
                lambda esc: single_quote_fixes[esc.group(0)],
                token[1:-1])
        else:
            body = token
        return '"%s"' % body

    quoted = re.sub(r'''(?x)
        "(?:[^"\\]*(?:\\\\|\\['"nu]))*[^"\\]*"|
        '(?:[^'\\]*(?:\\\\|\\['"nu]))*[^'\\]*'|
        [a-zA-Z_][.a-zA-Z_0-9]*
        ''', fix_kv, code)
    # Drop trailing commas such as in "[1, 2,]".
    return re.sub(r',(\s*[\]}])', r'\1', quoted)
1878
1879
def qualities(quality_ids):
    """ Get a numeric quality value out of a list of possible values """
    def q(qid):
        # Position in quality_ids is the quality value; unknown ids get -1.
        try:
            position = quality_ids.index(qid)
        except ValueError:
            return -1
        else:
            return position
    return q
1888
1889
# Default output template: "<title>-<id>.<ext>", written as %-style
# named placeholders.
DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
1891
1892
def limit_length(s, length):
    """ Add ellipses to overly long strings """
    if s is None:
        return None
    if len(s) <= length:
        return s
    ELLIPSES = '...'
    # Reserve room for the ellipses so the result is length characters long.
    return s[:length - len(ELLIPSES)] + ELLIPSES
1901
1902
def version_tuple(v):
    """Split a version string on '.' and '-' into a tuple of integers."""
    pieces = re.split(r'[-.]', v)
    return tuple(map(int, pieces))
1905
1906
def is_outdated_version(version, limit, assume_new=True):
    """Tell whether version is older than limit.

    When version is empty or either version cannot be parsed, the answer is
    the opposite of assume_new.
    """
    fallback = not assume_new
    if not version:
        return fallback
    try:
        outdated = version_tuple(version) < version_tuple(limit)
    except ValueError:
        return fallback
    return outdated
1914
1915
def ytdl_is_updateable():
    """ Returns if youtube-dl can be updated with -U """
    from zipimport import zipimporter

    # Updateable when this module was loaded from a zip archive ...
    if isinstance(globals().get('__loader__'), zipimporter):
        return True
    # ... or when the interpreter carries a 'frozen' marker.
    return hasattr(sys, 'frozen')
1921
1922
def args_to_str(args):
    """Get a short string representation for a subprocess command."""
    quoted = [shlex_quote(arg) for arg in args]
    return ' '.join(quoted)
1926
1927
def error_to_compat_str(err):
    """Return the message of err as a text string."""
    message = str(err)
    if sys.version_info[0] >= 3:
        return message
    # On python 2 error byte string must be decoded with proper
    # encoding rather than ascii
    return message.decode(preferredencoding())
1935
1936
def mimetype2ext(mt):
    """Map a MIME type to a file extension.

    mt may be a bare type ('video/mp4') or a full Content-Type value with
    parameters ('text/vtt; charset=utf-8'). Parameters are ignored and the
    type is compared case-insensitively. Known types are translated through
    the tables below; for anything else the (lower-cased) subtype itself is
    returned. Returns None when mt is None.
    """
    if mt is None:
        return None

    # Drop parameters such as '; charset=utf-8' and normalise the case so
    # that raw Content-Type header values resolve correctly.
    mt = mt.partition(';')[0].strip().lower()

    # Full types whose extension cannot be derived from the subtype alone.
    ext = {
        'audio/mp4': 'm4a',
    }.get(mt)
    if ext is not None:
        return ext

    _, _, res = mt.rpartition('/')

    return {
        '3gpp': '3gp',
        'smptett+xml': 'tt',
        'srt': 'srt',
        'ttaf+xml': 'dfxp',
        'ttml+xml': 'ttml',
        'vtt': 'vtt',
        'x-flv': 'flv',
        'x-mp4-fragmented': 'mp4',
        'x-ms-wmv': 'wmv',
    }.get(res, res)
1960
1961
def urlhandle_detect_ext(url_handle):
    """Guess a file extension from the response headers of url_handle.

    The file name of an 'attachment' Content-Disposition header is tried
    first; otherwise the extension is derived from the Content-Type header.
    """
    try:
        url_handle.headers
    except AttributeError:  # Python < 3
        getheader = url_handle.info().getheader
    else:
        def getheader(name):
            return url_handle.headers[name]

    disposition = getheader('Content-Disposition')
    if disposition:
        found = re.match(
            r'attachment;\s*filename="(?P<filename>[^"]+)"', disposition)
        if found:
            ext = determine_ext(found.group('filename'), default_ext=None)
            if ext:
                return ext

    return mimetype2ext(getheader('Content-Type'))
1978
1979
def encode_data_uri(data, mime_type):
    """Build a base64 'data:' URI for the bytes data with the given type."""
    payload = base64.b64encode(data).decode('ascii')
    return 'data:%s;base64,%s' % (mime_type, payload)
1982
1983
def age_restricted(content_limit, age_limit):
    """ Returns True iff the content should be blocked """

    # Nothing is blocked when no limit is configured, or when the content
    # carries no limit of its own (available for everyone).
    if age_limit is None or content_limit is None:
        return False
    return age_limit < content_limit
1992
1993
def is_html(first_bytes):
    """ Detect whether a file contains HTML by examining its first bytes. """

    # Longer marks come before shorter ones that share their prefix.
    BOMS = [
        (b'\xef\xbb\xbf', 'utf-8'),
        (b'\x00\x00\xfe\xff', 'utf-32-be'),
        (b'\xff\xfe\x00\x00', 'utf-32-le'),
        (b'\xff\xfe', 'utf-16-le'),
        (b'\xfe\xff', 'utf-16-be'),
    ]
    # Without a byte order mark the data is read as UTF-8 from the start.
    encoding, offset = 'utf-8', 0
    for bom, enc in BOMS:
        if first_bytes.startswith(bom):
            encoding, offset = enc, len(bom)
            break
    text = first_bytes[offset:].decode(encoding, 'replace')

    # HTML is assumed when the first non-blank character is '<'.
    return re.match(r'^\s*<', text)
2012
2013
def determine_protocol(info_dict):
    """Work out the protocol name for info_dict.

    An explicit 'protocol' entry wins. Otherwise the URL prefix (rtmp, mms,
    rtsp) is checked, then the extension (m3u8, f4m), and finally the URL
    scheme is returned.
    """
    explicit = info_dict.get('protocol')
    if explicit is not None:
        return explicit

    url = info_dict['url']
    for prefix in ('rtmp', 'mms', 'rtsp'):
        if url.startswith(prefix):
            return prefix

    ext = determine_ext(url)
    for manifest in ('m3u8', 'f4m'):
        if ext == manifest:
            return manifest

    return compat_urllib_parse_urlparse(url).scheme
2034
2035
def render_table(header_row, data):
    """ Render a list of rows, each as a list of values """
    rows = [header_row] + data
    # Width of every column: its longest cell rendered as text.
    widths = []
    for column in zip(*rows):
        widths.append(max(len(compat_str(cell)) for cell in column))
    # All columns but the last are left-aligned and padded to width + 1.
    padded = ['%-' + compat_str(width + 1) + 's' for width in widths[:-1]]
    row_format = ' '.join(padded) + '%s'
    return '\n'.join([row_format % tuple(row) for row in rows])
2042
2043
def _match_one(filter_part, dct):
    """Evaluate a single filter expression against the dictionary dct.

    Two forms are recognised:
      * a comparison "key OP value", where OP is one of the keys of
        COMPARISON_OPERATORS and may be followed by '?';
      * a presence test "key" or "!key".

    For a comparison the value stored under key is compared with the parsed
    value. When the stored value is None, the text matched for the optional
    '?' is returned instead (None when no '?' was given). A presence test
    returns whether the stored value is not None ("key") or is None ("!key").

    Raises ValueError when the expression cannot be parsed, when an ordering
    operator is combined with a string value, or when a numeric value cannot
    be interpreted.
    """
    COMPARISON_OPERATORS = {
        '<': operator.lt,
        '<=': operator.le,
        '>': operator.gt,
        '>=': operator.ge,
        '=': operator.eq,
        '!=': operator.ne,
    }
    # The operator alternatives are generated from the dictionary keys; the
    # pattern is anchored at the end of the expression.
    operator_rex = re.compile(r'''(?x)\s*
        (?P<key>[a-z_]+)
        \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
        (?:
            (?P<intval>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)|
            (?P<strval>(?![0-9.])[a-z0-9A-Z]*)
        )
        \s*$
        ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
    m = operator_rex.search(filter_part)
    if m:
        op = COMPARISON_OPERATORS[m.group('op')]
        if m.group('strval') is not None:
            # String values only support equality tests.
            if m.group('op') not in ('=', '!='):
                raise ValueError(
                    'Operator %s does not support string values!' % m.group('op'))
            comparison_value = m.group('strval')
        else:
            # Try a plain integer first, then parse_filesize (defined elsewhere
            # in this file) on the raw text and on the text with 'B' appended.
            # NOTE(review): this relies on parse_filesize returning None for
            # input it cannot parse - confirm against its definition.
            try:
                comparison_value = int(m.group('intval'))
            except ValueError:
                comparison_value = parse_filesize(m.group('intval'))
                if comparison_value is None:
                    comparison_value = parse_filesize(m.group('intval') + 'B')
                if comparison_value is None:
                    raise ValueError(
                        'Invalid integer value %r in filter part %r' % (
                            m.group('intval'), filter_part))
        actual_value = dct.get(m.group('key'))
        if actual_value is None:
            # Missing value: the result is truthy only when '?' was given.
            return m.group('none_inclusive')
        return op(actual_value, comparison_value)

    # Not a comparison: fall back to a presence test.
    UNARY_OPERATORS = {
        '': lambda v: v is not None,
        '!': lambda v: v is None,
    }
    operator_rex = re.compile(r'''(?x)\s*
        (?P<op>%s)\s*(?P<key>[a-z_]+)
        \s*$
        ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
    m = operator_rex.search(filter_part)
    if m:
        op = UNARY_OPERATORS[m.group('op')]
        actual_value = dct.get(m.group('key'))
        return op(actual_value)

    raise ValueError('Invalid filter part %r' % filter_part)
2101
2102
def match_str(filter_str, dct):
    """ Filter a dictionary with a simple string syntax. Returns True (=passes filter) or false """

    # Every '&'-separated part has to match; stop at the first failure.
    for filter_part in filter_str.split('&'):
        if not _match_one(filter_part, dct):
            return False
    return True
2108
2109
def match_filter_func(filter_str):
    """Build a callable that checks an info dict against filter_str.

    The callable returns None when the dict passes the filter and a message
    explaining the skip otherwise.
    """
    def _match_func(info_dict):
        if not match_str(filter_str, info_dict):
            video_title = info_dict.get('title', info_dict.get('id', 'video'))
            return '%s does not pass filter %s, skipping ..' % (video_title, filter_str)
        return None
    return _match_func
2118
2119
def parse_dfxp_time_expr(time_expr):
    """Convert a DFXP/TTML time expression to seconds.

    Supported forms are an offset in seconds with an optional 's' suffix
    ('12.5s') and a clock value 'H:MM:SS' whose fraction may follow a '.' or
    a ':'. Returns a float, or None for empty or unrecognised input.
    """
    if not time_expr:
        return

    offset = re.match(r'^(?P<time_offset>\d+(?:\.\d+)?)s?$', time_expr)
    if offset:
        return float(offset.group('time_offset'))

    clock = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
    if clock:
        hours, minutes, seconds = clock.groups()
        # A ':' before the fraction is read as a decimal point.
        return 3600 * int(hours) + 60 * int(minutes) + float(seconds.replace(':', '.'))
2131
2132
def srt_subtitles_timecode(seconds):
    """Format a number of seconds as an SRT timecode 'HH:MM:SS,mmm'.

    The value is rounded to the nearest millisecond before it is split into
    fields. Truncating each field separately loses a millisecond to
    floating point error (2.3 would render as '00:00:02,299').
    """
    total_ms = int(round(seconds * 1000))
    total_secs, millis = divmod(total_ms, 1000)
    total_mins, secs = divmod(total_secs, 60)
    hours, mins = divmod(total_mins, 60)
    return '%02d:%02d:%02d,%03d' % (hours, mins, secs, millis)
2135
2136
def dfxp2srt(dfxp_data):
    """Convert a DFXP/TTML subtitle document (text) to SRT text.

    Every <p> element with a usable begin time and either an end time or a
    duration becomes one SRT cue; <br> elements become line breaks. Raises
    ValueError when the document contains no <p> element.
    """
    namespaces = {
        'ttml': 'http://www.w3.org/ns/ttml',
        'ttaf1': 'http://www.w3.org/2006/10/ttaf1',
        'ttaf1_0604': 'http://www.w3.org/2006/04/ttaf1',
    }
    _x = functools.partial(xpath_with_ns, ns_map=namespaces)

    class TTMLPElementParser(object):
        # Parser target that collects the text of one paragraph.
        out = ''

        def start(self, tag, attrib):
            # Line break elements turn into newlines.
            if tag in (_x('ttml:br'), _x('ttaf1:br'), 'br'):
                self.out += '\n'

        def end(self, tag):
            pass

        def data(self, data):
            self.out += data

        def close(self):
            return self.out.strip()

    def parse_node(node):
        # Re-parse the serialised paragraph with the collecting target.
        collector = TTMLPElementParser()
        node_parser = xml.etree.ElementTree.XMLParser(target=collector)
        node_parser.feed(xml.etree.ElementTree.tostring(node))
        return node_parser.close()

    dfxp = compat_etree_fromstring(dfxp_data.encode('utf-8'))

    # Try the namespaced paragraph paths in order, then plain <p>.
    for ns_path in ('.//ttml:p', './/ttaf1:p', './/ttaf1_0604:p'):
        paras = dfxp.findall(_x(ns_path))
        if paras:
            break
    else:
        paras = dfxp.findall('.//p')

    if not paras:
        raise ValueError('Invalid dfxp/TTML subtitle')

    cues = []
    for index, para in enumerate(paras, 1):
        begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
        end_time = parse_dfxp_time_expr(para.attrib.get('end'))
        dur = parse_dfxp_time_expr(para.attrib.get('dur'))
        if begin_time is None:
            continue
        if not end_time:
            # No end time: derive it from the duration when there is one.
            if not dur:
                continue
            end_time = begin_time + dur
        cues.append('%d\n%s --> %s\n%s\n\n' % (
            index,
            srt_subtitles_timecode(begin_time),
            srt_subtitles_timecode(end_time),
            parse_node(para)))

    return ''.join(cues)
2190
2191
def cli_option(params, command_option, param):
    """Return [command_option, value] for params[param], or [] when unset."""
    value = params.get(param)
    if value is None:
        return []
    return [command_option, value]
2195
2196
def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
    """Render the boolean params[param] as command line arguments.

    Returns [command_option, value], or the single argument
    command_option + separator + value when separator is given; value is
    true_value or false_value. When the parameter is missing (None) no
    argument is produced, in line with cli_option.
    """
    param = params.get(param)
    if param is None:
        # Unset option: emit nothing instead of failing the assertion below.
        return []
    assert isinstance(param, bool)
    value = true_value if param else false_value
    if separator:
        return [command_option + separator + value]
    return [command_option, value]
2203
2204
def cli_valueless_option(params, command_option, param, expected_value=True):
    """Return [command_option] when params[param] equals expected_value."""
    if params.get(param) == expected_value:
        return [command_option]
    return []
2208
2209
def cli_configuration_args(params, param, default=[]):
    """Return the list stored in params[param], or default when it is unset.

    The default object is returned as-is (not copied), so callers should not
    modify the result in place.
    """
    ex_args = params.get(param)
    if ex_args is not None:
        assert isinstance(ex_args, list)
        return ex_args
    return default
2216
2217
class ISO639Utils(object):
    """Conversion between two-letter and three-letter language codes."""

    # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
    _lang_map = {
        'aa': 'aar',
        'ab': 'abk',
        'ae': 'ave',
        'af': 'afr',
        'ak': 'aka',
        'am': 'amh',
        'an': 'arg',
        'ar': 'ara',
        'as': 'asm',
        'av': 'ava',
        'ay': 'aym',
        'az': 'aze',
        'ba': 'bak',
        'be': 'bel',
        'bg': 'bul',
        'bh': 'bih',
        'bi': 'bis',
        'bm': 'bam',
        'bn': 'ben',
        'bo': 'bod',
        'br': 'bre',
        'bs': 'bos',
        'ca': 'cat',
        'ce': 'che',
        'ch': 'cha',
        'co': 'cos',
        'cr': 'cre',
        'cs': 'ces',
        'cu': 'chu',
        'cv': 'chv',
        'cy': 'cym',
        'da': 'dan',
        'de': 'deu',
        'dv': 'div',
        'dz': 'dzo',
        'ee': 'ewe',
        'el': 'ell',
        'en': 'eng',
        'eo': 'epo',
        'es': 'spa',
        'et': 'est',
        'eu': 'eus',
        'fa': 'fas',
        'ff': 'ful',
        'fi': 'fin',
        'fj': 'fij',
        'fo': 'fao',
        'fr': 'fra',
        'fy': 'fry',
        'ga': 'gle',
        'gd': 'gla',
        'gl': 'glg',
        'gn': 'grn',
        'gu': 'guj',
        'gv': 'glv',
        'ha': 'hau',
        'he': 'heb',
        'hi': 'hin',
        'ho': 'hmo',
        'hr': 'hrv',
        'ht': 'hat',
        'hu': 'hun',
        'hy': 'hye',
        'hz': 'her',
        'ia': 'ina',
        'id': 'ind',
        'ie': 'ile',
        'ig': 'ibo',
        'ii': 'iii',
        'ik': 'ipk',
        'io': 'ido',
        'is': 'isl',
        'it': 'ita',
        'iu': 'iku',
        'ja': 'jpn',
        'jv': 'jav',
        'ka': 'kat',
        'kg': 'kon',
        'ki': 'kik',
        'kj': 'kua',
        'kk': 'kaz',
        'kl': 'kal',
        'km': 'khm',
        'kn': 'kan',
        'ko': 'kor',
        'kr': 'kau',
        'ks': 'kas',
        'ku': 'kur',
        'kv': 'kom',
        'kw': 'cor',
        'ky': 'kir',
        'la': 'lat',
        'lb': 'ltz',
        'lg': 'lug',
        'li': 'lim',
        'ln': 'lin',
        'lo': 'lao',
        'lt': 'lit',
        'lu': 'lub',
        'lv': 'lav',
        'mg': 'mlg',
        'mh': 'mah',
        'mi': 'mri',
        'mk': 'mkd',
        'ml': 'mal',
        'mn': 'mon',
        'mr': 'mar',
        'ms': 'msa',
        'mt': 'mlt',
        'my': 'mya',
        'na': 'nau',
        'nb': 'nob',
        'nd': 'nde',
        'ne': 'nep',
        'ng': 'ndo',
        'nl': 'nld',
        'nn': 'nno',
        'no': 'nor',
        'nr': 'nbl',
        'nv': 'nav',
        'ny': 'nya',
        'oc': 'oci',
        'oj': 'oji',
        'om': 'orm',
        'or': 'ori',
        'os': 'oss',
        'pa': 'pan',
        'pi': 'pli',
        'pl': 'pol',
        'ps': 'pus',
        'pt': 'por',
        'qu': 'que',
        'rm': 'roh',
        'rn': 'run',
        'ro': 'ron',
        'ru': 'rus',
        'rw': 'kin',
        'sa': 'san',
        'sc': 'srd',
        'sd': 'snd',
        'se': 'sme',
        'sg': 'sag',
        'si': 'sin',
        'sk': 'slk',
        'sl': 'slv',
        'sm': 'smo',
        'sn': 'sna',
        'so': 'som',
        'sq': 'sqi',
        'sr': 'srp',
        'ss': 'ssw',
        'st': 'sot',
        'su': 'sun',
        'sv': 'swe',
        'sw': 'swa',
        'ta': 'tam',
        'te': 'tel',
        'tg': 'tgk',
        'th': 'tha',
        'ti': 'tir',
        'tk': 'tuk',
        'tl': 'tgl',
        'tn': 'tsn',
        'to': 'ton',
        'tr': 'tur',
        'ts': 'tso',
        'tt': 'tat',
        'tw': 'twi',
        'ty': 'tah',
        'ug': 'uig',
        'uk': 'ukr',
        'ur': 'urd',
        'uz': 'uzb',
        've': 'ven',
        'vi': 'vie',
        'vo': 'vol',
        'wa': 'wln',
        'wo': 'wol',
        'xh': 'xho',
        'yi': 'yid',
        'yo': 'yor',
        'za': 'zha',
        'zh': 'zho',
        'zu': 'zul',
    }

    @classmethod
    def short2long(cls, code):
        """Convert language code from ISO 639-1 to ISO 639-2/T"""
        # Only the first two characters count, so 'en-US' resolves like 'en'.
        two_letter = code[:2]
        return cls._lang_map.get(two_letter)

    @classmethod
    def long2short(cls, code):
        """Convert language code from ISO 639-2/T to ISO 639-1"""
        # Reverse lookup; None when the three-letter code is not in the map.
        return next(
            (short_name for short_name, long_name in cls._lang_map.items()
             if long_name == code),
            None)
2418
2419
class ISO3166Utils(object):
    """Lookup of country names by two-letter country code."""

    # From http://data.okfn.org/data/core/country-list
    _country_map = {
        'AF': 'Afghanistan',
        'AX': 'Åland Islands',
        'AL': 'Albania',
        'DZ': 'Algeria',
        'AS': 'American Samoa',
        'AD': 'Andorra',
        'AO': 'Angola',
        'AI': 'Anguilla',
        'AQ': 'Antarctica',
        'AG': 'Antigua and Barbuda',
        'AR': 'Argentina',
        'AM': 'Armenia',
        'AW': 'Aruba',
        'AU': 'Australia',
        'AT': 'Austria',
        'AZ': 'Azerbaijan',
        'BS': 'Bahamas',
        'BH': 'Bahrain',
        'BD': 'Bangladesh',
        'BB': 'Barbados',
        'BY': 'Belarus',
        'BE': 'Belgium',
        'BZ': 'Belize',
        'BJ': 'Benin',
        'BM': 'Bermuda',
        'BT': 'Bhutan',
        'BO': 'Bolivia, Plurinational State of',
        'BQ': 'Bonaire, Sint Eustatius and Saba',
        'BA': 'Bosnia and Herzegovina',
        'BW': 'Botswana',
        'BV': 'Bouvet Island',
        'BR': 'Brazil',
        'IO': 'British Indian Ocean Territory',
        'BN': 'Brunei Darussalam',
        'BG': 'Bulgaria',
        'BF': 'Burkina Faso',
        'BI': 'Burundi',
        'KH': 'Cambodia',
        'CM': 'Cameroon',
        'CA': 'Canada',
        'CV': 'Cape Verde',
        'KY': 'Cayman Islands',
        'CF': 'Central African Republic',
        'TD': 'Chad',
        'CL': 'Chile',
        'CN': 'China',
        'CX': 'Christmas Island',
        'CC': 'Cocos (Keeling) Islands',
        'CO': 'Colombia',
        'KM': 'Comoros',
        'CG': 'Congo',
        'CD': 'Congo, the Democratic Republic of the',
        'CK': 'Cook Islands',
        'CR': 'Costa Rica',
        'CI': 'Côte d\'Ivoire',
        'HR': 'Croatia',
        'CU': 'Cuba',
        'CW': 'Curaçao',
        'CY': 'Cyprus',
        'CZ': 'Czech Republic',
        'DK': 'Denmark',
        'DJ': 'Djibouti',
        'DM': 'Dominica',
        'DO': 'Dominican Republic',
        'EC': 'Ecuador',
        'EG': 'Egypt',
        'SV': 'El Salvador',
        'GQ': 'Equatorial Guinea',
        'ER': 'Eritrea',
        'EE': 'Estonia',
        'ET': 'Ethiopia',
        'FK': 'Falkland Islands (Malvinas)',
        'FO': 'Faroe Islands',
        'FJ': 'Fiji',
        'FI': 'Finland',
        'FR': 'France',
        'GF': 'French Guiana',
        'PF': 'French Polynesia',
        'TF': 'French Southern Territories',
        'GA': 'Gabon',
        'GM': 'Gambia',
        'GE': 'Georgia',
        'DE': 'Germany',
        'GH': 'Ghana',
        'GI': 'Gibraltar',
        'GR': 'Greece',
        'GL': 'Greenland',
        'GD': 'Grenada',
        'GP': 'Guadeloupe',
        'GU': 'Guam',
        'GT': 'Guatemala',
        'GG': 'Guernsey',
        'GN': 'Guinea',
        'GW': 'Guinea-Bissau',
        'GY': 'Guyana',
        'HT': 'Haiti',
        'HM': 'Heard Island and McDonald Islands',
        'VA': 'Holy See (Vatican City State)',
        'HN': 'Honduras',
        'HK': 'Hong Kong',
        'HU': 'Hungary',
        'IS': 'Iceland',
        'IN': 'India',
        'ID': 'Indonesia',
        'IR': 'Iran, Islamic Republic of',
        'IQ': 'Iraq',
        'IE': 'Ireland',
        'IM': 'Isle of Man',
        'IL': 'Israel',
        'IT': 'Italy',
        'JM': 'Jamaica',
        'JP': 'Japan',
        'JE': 'Jersey',
        'JO': 'Jordan',
        'KZ': 'Kazakhstan',
        'KE': 'Kenya',
        'KI': 'Kiribati',
        'KP': 'Korea, Democratic People\'s Republic of',
        'KR': 'Korea, Republic of',
        'KW': 'Kuwait',
        'KG': 'Kyrgyzstan',
        'LA': 'Lao People\'s Democratic Republic',
        'LV': 'Latvia',
        'LB': 'Lebanon',
        'LS': 'Lesotho',
        'LR': 'Liberia',
        'LY': 'Libya',
        'LI': 'Liechtenstein',
        'LT': 'Lithuania',
        'LU': 'Luxembourg',
        'MO': 'Macao',
        'MK': 'Macedonia, the Former Yugoslav Republic of',
        'MG': 'Madagascar',
        'MW': 'Malawi',
        'MY': 'Malaysia',
        'MV': 'Maldives',
        'ML': 'Mali',
        'MT': 'Malta',
        'MH': 'Marshall Islands',
        'MQ': 'Martinique',
        'MR': 'Mauritania',
        'MU': 'Mauritius',
        'YT': 'Mayotte',
        'MX': 'Mexico',
        'FM': 'Micronesia, Federated States of',
        'MD': 'Moldova, Republic of',
        'MC': 'Monaco',
        'MN': 'Mongolia',
        'ME': 'Montenegro',
        'MS': 'Montserrat',
        'MA': 'Morocco',
        'MZ': 'Mozambique',
        'MM': 'Myanmar',
        'NA': 'Namibia',
        'NR': 'Nauru',
        'NP': 'Nepal',
        'NL': 'Netherlands',
        'NC': 'New Caledonia',
        'NZ': 'New Zealand',
        'NI': 'Nicaragua',
        'NE': 'Niger',
        'NG': 'Nigeria',
        'NU': 'Niue',
        'NF': 'Norfolk Island',
        'MP': 'Northern Mariana Islands',
        'NO': 'Norway',
        'OM': 'Oman',
        'PK': 'Pakistan',
        'PW': 'Palau',
        'PS': 'Palestine, State of',
        'PA': 'Panama',
        'PG': 'Papua New Guinea',
        'PY': 'Paraguay',
        'PE': 'Peru',
        'PH': 'Philippines',
        'PN': 'Pitcairn',
        'PL': 'Poland',
        'PT': 'Portugal',
        'PR': 'Puerto Rico',
        'QA': 'Qatar',
        'RE': 'Réunion',
        'RO': 'Romania',
        'RU': 'Russian Federation',
        'RW': 'Rwanda',
        'BL': 'Saint Barthélemy',
        'SH': 'Saint Helena, Ascension and Tristan da Cunha',
        'KN': 'Saint Kitts and Nevis',
        'LC': 'Saint Lucia',
        'MF': 'Saint Martin (French part)',
        'PM': 'Saint Pierre and Miquelon',
        'VC': 'Saint Vincent and the Grenadines',
        'WS': 'Samoa',
        'SM': 'San Marino',
        'ST': 'Sao Tome and Principe',
        'SA': 'Saudi Arabia',
        'SN': 'Senegal',
        'RS': 'Serbia',
        'SC': 'Seychelles',
        'SL': 'Sierra Leone',
        'SG': 'Singapore',
        'SX': 'Sint Maarten (Dutch part)',
        'SK': 'Slovakia',
        'SI': 'Slovenia',
        'SB': 'Solomon Islands',
        'SO': 'Somalia',
        'ZA': 'South Africa',
        'GS': 'South Georgia and the South Sandwich Islands',
        'SS': 'South Sudan',
        'ES': 'Spain',
        'LK': 'Sri Lanka',
        'SD': 'Sudan',
        'SR': 'Suriname',
        'SJ': 'Svalbard and Jan Mayen',
        'SZ': 'Swaziland',
        'SE': 'Sweden',
        'CH': 'Switzerland',
        'SY': 'Syrian Arab Republic',
        'TW': 'Taiwan, Province of China',
        'TJ': 'Tajikistan',
        'TZ': 'Tanzania, United Republic of',
        'TH': 'Thailand',
        'TL': 'Timor-Leste',
        'TG': 'Togo',
        'TK': 'Tokelau',
        'TO': 'Tonga',
        'TT': 'Trinidad and Tobago',
        'TN': 'Tunisia',
        'TR': 'Turkey',
        'TM': 'Turkmenistan',
        'TC': 'Turks and Caicos Islands',
        'TV': 'Tuvalu',
        'UG': 'Uganda',
        'UA': 'Ukraine',
        'AE': 'United Arab Emirates',
        'GB': 'United Kingdom',
        'US': 'United States',
        'UM': 'United States Minor Outlying Islands',
        'UY': 'Uruguay',
        'UZ': 'Uzbekistan',
        'VU': 'Vanuatu',
        'VE': 'Venezuela, Bolivarian Republic of',
        'VN': 'Viet Nam',
        'VG': 'Virgin Islands, British',
        'VI': 'Virgin Islands, U.S.',
        'WF': 'Wallis and Futuna',
        'EH': 'Western Sahara',
        'YE': 'Yemen',
        'ZM': 'Zambia',
        'ZW': 'Zimbabwe',
    }

    @classmethod
    def short2full(cls, code):
        """Convert a two-letter country code to the corresponding full name"""
        # The table is keyed by upper-case codes; unknown codes give None.
        normalized = code.upper()
        return cls._country_map.get(normalized)
2678
2679
class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
    """Proxy handler that lets a single request pick its own proxy.

    A request selects a proxy through the 'Ytdl-request-proxy' header; the
    header is removed from the request before it is opened.
    """

    def __init__(self, proxies=None):
        # Set default handlers: without any other information http and https
        # requests are opened with the '__noproxy__' marker.
        for scheme in ('http', 'https'):
            handler = (
                lambda r, proxy='__noproxy__', type=scheme, meth=self.proxy_open:
                meth(r, proxy, type))
            setattr(self, '%s_open' % scheme, handler)
        return compat_urllib_request.ProxyHandler.__init__(self, proxies)

    def proxy_open(self, req, proxy, type):
        # A per-request proxy overrides the one configured for the scheme.
        per_request_proxy = req.headers.get('Ytdl-request-proxy')
        if per_request_proxy is not None:
            proxy = per_request_proxy
            del req.headers['Ytdl-request-proxy']

        if proxy == '__noproxy__':
            return None  # No Proxy
        base_open = compat_urllib_request.ProxyHandler.proxy_open
        return base_open(self, req, proxy, type)
2699
2700
def ohdave_rsa_encrypt(data, exponent, modulus):
    """
    Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/

    Input:
        data: data to encrypt, bytes-like object
        exponent, modulus: parameter e and N of RSA algorithm, both integer
    Output: hex string of encrypted data

    Limitation: supports one block encryption only
    """

    # The byte order is reversed before the data is read as one hex number.
    reversed_hex = binascii.hexlify(data[::-1])
    plaintext = int(reversed_hex, 16)
    return '%x' % pow(plaintext, exponent, modulus)
2716
2717
def encode_base_n(num, n, table=None):
    """Write the integer num in base n.

    Digits are taken from table, or from the first n characters of
    0-9a-zA-Z when no table is given. Raises ValueError when n is larger
    than the number of available digits.
    """
    FULL_TABLE = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
    alphabet = table if table else FULL_TABLE[:n]

    if n > len(alphabet):
        raise ValueError('base %d exceeds table length %d' % (n, len(alphabet)))

    if num == 0:
        return alphabet[0]

    # Collect digits least significant first, then flip them.
    digits = []
    while num:
        num, remainder = divmod(num, n)
        digits.append(alphabet[remainder])
    digits.reverse()
    return ''.join(digits)
2734
2735
def decode_packed_codes(code):
    """Expand JavaScript that was compressed into a packed call.

    The payload, base, symbol count and '|'-separated symbol list are taken
    from the trailing argument list; every word of the payload is then
    replaced by the symbol it stands for.
    """
    packed = re.search(
        r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)",
        code)
    obfuscated_code, base, count, symbols = packed.groups()
    base, count = int(base), int(count)
    symbols = symbols.split('|')

    # Map each index, written in the given base, to its symbol; an empty
    # symbol means the word stands for itself.
    symbol_table = {}
    for index in range(count - 1, -1, -1):
        key = encode_base_n(index, base)
        symbol_table[key] = symbols[index] or key

    def _lookup(word):
        return symbol_table[word.group(0)]

    return re.sub(r'\b(\w+)\b', _lookup, obfuscated_code)