youtube_dl/utils.py

   1 #!/usr/bin/env python
   2 # -*- coding: utf-8 -*-
   3
   4 import calendar
   5 import codecs
   6 import contextlib
   7 import ctypes
   8 import datetime
   9 import email.utils
  10 import errno
  11 import getpass
  12 import gzip
  13 import itertools
  14 import io
  15 import json
  16 import locale
  17 import math
  18 import os
  19 import pipes
  20 import platform
  21 import re
  22 import ssl
  23 import socket
  24 import struct
  25 import subprocess
  26 import sys
  27 import tempfile
  28 import traceback
  29 import xml.etree.ElementTree
  30 import zlib
  31
  32 try:
  33     import urllib.request as compat_urllib_request
  34 except ImportError: # Python 2
  35     import urllib2 as compat_urllib_request
  36
  37 try:
  38     import urllib.error as compat_urllib_error
  39 except ImportError: # Python 2
  40     import urllib2 as compat_urllib_error
  41
  42 try:
  43     import urllib.parse as compat_urllib_parse
  44 except ImportError: # Python 2
  45     import urllib as compat_urllib_parse
  46
  47 try:
  48     from urllib.parse import urlparse as compat_urllib_parse_urlparse
  49 except ImportError: # Python 2
  50     from urlparse import urlparse as compat_urllib_parse_urlparse
  51
  52 try:
  53     import urllib.parse as compat_urlparse
  54 except ImportError: # Python 2
  55     import urlparse as compat_urlparse
  56
  57 try:
  58     import http.cookiejar as compat_cookiejar
  59 except ImportError: # Python 2
  60     import cookielib as compat_cookiejar
  61
  62 try:
  63     import html.entities as compat_html_entities
  64 except ImportError: # Python 2
  65     import htmlentitydefs as compat_html_entities
  66
  67 try:
  68     import html.parser as compat_html_parser
  69 except ImportError: # Python 2
  70     import HTMLParser as compat_html_parser
  71
  72 try:
  73     import http.client as compat_http_client
  74 except ImportError: # Python 2
  75     import httplib as compat_http_client
  76
  77 try:
  78     from urllib.error import HTTPError as compat_HTTPError
  79 except ImportError:  # Python 2
  80     from urllib2 import HTTPError as compat_HTTPError
  81
  82 try:
  83     from urllib.request import urlretrieve as compat_urlretrieve
  84 except ImportError:  # Python 2
  85     from urllib import urlretrieve as compat_urlretrieve
  86
  87
  88 try:
  89     from subprocess import DEVNULL
  90     compat_subprocess_get_DEVNULL = lambda: DEVNULL
  91 except ImportError:
  92     compat_subprocess_get_DEVNULL = lambda: open(os.path.devnull, 'w')
  93
  94 try:
  95     from urllib.parse import unquote as compat_urllib_parse_unquote
  96 except ImportError:
  97     def compat_urllib_parse_unquote(string, encoding='utf-8', errors='replace'):
  98         if string == '':
  99             return string
 100         res = string.split('%')
 101         if len(res) == 1:
 102             return string
 103         if encoding is None:
 104             encoding = 'utf-8'
 105         if errors is None:
 106             errors = 'replace'
 107         # pct_sequence: contiguous sequence of percent-encoded bytes, decoded
 108         pct_sequence = b''
 109         string = res[0]
 110         for item in res[1:]:
 111             try:
 112                 if not item:
 113                     raise ValueError
 114                 pct_sequence += item[:2].decode('hex')
 115                 rest = item[2:]
 116                 if not rest:
 117                     # This segment was just a single percent-encoded character.
 118                     # May be part of a sequence of code units, so delay decoding.
 119                     # (Stored in pct_sequence).
 120                     continue
 121             except ValueError:
 122                 rest = '%' + item
 123             # Encountered non-percent-encoded characters. Flush the current
 124             # pct_sequence.
 125             string += pct_sequence.decode(encoding, errors) + rest
 126             pct_sequence = b''
 127         if pct_sequence:
 128             # Flush the final pct_sequence
 129             string += pct_sequence.decode(encoding, errors)
 130         return string
 131
 132
 133 try:
 134     from urllib.parse import parse_qs as compat_parse_qs
 135 except ImportError: # Python 2
 136     # HACK: The following is the correct parse_qs implementation from cpython 3's stdlib.
 137     # Python 2's version is apparently totally broken
 138
 139     def _parse_qsl(qs, keep_blank_values=False, strict_parsing=False,
 140                 encoding='utf-8', errors='replace'):
 141         qs, _coerce_result = qs, unicode
 142         pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')]
 143         r = []
 144         for name_value in pairs:
 145             if not name_value and not strict_parsing:
 146                 continue
 147             nv = name_value.split('=', 1)
 148             if len(nv) != 2:
 149                 if strict_parsing:
 150                     raise ValueError("bad query field: %r" % (name_value,))
 151                 # Handle case of a control-name with no equal sign
 152                 if keep_blank_values:
 153                     nv.append('')
 154                 else:
 155                     continue
 156             if len(nv[1]) or keep_blank_values:
 157                 name = nv[0].replace('+', ' ')
 158                 name = compat_urllib_parse_unquote(
 159                     name, encoding=encoding, errors=errors)
 160                 name = _coerce_result(name)
 161                 value = nv[1].replace('+', ' ')
 162                 value = compat_urllib_parse_unquote(
 163                     value, encoding=encoding, errors=errors)
 164                 value = _coerce_result(value)
 165                 r.append((name, value))
 166         return r
 167
 168     def compat_parse_qs(qs, keep_blank_values=False, strict_parsing=False,
 169                 encoding='utf-8', errors='replace'):
 170         parsed_result = {}
 171         pairs = _parse_qsl(qs, keep_blank_values, strict_parsing,
 172                         encoding=encoding, errors=errors)
 173         for name, value in pairs:
 174             if name in parsed_result:
 175                 parsed_result[name].append(value)
 176             else:
 177                 parsed_result[name] = [value]
 178         return parsed_result
 179
# Text string type: `unicode` where that builtin exists, otherwise `str`.
try:
    compat_str = unicode # Python 2
except NameError:
    compat_str = str

# Code point -> one-character text string: `unichr` where it exists,
# otherwise `chr`.
try:
    compat_chr = unichr # Python 2
except NameError:
    compat_chr = chr

# Exception type for XML parse failures; falls back to expat's error class
# when ElementTree does not export ParseError.
try:
    from xml.etree.ElementTree import ParseError as compat_xml_parse_error
except ImportError:  # Python 2.6
    from xml.parsers.expat import ExpatError as compat_xml_parse_error

# Shell quoting helper; the fallback always wraps the string in single
# quotes and splices each embedded single quote in as '"'"'.
try:
    from shlex import quote as shlex_quote
except ImportError:  # Python < 3.3
    def shlex_quote(s):
        return "'" + s.replace("'", "'\"'\"'") + "'"
 200
 201
 202 def compat_ord(c):
 203     if type(c) is int: return c
 204     else: return ord(c)
 205
# This is not clearly defined otherwise
# (the type of a compiled regular expression object, obtained by compiling
# an empty pattern)
compiled_regex_type = type(re.compile(''))

# Default HTTP request headers; YoutubeDLHandler.http_request adds each of
# these to every request it processes.
std_headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0 (Chrome)',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
}
 216
 217 def preferredencoding():
 218     """Get preferred encoding.
 219
 220     Returns the best encoding scheme for the system, based on
 221     locale.getpreferredencoding() and some further tweaks.
 222     """
 223     try:
 224         pref = locale.getpreferredencoding()
 225         u'TEST'.encode(pref)
 226     except:
 227         pref = 'UTF-8'
 228
 229     return pref
 230
 231 if sys.version_info < (3,0):
 232     def compat_print(s):
 233         print(s.encode(preferredencoding(), 'xmlcharrefreplace'))
 234 else:
 235     def compat_print(s):
 236         assert type(s) == type(u'')
 237         print(s)
 238
 239
 240 def write_json_file(obj, fn):
 241     """ Encode obj as JSON and write it to fn, atomically """
 242
 243     args = {
 244         'suffix': '.tmp',
 245         'prefix': os.path.basename(fn) + '.',
 246         'dir': os.path.dirname(fn),
 247         'delete': False,
 248     }
 249
 250     # In Python 2.x, json.dump expects a bytestream.
 251     # In Python 3.x, it writes to a character stream
 252     if sys.version_info < (3, 0):
 253         args['mode'] = 'wb'
 254     else:
 255         args.update({
 256             'mode': 'w',
 257             'encoding': 'utf-8',
 258         })
 259
 260     tf = tempfile.NamedTemporaryFile(**args)
 261
 262     try:
 263         with tf:
 264             json.dump(obj, tf)
 265         os.rename(tf.name, fn)
 266     except:
 267         try:
 268             os.remove(tf.name)
 269         except OSError:
 270             pass
 271         raise
 272
 273
 274 if sys.version_info >= (2, 7):
 275     def find_xpath_attr(node, xpath, key, val):
 276         """ Find the xpath xpath[@key=val] """
 277         assert re.match(r'^[a-zA-Z-]+$', key)
 278         assert re.match(r'^[a-zA-Z0-9@\s:._-]*$', val)
 279         expr = xpath + u"[@%s='%s']" % (key, val)
 280         return node.find(expr)
 281 else:
 282     def find_xpath_attr(node, xpath, key, val):
 283         for f in node.findall(xpath):
 284             if f.attrib.get(key) == val:
 285                 return f
 286         return None
 287
 288 # On python2.6 the xml.etree.ElementTree.Element methods don't support
 289 # the namespace parameter
 290 def xpath_with_ns(path, ns_map):
 291     components = [c.split(':') for c in path.split('/')]
 292     replaced = []
 293     for c in components:
 294         if len(c) == 1:
 295             replaced.append(c[0])
 296         else:
 297             ns, tag = c
 298             replaced.append('{%s}%s' % (ns_map[ns], tag))
 299     return '/'.join(replaced)
 300
 301 def htmlentity_transform(matchobj):
 302     """Transforms an HTML entity to a character.
 303
 304     This function receives a match object and is intended to be used with
 305     the re.sub() function.
 306     """
 307     entity = matchobj.group(1)
 308
 309     # Known non-numeric HTML entity
 310     if entity in compat_html_entities.name2codepoint:
 311         return compat_chr(compat_html_entities.name2codepoint[entity])
 312
 313     mobj = re.match(u'(?u)#(x?\\d+)', entity)
 314     if mobj is not None:
 315         numstr = mobj.group(1)
 316         if numstr.startswith(u'x'):
 317             base = 16
 318             numstr = u'0%s' % numstr
 319         else:
 320             base = 10
 321         return compat_chr(int(numstr, base))
 322
 323     # Unknown entity in name, return its literal representation
 324     return (u'&%s;' % entity)
 325
# Overwrite the parser module's module-level `locatestarttagend` pattern (the
# name suggests it locates the end of a start tag) with a backported version.
compat_html_parser.locatestarttagend = re.compile(r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re.VERBOSE) # backport bugfix
 327 class BaseHTMLParser(compat_html_parser.HTMLParser):
 328     def __init(self):
 329         compat_html_parser.HTMLParser.__init__(self)
 330         self.html = None
 331
 332     def loads(self, html):
 333         self.html = html
 334         self.feed(html)
 335         self.close()
 336
class AttrParser(BaseHTMLParser):
    """Modified HTMLParser that isolates a tag with the specified attribute

    After loads() has been called, get_result() returns the text between the
    start tag carrying attribute=value and its matching end tag, or None if
    no complete element was found.
    """
    def __init__(self, attribute, value):
        self.attribute = attribute
        self.value = value
        # Becomes [tag, start_position, end_position] once fully populated;
        # positions are whatever getpos() returns.
        self.result = None
        # True while the parser is inside the matched element
        self.started = False
        # Number of currently open tags per tag name, counted from the match
        self.depth = {}
        # True from the matching start tag until the next parser event
        self.watch_startpos = False
        # Number of parse errors skipped so far
        self.error_count = 0
        BaseHTMLParser.__init__(self)

    def error(self, message):
        # Give up after more than 10 errors, or as soon as an error occurs
        # inside the element being extracted.
        if self.error_count > 10 or self.started:
            raise compat_html_parser.HTMLParseError(message, self.getpos())
        self.rawdata = '\n'.join(self.html.split('\n')[self.getpos()[0]:]) # skip one line
        self.error_count += 1
        self.goahead(1)

    def handle_starttag(self, tag, attrs):
        attrs = dict(attrs)
        if self.started:
            # A nested start tag is the first event after the matched tag:
            # record the start position now if it is still pending.
            self.find_startpos(None)
        if self.attribute in attrs and attrs[self.attribute] == self.value:
            self.result = [tag]
            self.started = True
            self.watch_startpos = True
        if self.started:
            if not tag in self.depth: self.depth[tag] = 0
            self.depth[tag] += 1

    def handle_endtag(self, tag):
        if self.started:
            if tag in self.depth: self.depth[tag] -= 1
            # The matched element ends when the count for its own tag name
            # drops back to zero.
            if self.depth[self.result[0]] == 0:
                self.started = False
                self.result.append(self.getpos())

    def find_startpos(self, x):
        """Needed to put the start position of the result (self.result[1])
        after the opening tag with the requested id"""
        if self.watch_startpos:
            self.watch_startpos = False
            self.result.append(self.getpos())
    # Every other kind of parser event also triggers the start position check
    handle_entityref = handle_charref = handle_data = handle_comment = \
    handle_decl = handle_pi = unknown_decl = find_startpos

    def get_result(self):
        """Return the stripped text of the matched element, or None."""
        if self.result is None:
            return None
        # Both the start and the end position must have been recorded
        if len(self.result) != 3:
            return None
        lines = self.html.split('\n')
        # Line numbers in the positions are treated as 1-based here
        lines = lines[self.result[1][0]-1:self.result[2][0]]
        lines[0] = lines[0][self.result[1][1]:]
        if len(lines) == 1:
            # Start and end are on the same line: the end offset has to be
            # shifted by the prefix that was just cut off.
            lines[-1] = lines[-1][:self.result[2][1]-self.result[1][1]]
        lines[-1] = lines[-1][:self.result[2][1]]
        return '\n'.join(lines).strip()
# Hack for https://github.com/rg3/youtube-dl/issues/662
# On interpreters older than 2.7.3, treat the literal text "</scr'+'ipt>" as
# already consumed (skip over it) instead of handing it to the stock
# parse_endtag; every other end tag is parsed as usual.
if sys.version_info < (2, 7, 3):
    AttrParser.parse_endtag = (lambda self, i:
        i + len("</scr'+'ipt>")
        if self.rawdata[i:].startswith("</scr'+'ipt>")
        else compat_html_parser.HTMLParser.parse_endtag(self, i))
 402
 403 def get_element_by_id(id, html):
 404     """Return the content of the tag with the specified ID in the passed HTML document"""
 405     return get_element_by_attribute("id", id, html)
 406
 407 def get_element_by_attribute(attribute, value, html):
 408     """Return the content of the tag with the specified attribute in the passed HTML document"""
 409     parser = AttrParser(attribute, value)
 410     try:
 411         parser.loads(html)
 412     except compat_html_parser.HTMLParseError:
 413         pass
 414     return parser.get_result()
 415
 416 class MetaParser(BaseHTMLParser):
 417     """
 418     Modified HTMLParser that isolates a meta tag with the specified name
 419     attribute.
 420     """
 421     def __init__(self, name):
 422         BaseHTMLParser.__init__(self)
 423         self.name = name
 424         self.content = None
 425         self.result = None
 426
 427     def handle_starttag(self, tag, attrs):
 428         if tag != 'meta':
 429             return
 430         attrs = dict(attrs)
 431         if attrs.get('name') == self.name:
 432             self.result = attrs.get('content')
 433
 434     def get_result(self):
 435         return self.result
 436
 437 def get_meta_content(name, html):
 438     """
 439     Return the content attribute from the meta tag with the given name attribute.
 440     """
 441     parser = MetaParser(name)
 442     try:
 443         parser.loads(html)
 444     except compat_html_parser.HTMLParseError:
 445         pass
 446     return parser.get_result()
 447
 448
 449 def clean_html(html):
 450     """Clean an HTML snippet into a readable string"""
 451     # Newline vs <br />
 452     html = html.replace('\n', ' ')
 453     html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
 454     html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
 455     # Strip html tags
 456     html = re.sub('<.*?>', '', html)
 457     # Replace html entities
 458     html = unescapeHTML(html)
 459     return html.strip()
 460
 461
 462 def sanitize_open(filename, open_mode):
 463     """Try to open the given filename, and slightly tweak it if this fails.
 464
 465     Attempts to open the given filename. If this fails, it tries to change
 466     the filename slightly, step by step, until it's either able to open it
 467     or it fails and raises a final exception, like the standard open()
 468     function.
 469
 470     It returns the tuple (stream, definitive_file_name).
 471     """
 472     try:
 473         if filename == u'-':
 474             if sys.platform == 'win32':
 475                 import msvcrt
 476                 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
 477             return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
 478         stream = open(encodeFilename(filename), open_mode)
 479         return (stream, filename)
 480     except (IOError, OSError) as err:
 481         if err.errno in (errno.EACCES,):
 482             raise
 483
 484         # In case of error, try to remove win32 forbidden chars
 485         alt_filename = os.path.join(
 486                         re.sub(u'[/<>:"\\|\\\\?\\*]', u'#', path_part)
 487                         for path_part in os.path.split(filename)
 488                        )
 489         if alt_filename == filename:
 490             raise
 491         else:
 492             # An exception here should be caught in the caller
 493             stream = open(encodeFilename(filename), open_mode)
 494             return (stream, alt_filename)
 495
 496
 497 def timeconvert(timestr):
 498     """Convert RFC 2822 defined time string into system timestamp"""
 499     timestamp = None
 500     timetuple = email.utils.parsedate_tz(timestr)
 501     if timetuple is not None:
 502         timestamp = email.utils.mktime_tz(timetuple)
 503     return timestamp
 504
 505 def sanitize_filename(s, restricted=False, is_id=False):
 506     """Sanitizes a string so it could be used as part of a filename.
 507     If restricted is set, use a stricter subset of allowed characters.
 508     Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
 509     """
 510     def replace_insane(char):
 511         if char == '?' or ord(char) < 32 or ord(char) == 127:
 512             return ''
 513         elif char == '"':
 514             return '' if restricted else '\''
 515         elif char == ':':
 516             return '_-' if restricted else ' -'
 517         elif char in '\\/|*<>':
 518             return '_'
 519         if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
 520             return '_'
 521         if restricted and ord(char) > 127:
 522             return '_'
 523         return char
 524
 525     result = u''.join(map(replace_insane, s))
 526     if not is_id:
 527         while '__' in result:
 528             result = result.replace('__', '_')
 529         result = result.strip('_')
 530         # Common case of "Foreign band name - English song title"
 531         if restricted and result.startswith('-_'):
 532             result = result[2:]
 533         if not result:
 534             result = '_'
 535     return result
 536
 537 def orderedSet(iterable):
 538     """ Remove all duplicates from the input iterable """
 539     res = []
 540     for el in iterable:
 541         if el not in res:
 542             res.append(el)
 543     return res
 544
 545
 546 def unescapeHTML(s):
 547     if s is None:
 548         return None
 549     assert type(s) == compat_str
 550
 551     result = re.sub(r'(?u)&(.+?);', htmlentity_transform, s)
 552     return result
 553
 554
 555 def encodeFilename(s, for_subprocess=False):
 556     """
 557     @param s The name of the file
 558     """
 559
 560     assert type(s) == compat_str
 561
 562     # Python 3 has a Unicode API
 563     if sys.version_info >= (3, 0):
 564         return s
 565
 566     if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
 567         # Pass u'' directly to use Unicode APIs on Windows 2000 and up
 568         # (Detecting Windows NT 4 is tricky because 'major >= 4' would
 569         # match Windows 9x series as well. Besides, NT 4 is obsolete.)
 570         if not for_subprocess:
 571             return s
 572         else:
 573             # For subprocess calls, encode with locale encoding
 574             # Refer to http://stackoverflow.com/a/9951851/35070
 575             encoding = preferredencoding()
 576     else:
 577         encoding = sys.getfilesystemencoding()
 578     if encoding is None:
 579         encoding = 'utf-8'
 580     return s.encode(encoding, 'ignore')
 581
 582
 583 def encodeArgument(s):
 584     if not isinstance(s, compat_str):
 585         # Legacy code that uses byte strings
 586         # Uncomment the following line after fixing all post processors
 587         #assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
 588         s = s.decode('ascii')
 589     return encodeFilename(s, True)
 590
 591
 592 def decodeOption(optval):
 593     if optval is None:
 594         return optval
 595     if isinstance(optval, bytes):
 596         optval = optval.decode(preferredencoding())
 597
 598     assert isinstance(optval, compat_str)
 599     return optval
 600
 601 def formatSeconds(secs):
 602     if secs > 3600:
 603         return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
 604     elif secs > 60:
 605         return '%d:%02d' % (secs // 60, secs % 60)
 606     else:
 607         return '%d' % secs
 608
 609
 610 def make_HTTPS_handler(opts_no_check_certificate, **kwargs):
 611     if sys.version_info < (3, 2):
 612         import httplib
 613
 614         class HTTPSConnectionV3(httplib.HTTPSConnection):
 615             def __init__(self, *args, **kwargs):
 616                 httplib.HTTPSConnection.__init__(self, *args, **kwargs)
 617
 618             def connect(self):
 619                 sock = socket.create_connection((self.host, self.port), self.timeout)
 620                 if getattr(self, '_tunnel_host', False):
 621                     self.sock = sock
 622                     self._tunnel()
 623                 try:
 624                     self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_SSLv3)
 625                 except ssl.SSLError:
 626                     self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_SSLv23)
 627
 628         class HTTPSHandlerV3(compat_urllib_request.HTTPSHandler):
 629             def https_open(self, req):
 630                 return self.do_open(HTTPSConnectionV3, req)
 631         return HTTPSHandlerV3(**kwargs)
 632     else:
 633         context = ssl.SSLContext(ssl.PROTOCOL_SSLv3)
 634         context.verify_mode = (ssl.CERT_NONE
 635                                if opts_no_check_certificate
 636                                else ssl.CERT_REQUIRED)
 637         context.set_default_verify_paths()
 638         try:
 639             context.load_default_certs()
 640         except AttributeError:
 641             pass  # Python < 3.4
 642         return compat_urllib_request.HTTPSHandler(context=context, **kwargs)
 643
 644 class ExtractorError(Exception):
 645     """Error during info extraction."""
 646     def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
 647         """ tb, if given, is the original traceback (so that it can be printed out).
 648         If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
 649         """
 650
 651         if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
 652             expected = True
 653         if video_id is not None:
 654             msg = video_id + ': ' + msg
 655         if not expected:
 656             msg = msg + u'; please report this issue on https://yt-dl.org/bug . Be sure to call youtube-dl with the --verbose flag and include its complete output. Make sure you are using the latest version; type  youtube-dl -U  to update.'
 657         super(ExtractorError, self).__init__(msg)
 658
 659         self.traceback = tb
 660         self.exc_info = sys.exc_info()  # preserve original exception
 661         self.cause = cause
 662         self.video_id = video_id
 663
 664     def format_traceback(self):
 665         if self.traceback is None:
 666             return None
 667         return u''.join(traceback.format_tb(self.traceback))
 668
 669
 670 class RegexNotFoundError(ExtractorError):
 671     """Error when a regex didn't match"""
 672     pass
 673
 674
 675 class DownloadError(Exception):
 676     """Download Error exception.
 677
 678     This exception may be thrown by FileDownloader objects if they are not
 679     configured to continue on errors. They will contain the appropriate
 680     error message.
 681     """
 682     def __init__(self, msg, exc_info=None):
 683         """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
 684         super(DownloadError, self).__init__(msg)
 685         self.exc_info = exc_info
 686
 687
 688 class SameFileError(Exception):
 689     """Same File exception.
 690
 691     This exception will be thrown by FileDownloader objects if they detect
 692     multiple files would have to be downloaded to the same file on disk.
 693     """
 694     pass
 695
 696
 697 class PostProcessingError(Exception):
 698     """Post Processing exception.
 699
 700     This exception may be raised by PostProcessor's .run() method to
 701     indicate an error in the postprocessing task.
 702     """
 703     def __init__(self, msg):
 704         self.msg = msg
 705
 706 class MaxDownloadsReached(Exception):
 707     """ --max-downloads limit has been reached. """
 708     pass
 709
 710
 711 class UnavailableVideoError(Exception):
 712     """Unavailable Format exception.
 713
 714     This exception will be thrown when a video is requested
 715     in a format that is not available for that video.
 716     """
 717     pass
 718
 719
 720 class ContentTooShortError(Exception):
 721     """Content Too Short exception.
 722
 723     This exception may be raised by FileDownloader objects when a file they
 724     download is too small for what the server announced first, indicating
 725     the connection was probably interrupted.
 726     """
 727     # Both in bytes
 728     downloaded = None
 729     expected = None
 730
 731     def __init__(self, downloaded, expected):
 732         self.downloaded = downloaded
 733         self.expected = expected
 734
 735 class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
 736     """Handler for HTTP requests and responses.
 737
 738     This class, when installed with an OpenerDirector, automatically adds
 739     the standard headers to every HTTP request and handles gzipped and
 740     deflated responses from web servers. If compression is to be avoided in
 741     a particular request, the original request in the program code only has
 742     to include the HTTP header "Youtubedl-No-Compression", which will be
 743     removed before making the real request.
 744
 745     Part of this code was copied from:
 746
 747     http://techknack.net/python-urllib2-handlers/
 748
 749     Andrew Rowls, the author of that code, agreed to release it to the
 750     public domain.
 751     """
 752
 753     @staticmethod
 754     def deflate(data):
 755         try:
 756             return zlib.decompress(data, -zlib.MAX_WBITS)
 757         except zlib.error:
 758             return zlib.decompress(data)
 759
 760     @staticmethod
 761     def addinfourl_wrapper(stream, headers, url, code):
 762         if hasattr(compat_urllib_request.addinfourl, 'getcode'):
 763             return compat_urllib_request.addinfourl(stream, headers, url, code)
 764         ret = compat_urllib_request.addinfourl(stream, headers, url)
 765         ret.code = code
 766         return ret
 767
 768     def http_request(self, req):
 769         for h,v in std_headers.items():
 770             if h in req.headers:
 771                 del req.headers[h]
 772             req.add_header(h, v)
 773         if 'Youtubedl-no-compression' in req.headers:
 774             if 'Accept-encoding' in req.headers:
 775                 del req.headers['Accept-encoding']
 776             del req.headers['Youtubedl-no-compression']
 777         if 'Youtubedl-user-agent' in req.headers:
 778             if 'User-agent' in req.headers:
 779                 del req.headers['User-agent']
 780             req.headers['User-agent'] = req.headers['Youtubedl-user-agent']
 781             del req.headers['Youtubedl-user-agent']
 782         return req
 783
 784     def http_response(self, req, resp):
 785         old_resp = resp
 786         # gzip
 787         if resp.headers.get('Content-encoding', '') == 'gzip':
 788             content = resp.read()
 789             gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
 790             try:
 791                 uncompressed = io.BytesIO(gz.read())
 792             except IOError as original_ioerror:
 793                 # There may be junk add the end of the file
 794                 # See http://stackoverflow.com/q/4928560/35070 for details
 795                 for i in range(1, 1024):
 796                     try:
 797                         gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
 798                         uncompressed = io.BytesIO(gz.read())
 799                     except IOError:
 800                         continue
 801                     break
 802                 else:
 803                     raise original_ioerror
 804             resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
 805             resp.msg = old_resp.msg
 806         # deflate
 807         if resp.headers.get('Content-encoding', '') == 'deflate':
 808             gz = io.BytesIO(self.deflate(resp.read()))
 809             resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
 810             resp.msg = old_resp.msg
 811         return resp
 812
 813     https_request = http_request
 814     https_response = http_response
 815
 816
 817 def parse_iso8601(date_str, delimiter='T'):
 818     """ Return a UNIX timestamp from the given date """
 819
 820     if date_str is None:
 821         return None
 822
 823     m = re.search(
 824         r'Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$',
 825         date_str)
 826     if not m:
 827         timezone = datetime.timedelta()
 828     else:
 829         date_str = date_str[:-len(m.group(0))]
 830         if not m.group('sign'):
 831             timezone = datetime.timedelta()
 832         else:
 833             sign = 1 if m.group('sign') == '+' else -1
 834             timezone = datetime.timedelta(
 835                 hours=sign * int(m.group('hours')),
 836                 minutes=sign * int(m.group('minutes')))
 837     date_format =  '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
 838     dt = datetime.datetime.strptime(date_str, date_format) - timezone
 839     return calendar.timegm(dt.timetuple())
 840
 841
 842 def unified_strdate(date_str):
 843     """Return a string with the date in the format YYYYMMDD"""
 844
 845     if date_str is None:
 846         return None
 847
 848     upload_date = None
 849     #Replace commas
 850     date_str = date_str.replace(',', ' ')
 851     # %z (UTC offset) is only supported in python>=3.2
 852     date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str)
 853     format_expressions = [
 854         '%d %B %Y',
 855         '%d %b %Y',
 856         '%B %d %Y',
 857         '%b %d %Y',
 858         '%b %dst %Y %I:%M%p',
 859         '%b %dnd %Y %I:%M%p',
 860         '%b %dth %Y %I:%M%p',
 861         '%Y-%m-%d',
 862         '%Y/%m/%d',
 863         '%d.%m.%Y',
 864         '%d/%m/%Y',
 865         '%d/%m/%y',
 866         '%Y/%m/%d %H:%M:%S',
 867         '%Y-%m-%d %H:%M:%S',
 868         '%d.%m.%Y %H:%M',
 869         '%d.%m.%Y %H.%M',
 870         '%Y-%m-%dT%H:%M:%SZ',
 871         '%Y-%m-%dT%H:%M:%S.%fZ',
 872         '%Y-%m-%dT%H:%M:%S.%f0Z',
 873         '%Y-%m-%dT%H:%M:%S',
 874         '%Y-%m-%dT%H:%M:%S.%f',
 875         '%Y-%m-%dT%H:%M',
 876     ]
 877     for expression in format_expressions:
 878         try:
 879             upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
 880         except ValueError:
 881             pass
 882     if upload_date is None:
 883         timetuple = email.utils.parsedate_tz(date_str)
 884         if timetuple:
 885             upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
 886     return upload_date
 887
 888 def determine_ext(url, default_ext=u'unknown_video'):
 889     if url is None:
 890         return default_ext
 891     guess = url.partition(u'?')[0].rpartition(u'.')[2]
 892     if re.match(r'^[A-Za-z0-9]+$', guess):
 893         return guess
 894     else:
 895         return default_ext
 896
 897 def subtitles_filename(filename, sub_lang, sub_format):
 898     return filename.rsplit('.', 1)[0] + u'.' + sub_lang + u'.' + sub_format
 899
 900 def date_from_str(date_str):
 901     """
 902     Return a datetime object from a string in the format YYYYMMDD or
 903     (now|today)[+-][0-9](day|week|month|year)(s)?"""
 904     today = datetime.date.today()
 905     if date_str == 'now'or date_str == 'today':
 906         return today
 907     match = re.match('(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
 908     if match is not None:
 909         sign = match.group('sign')
 910         time = int(match.group('time'))
 911         if sign == '-':
 912             time = -time
 913         unit = match.group('unit')
 914         #A bad aproximation?
 915         if unit == 'month':
 916             unit = 'day'
 917             time *= 30
 918         elif unit == 'year':
 919             unit = 'day'
 920             time *= 365
 921         unit += 's'
 922         delta = datetime.timedelta(**{unit: time})
 923         return today + delta
 924     return datetime.datetime.strptime(date_str, "%Y%m%d").date()
 925
 926 def hyphenate_date(date_str):
 927     """
 928     Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
 929     match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
 930     if match is not None:
 931         return '-'.join(match.groups())
 932     else:
 933         return date_str
 934
 935 class DateRange(object):
 936     """Represents a time interval between two dates"""
 937     def __init__(self, start=None, end=None):
 938         """start and end must be strings in the format accepted by date"""
 939         if start is not None:
 940             self.start = date_from_str(start)
 941         else:
 942             self.start = datetime.datetime.min.date()
 943         if end is not None:
 944             self.end = date_from_str(end)
 945         else:
 946             self.end = datetime.datetime.max.date()
 947         if self.start > self.end:
 948             raise ValueError('Date range: "%s" , the start date must be before the end date' % self)
 949     @classmethod
 950     def day(cls, day):
 951         """Returns a range that only contains the given day"""
 952         return cls(day,day)
 953     def __contains__(self, date):
 954         """Check if the date is in the range"""
 955         if not isinstance(date, datetime.date):
 956             date = date_from_str(date)
 957         return self.start <= date <= self.end
 958     def __str__(self):
 959         return '%s - %s' % ( self.start.isoformat(), self.end.isoformat())
 960
 961
 962 def platform_name():
 963     """ Returns the platform name as a compat_str """
 964     res = platform.platform()
 965     if isinstance(res, bytes):
 966         res = res.decode(preferredencoding())
 967
 968     assert isinstance(res, compat_str)
 969     return res
 970
 971
def _windows_write_string(s, out):
    """ Returns True if the string was written using special methods,
    False if it has yet to be written out.

    The special method is the Win32 WriteConsoleW call, used only when out
    is backed by file descriptor 1 or 2 and the corresponding standard
    handle is an actual console.  Raises OSError if WriteConsoleW reports
    failure."""
    # Adapted from http://stackoverflow.com/a/3259271/35070

    import ctypes
    import ctypes.wintypes

    # File descriptor -> argument for GetStdHandle
    # (-11 is STD_OUTPUT_HANDLE, -12 is STD_ERROR_HANDLE in the Win32 API)
    WIN_OUTPUT_IDS = {
        1: -11,
        2: -12,
    }

    try:
        fileno = out.fileno()
    except AttributeError:
        # If the output stream doesn't have a fileno, it's virtual
        return False
    if fileno not in WIN_OUTPUT_IDS:
        return False

    # Bind the kernel32 functions with explicit prototypes
    GetStdHandle = ctypes.WINFUNCTYPE(
        ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
        ("GetStdHandle", ctypes.windll.kernel32))
    h = GetStdHandle(WIN_OUTPUT_IDS[fileno])

    WriteConsoleW = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
        ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
        ctypes.wintypes.LPVOID)(("WriteConsoleW", ctypes.windll.kernel32))
    # Receives the count reported back by each WriteConsoleW call
    written = ctypes.wintypes.DWORD(0)

    GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)(("GetFileType", ctypes.windll.kernel32))
    FILE_TYPE_CHAR = 0x0002
    FILE_TYPE_REMOTE = 0x8000
    GetConsoleMode = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
        ctypes.POINTER(ctypes.wintypes.DWORD))(
        ("GetConsoleMode", ctypes.windll.kernel32))
    INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value

    def not_a_console(handle):
        # True for missing/invalid handles, for handles whose file type
        # (ignoring the REMOTE bit) is not a character device, and for
        # handles on which GetConsoleMode fails (returns 0)
        if handle == INVALID_HANDLE_VALUE or handle is None:
            return True
        return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR
                or GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)

    if not_a_console(h):
        return False

    def next_nonbmp_pos(s):
        # Index of the first character above U+FFFF, or len(s) if none
        try:
            return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
        except StopIteration:
            return len(s)

    while s:
        # Write at most 1024 characters per call, stopping before the next
        # non-BMP character
        count = min(next_nonbmp_pos(s), 1024)

        # count == 0 means s starts with a non-BMP character; it is written
        # on its own with a length of 2 (presumably its two UTF-16 code
        # units - NOTE(review): confirm)
        ret = WriteConsoleW(
            h, s, count if count else 2, ctypes.byref(written), None)
        if ret == 0:
            raise OSError('Failed to write string')
        if not count:  # We just wrote a non-BMP character
            assert written.value == 2
            s = s[1:]
        else:
            assert written.value > 0
            # Continue after whatever the console actually accepted
            s = s[written.value:]
    return True
1042
1043
def write_string(s, out=None, encoding=None):
    """Write the text string s to out (sys.stderr by default) and flush.

    On Windows, when no explicit encoding is requested, the console-specific
    writer is tried first.  Otherwise s is encoded (characters that cannot
    be encoded are dropped) for binary streams, Python 2, and text streams
    exposing a .buffer; any other stream receives s unchanged.
    """
    stream = sys.stderr if out is None else out
    assert type(s) == compat_str

    if (sys.platform == 'win32' and encoding is None
            and hasattr(stream, 'fileno')
            and _windows_write_string(s, stream)):
        return

    # Python 2 lies about mode of sys.stderr
    needs_bytes = (
        'b' in getattr(stream, 'mode', '') or sys.version_info[0] < 3)
    if needs_bytes:
        stream.write(s.encode(encoding or preferredencoding(), 'ignore'))
    elif hasattr(stream, 'buffer'):
        enc = encoding or getattr(stream, 'encoding', None) or preferredencoding()
        stream.buffer.write(s.encode(enc, 'ignore'))
    else:
        stream.write(s)
    stream.flush()
1064
1065
def bytes_to_intlist(bs):
    """Convert a byte string into a list of integer byte values."""
    if not bs:
        return []
    # Indexing bytes yields ints on Python 3 but 1-char strings on Python 2
    if isinstance(bs[0], int):
        return list(bs)
    return list(map(ord, bs))
1073
1074
def intlist_to_bytes(xs):
    """Convert a list of integer byte values into a byte string."""
    if not xs:
        return b''
    if not isinstance(chr(0), bytes):
        return bytes(xs)
    # Python 2: chr() already yields one-byte strings
    return ''.join(map(chr, xs))
1082
1083
def get_cachedir(params={}):
    """Return params['cachedir'] if present, otherwise the 'youtube-dl'
    directory below $XDG_CACHE_HOME (falling back to ~/.cache)."""
    home_cache = os.path.expanduser('~/.cache')
    cache_root = os.environ.get('XDG_CACHE_HOME', home_cache)
    default_dir = os.path.join(cache_root, 'youtube-dl')
    return params.get('cachedir', default_dir)
1088
1089
1090 # Cross-platform file locking
# Defines _lock_file(f, exclusive) and _unlock_file(f) for the current
# platform: LockFileEx/UnlockFileEx via ctypes on Windows, fcntl.lockf
# everywhere else.
if sys.platform == 'win32':
    import ctypes.wintypes
    import msvcrt

    # ctypes mirror of the Win32 OVERLAPPED structure, which
    # LockFileEx/UnlockFileEx take as their last argument
    class OVERLAPPED(ctypes.Structure):
        _fields_ = [
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),
        ]

    kernel32 = ctypes.windll.kernel32
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToUnlockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToUnlockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    # Low and high DWORD of the number of bytes to (un)lock, starting at the
    # offset stored in the OVERLAPPED structure (0 below)
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive):
        """Lock the open file f; raises OSError if LockFileEx fails."""
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        # Kept on the file object because _unlock_file passes it again
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)
        handle = msvcrt.get_osfhandle(f.fileno())
        # dwFlags: 0x2 is LOCKFILE_EXCLUSIVE_LOCK, 0x0 requests a shared lock
        if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
                          whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Locking file failed: %r' % ctypes.FormatError())

    def _unlock_file(f):
        """Release a lock taken by _lock_file; raises OSError on failure."""
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        if not UnlockFileEx(handle, 0,
                            whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())

else:
    import fcntl

    def _lock_file(f, exclusive):
        """Take an exclusive (LOCK_EX) or shared (LOCK_SH) lock on f."""
        fcntl.lockf(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)

    def _unlock_file(f):
        """Release the lock held on f."""
        fcntl.lockf(f, fcntl.LOCK_UN)
1153
1154
class locked_file(object):
    """Context manager around a file that is locked while the block runs.

    The file is opened in __init__; entering the context takes a shared lock
    for mode 'r' and an exclusive lock for 'a' and 'w'.  Leaving the context
    unlocks and closes the file.
    """

    def __init__(self, filename, mode, encoding=None):
        assert mode in ['r', 'a', 'w']
        self.f = io.open(filename, mode, encoding=encoding)
        self.mode = mode

    def __enter__(self):
        exclusive = self.mode != 'r'
        try:
            _lock_file(self.f, exclusive)
        except (IOError, OSError):
            # The Windows implementation of _lock_file raises OSError, which
            # is a different class from IOError on Python 2; catch both so
            # the file never stays open after a failed lock.
            self.f.close()
            raise
        return self

    def __exit__(self, etype, value, traceback):
        try:
            _unlock_file(self.f)
        finally:
            # Close even if unlocking failed
            self.f.close()

    def __iter__(self):
        return iter(self.f)

    def write(self, *args):
        return self.f.write(*args)

    def read(self, *args):
        return self.f.read(*args)
1184
1185
def shell_quote(args):
    """Return the argument list as a single shell-escaped command string.

    bytes arguments (e.g. filenames produced by 'encodeFilename') are decoded
    with the filesystem encoding, falling back to UTF-8, before quoting.
    """
    # pipes.quote is an undocumented alias and the pipes module is gone in
    # Python 3.13; prefer the documented shlex.quote where it exists.
    try:
        from shlex import quote
    except ImportError:  # Python < 3.3
        from pipes import quote

    encoding = sys.getfilesystemencoding()
    if encoding is None:
        encoding = 'utf-8'

    quoted_args = []
    for a in args:
        if isinstance(a, bytes):
            # We may get a filename encoded with 'encodeFilename'
            a = a.decode(encoding)
        quoted_args.append(quote(a))
    return u' '.join(quoted_args)
1197
1198
def takewhile_inclusive(pred, seq):
    """ Like itertools.takewhile, but include the latest evaluated element
        (the first element so that Not pred(e)) """
    for element in seq:
        # Emit first, test afterwards: the failing element is still yielded
        yield element
        if pred(element):
            continue
        break
1206
1207
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use. """
    # The JSON-encoded data is appended as a URL-encoded fragment
    payload = {u'__youtubedl_smuggle': json.dumps(data)}
    return url + u'#' + compat_urllib_parse.urlencode(payload)
1214
1215
def unsmuggle_url(smug_url, default=None):
    """Split a URL produced by smuggle_url into (url, data).

    URLs without smuggled data are returned as (smug_url, default).
    """
    if '#__youtubedl_smuggle' not in smug_url:
        return smug_url, default
    url, _, fragment = smug_url.rpartition(u'#')
    params = compat_parse_qs(fragment)
    return url, json.loads(params[u'__youtubedl_smuggle'][0])
1223
1224
def format_bytes(bytes):
    """Format a byte count with a binary unit suffix, e.g. u'1.50KiB'.

    None yields u'N/A'; a str argument is converted with float() first.
    """
    if bytes is None:
        return u'N/A'
    if type(bytes) is str:
        bytes = float(bytes)
    units = [u'B', u'KiB', u'MiB', u'GiB', u'TiB', u'PiB', u'EiB', u'ZiB', u'YiB']
    # log() is undefined for 0, which simply stays in plain bytes
    exponent = 0 if bytes == 0.0 else int(math.log(bytes, 1024.0))
    unit = units[exponent]
    scaled = float(bytes) / float(1024 ** exponent)
    return u'%.2f%s' % (scaled, unit)
1237
1238
def get_term_width():
    """Return the terminal width in columns, or None if it is unknown.

    $COLUMNS takes precedence (a non-numeric value raises ValueError);
    otherwise the second field of the output of `stty size` is used.
    """
    columns = os.environ.get('COLUMNS', None)
    if columns:
        return int(columns)

    try:
        sp = subprocess.Popen(
            ['stty', 'size'],
            stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        out, err = sp.communicate()
        return int(out.split()[1])
    except Exception:
        # Best effort only (stty missing, no tty, unexpected output), but
        # KeyboardInterrupt/SystemExit must not be swallowed as a bare
        # except would do.
        return None
1253
1254
def month_by_name(name):
    """ Return the number of a month by (locale-independently) English name """
    english_names = (
        u'January', u'February', u'March', u'April', u'May', u'June',
        u'July', u'August', u'September', u'October', u'November', u'December')
    if name in english_names:
        return english_names.index(name) + 1
    # Unknown names (matching is exact and case-sensitive) give None
    return None
1265
1266
def fix_xml_ampersands(xml_str):
    """Replace all the '&' by '&amp;' in XML

    Ampersands that already start one of the five predefined entities or a
    short numeric character reference are left untouched."""
    bare_ampersand = re.compile(
        r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)')
    return bare_ampersand.sub(u'&amp;', xml_str)
1273
1274
def setproctitle(title):
    """Set the process name via prctl(); silently does nothing when
    libc.so.6 cannot be loaded or does not provide prctl."""
    assert isinstance(title, compat_str)
    try:
        libc = ctypes.cdll.LoadLibrary("libc.so.6")
    except OSError:
        return
    # Initialising the buffer from the bytes allocates len + 1 bytes, i.e.
    # the string is NUL-terminated.  A buffer of exactly len bytes is not,
    # and prctl would then read past its end for short titles.
    buf = ctypes.create_string_buffer(title.encode('utf-8'))
    try:
        libc.prctl(15, buf, 0, 0, 0)  # 15 is PR_SET_NAME
    except AttributeError:
        return  # Strange libc, just skip this
1288
1289
def remove_start(s, start):
    """Return s without the prefix start; s is returned unchanged if it
    does not begin with start."""
    return s[len(start):] if s.startswith(start) else s
1294
1295
def remove_end(s, end):
    """Return s without the suffix end; s is returned unchanged if it does
    not finish with end."""
    # An empty suffix must be special-cased: s[:-0] is s[:0], i.e. ''
    if end and s.endswith(end):
        return s[:-len(end)]
    return s
1300
1301
def url_basename(url):
    """Return the last component of the path of url, ignoring leading and
    trailing slashes (u'' if the path has no components)."""
    trimmed_path = compat_urlparse.urlparse(url).path.strip(u'/')
    return trimmed_path.rpartition(u'/')[2]
1305
1306
class HEADRequest(compat_urllib_request.Request):
    """A Request whose HTTP method is always HEAD."""

    def get_method(self):
        return 'HEAD'
1310
1311
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    """Convert v to int, returning default for None and the empty string.

    If get_attr is given, that attribute of v (None when missing) is
    converted instead.  The result is int(v) * invscale // scale.
    """
    if get_attr and v is not None:
        v = getattr(v, get_attr, None)
    if v is None or v == '':
        return default
    return int(v) * invscale // scale
1319
1320
def str_or_none(v, default=None):
    """Return v converted with compat_str, or default if v is None."""
    if v is None:
        return default
    return compat_str(v)
1323
1324
def str_to_int(int_str):
    """Parse an integer written with ',' or '.' as digit group separators
    (e.g. '1,234.567' -> 1234567); None is passed through."""
    if int_str is None:
        return None
    digits_only = re.compile(r'[,\.]').sub(u'', int_str)
    return int(digits_only)
1330
1331
def float_or_none(v, scale=1, invscale=1, default=None):
    """Return float(v) * invscale / scale, or default if v is None."""
    if v is None:
        return default
    return float(v) * invscale / scale
1334
1335
def parse_duration(s):
    """Parse a duration such as '9:12:43', '3h11m53s' or '12.5' into a
    number of seconds.

    Returns None if s is None or not in a recognised format.  The result is
    an int unless a fractional part is present.
    """
    if s is None:
        return None

    match = re.match(
        r'(?:(?:(?P<hours>[0-9]+)[:h])?(?P<mins>[0-9]+)[:m])?(?P<secs>[0-9]+)s?(?::[0-9]+)?(?P<ms>\.[0-9]+)?$', s)
    if not match:
        return None

    hours, mins, fraction = match.group('hours', 'mins', 'ms')
    total = int(match.group('secs'))
    if mins:
        total += int(mins) * 60
        # The pattern only allows hours together with minutes
        if hours:
            total += int(hours) * 60 * 60
    if fraction:
        total += float(fraction)
    return total
1352
1353
def prepend_extension(filename, ext):
    """Insert ext before the real extension of filename,
    e.g. ('video.mp4', 'temp') -> u'video.temp.mp4'."""
    parts = os.path.splitext(filename)
    return u'{0}.{1}{2}'.format(parts[0], ext, parts[1])
1357
1358
def check_executable(exe, args=[]):
    """ Checks if the given binary is installed somewhere in PATH, and returns its name.
    args can be a list of arguments for a short output (like -version) """
    try:
        probe = subprocess.Popen(
            [exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        probe.communicate()
    except OSError:
        # The binary could not be started at all
        return False
    return exe
1367
1368
class PagedList(object):
    """A list that is fetched lazily, one page at a time.

    pagefunc(pagenum) returns an iterable with the entries of the given
    0-based page; every page except the last holds pagesize entries.
    """

    def __init__(self, pagefunc, pagesize):
        self._pagefunc = pagefunc
        self._pagesize = pagesize

    def __len__(self):
        # This is only useful for tests
        return len(self.getslice())

    def getslice(self, start=0, end=None):
        """Return the entries with index in [start, end) as a list,
        fetching only the pages that are needed."""
        collected = []
        size = self._pagesize
        for pagenum in itertools.count(start // size):
            page_first = pagenum * size
            page_limit = page_first + size
            if start >= page_limit:
                continue

            entries = list(self._pagefunc(pagenum))

            # Offsets of the requested range within this page
            lo = start % size if page_first <= start < page_limit else 0
            hi = None
            if end is not None and page_first <= end <= page_limit:
                hi = ((end - 1) % size) + 1

            if lo != 0 or hi is not None:
                entries = entries[lo:hi]
            collected.extend(entries)

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            if len(entries) + lo < size:
                break

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == page_limit:
                break
        return collected
1414
1415
def uppercase_escape(s):
    """Replace every literal \\UXXXXXXXX escape (8 hex digits) in s with the
    character it denotes."""
    decode_escape = codecs.getdecoder('unicode_escape')

    def expand(match):
        # The decoder returns (text, length consumed); only the text matters
        return decode_escape(match.group(0))[0]

    return re.sub(r'\\U[0-9a-fA-F]{8}', expand, s)
1422
# struct_pack / struct_unpack: struct.pack / struct.unpack that also accept
# a text format specification.
try:
    struct.pack(u'!I', 0)
except TypeError:
    # In Python 2.6 (and some 2.7 versions), struct requires a bytes argument
    def struct_pack(spec, *args):
        fmt = spec.encode('ascii') if isinstance(spec, compat_str) else spec
        return struct.pack(fmt, *args)

    def struct_unpack(spec, *args):
        fmt = spec.encode('ascii') if isinstance(spec, compat_str) else spec
        return struct.unpack(fmt, *args)
else:
    # Text specifications work natively; use the originals directly
    struct_pack, struct_unpack = struct.pack, struct.unpack
1439
1440
def read_batch_urls(batch_fd):
    """Read the URLs from a batch file object, one per line, and close it.

    Lines are decoded as UTF-8 if necessary and stripped of a leading byte
    order mark and surrounding whitespace.  Empty lines and lines starting
    with '#', ';' or ']' are skipped.
    """
    # The UTF-8 BOM as it appears when the bytes were mis-decoded one byte
    # per character, and as it appears when decoded correctly as UTF-8
    # (which is what the decode() below produces).
    BOMS = (u'\xef\xbb\xbf', u'\ufeff')

    def fixup(url):
        if not isinstance(url, compat_str):
            url = url.decode('utf-8', 'replace')
        for bom in BOMS:
            if url.startswith(bom):
                url = url[len(bom):]
        url = url.strip()
        if url.startswith(('#', ';', ']')):
            return False
        return url

    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]
1455
1456
def urlencode_postdata(*args, **kargs):
    """URL-encode the arguments like urlencode() does and return the result
    as ASCII bytes, ready to be used as POST data."""
    encoded = compat_urllib_parse.urlencode(*args, **kargs)
    return encoded.encode('ascii')
1459
1460
# etree_iter(n): iterate over the elements of the tree below n
try:
    etree_iter = xml.etree.ElementTree.Element.iter
except AttributeError:  # Python <=2.6
    def etree_iter(n):
        return n.findall('.//*')
1465
1466
def parse_xml(s):
    """Parse the text string s as XML and return the root element.

    Doctype declarations are ignored.  On Python 2, element texts that come
    back as byte strings are decoded from UTF-8.
    """
    class _DoctypeIgnoringBuilder(xml.etree.ElementTree.TreeBuilder):
        def doctype(self, name, pubid, system):
            pass  # Ignore doctypes

    parser = xml.etree.ElementTree.XMLParser(target=_DoctypeIgnoringBuilder())
    # The parser argument is only passed on Python 2.7 and newer
    extra = {}
    if sys.version_info >= (2, 7):
        extra['parser'] = parser
    tree = xml.etree.ElementTree.XML(s.encode('utf-8'), **extra)

    # Fix up XML parser in Python 2.x
    if sys.version_info < (3, 0):
        for node in etree_iter(tree):
            if node.text is not None and not isinstance(node.text, compat_str):
                node.text = node.text.decode('utf-8')
    return tree
1482
1483
# compat_getpass: getpass.getpass, except on Python 2 under Windows where a
# text prompt is encoded with the preferred encoding first.
if sys.version_info >= (3, 0) or sys.platform != 'win32':
    compat_getpass = getpass.getpass
else:
    def compat_getpass(prompt, *args, **kwargs):
        if isinstance(prompt, compat_str):
            prompt = prompt.encode(preferredencoding())
        return getpass.getpass(prompt, *args, **kwargs)
1491
1492
# Numeric value assigned to each US rating label
# (presumably a minimum viewer age in years - confirm against callers)
US_RATINGS = dict((
    ('G', 0),
    ('PG', 10),
    ('PG-13', 13),
    ('R', 16),
    ('NC', 18),
))
1500
1501
def strip_jsonp(code):
    """Remove a JSONP wrapper, i.e. turn 'callback(<payload>);' into
    '<payload>'.  Code that is not wrapped is returned unchanged."""
    jsonp_wrapper = re.compile(r'(?s)^[a-zA-Z0-9_]+\s*\(\s*(.*)\);?\s*?\s*$')
    return jsonp_wrapper.sub(r'\1', code)
1504
1505
def js_to_json(code):
    """Convert a simple JavaScript object literal into JSON.

    Keys are put in double quotes, single-quoted keys and values are
    re-quoted with double quotes, and a trailing comma before ']' is
    dropped.  Single-quoted strings containing '"' trigger an
    AssertionError.
    """
    def fix_kv(m):
        key = m.group(2)
        if key.startswith("'"):
            assert key.endswith("'")
            assert '"' not in key
            key = '"%s"' % key[1:-1]
        elif not key.startswith('"'):
            key = '"%s"' % key

        # Empty when the value is a nested object/array (see lookahead below)
        value = m.group(4)
        if value.startswith("'"):
            assert value.endswith("'")
            assert '"' not in value
            value = '"%s"' % value[1:-1]

        return m.group(1) + key + m.group(3) + value

    # An opening '{' or '[' as value is only looked at, not consumed:
    # otherwise the first key of a nested object would lose the '{' that
    # the pattern needs in front of it and would stay unconverted.
    res = re.sub(r'''(?x)
            ([{,]\s*)
            ("[^"]*"|\'[^\']*\'|[a-z0-9A-Z]+)
            (:\s*)
            ([0-9.]+|true|false|"[^"]*"|\'[^\']*\'|(?=\[|\{))
        ''', fix_kv, code)
    res = re.sub(r',(\s*\])', lambda m: m.group(1), res)
    return res
1532
1533
def qualities(quality_ids):
    """ Get a numeric quality value out of a list of possible values """
    def q(qid):
        # The position in quality_ids is the quality; unknown ids rank lowest
        if qid in quality_ids:
            return quality_ids.index(qid)
        return -1
    return q
1542
1543
# Default output filename template: "<title>-<id>.<ext>", written with
# %-style named placeholders
DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
1545
# subprocess_check_output: subprocess.check_output, with a fallback for
# Python versions that lack it (2.6).
try:
    subprocess_check_output = subprocess.check_output
except AttributeError:
    def subprocess_check_output(*args, **kwargs):
        """Run a command and return its stdout; raises
        subprocess.CalledProcessError (with .output set) on a non-zero
        exit status."""
        assert 'input' not in kwargs
        p = subprocess.Popen(*args, stdout=subprocess.PIPE, **kwargs)
        output, _ = p.communicate()
        ret = p.poll()
        if ret:
            # This branch only runs where check_output is missing; there
            # Popen has no .args attribute and CalledProcessError accepts
            # no output keyword, so recover the command from our own
            # arguments and attach the output afterwards.
            cmd = kwargs.get('args')
            if cmd is None:
                cmd = args[0]
            error = subprocess.CalledProcessError(ret, cmd)
            error.output = output
            raise error
        return output