2 # -*- coding: utf-8 -*-
29 import xml.etree.ElementTree
33 import urllib.request as compat_urllib_request
34 except ImportError: # Python 2
35 import urllib2 as compat_urllib_request
38 import urllib.error as compat_urllib_error
39 except ImportError: # Python 2
40 import urllib2 as compat_urllib_error
43 import urllib.parse as compat_urllib_parse
44 except ImportError: # Python 2
45 import urllib as compat_urllib_parse
48 from urllib.parse import urlparse as compat_urllib_parse_urlparse
49 except ImportError: # Python 2
50 from urlparse import urlparse as compat_urllib_parse_urlparse
53 import urllib.parse as compat_urlparse
54 except ImportError: # Python 2
55 import urlparse as compat_urlparse
58 import http.cookiejar as compat_cookiejar
59 except ImportError: # Python 2
60 import cookielib as compat_cookiejar
63 import html.entities as compat_html_entities
64 except ImportError: # Python 2
65 import htmlentitydefs as compat_html_entities
68 import html.parser as compat_html_parser
69 except ImportError: # Python 2
70 import HTMLParser as compat_html_parser
73 import http.client as compat_http_client
74 except ImportError: # Python 2
75 import httplib as compat_http_client
78 from urllib.error import HTTPError as compat_HTTPError
79 except ImportError: # Python 2
80 from urllib2 import HTTPError as compat_HTTPError
83 from urllib.request import urlretrieve as compat_urlretrieve
84 except ImportError: # Python 2
85 from urllib import urlretrieve as compat_urlretrieve
89 from subprocess import DEVNULL
90 compat_subprocess_get_DEVNULL = lambda: DEVNULL
92 compat_subprocess_get_DEVNULL = lambda: open(os.path.devnull, 'w')
95 from urllib.parse import unquote as compat_urllib_parse_unquote
97 def compat_urllib_parse_unquote(string, encoding='utf-8', errors='replace'):
100 res = string.split('%')
107 # pct_sequence: contiguous sequence of percent-encoded bytes, decoded
114 pct_sequence += item[:2].decode('hex')
117 # This segment was just a single percent-encoded character.
118 # May be part of a sequence of code units, so delay decoding.
119 # (Stored in pct_sequence).
123 # Encountered non-percent-encoded characters. Flush the current
125 string += pct_sequence.decode(encoding, errors) + rest
128 # Flush the final pct_sequence
129 string += pct_sequence.decode(encoding, errors)
134 from urllib.parse import parse_qs as compat_parse_qs
135 except ImportError: # Python 2
136 # HACK: The following is the correct parse_qs implementation from cpython 3's stdlib.
137 # Python 2's version is apparently totally broken
139 def _parse_qsl(qs, keep_blank_values=False, strict_parsing=False,
140 encoding='utf-8', errors='replace'):
141 qs, _coerce_result = qs, unicode
142 pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')]
144 for name_value in pairs:
145 if not name_value and not strict_parsing:
147 nv = name_value.split('=', 1)
150 raise ValueError("bad query field: %r" % (name_value,))
151 # Handle case of a control-name with no equal sign
152 if keep_blank_values:
156 if len(nv[1]) or keep_blank_values:
157 name = nv[0].replace('+', ' ')
158 name = compat_urllib_parse_unquote(
159 name, encoding=encoding, errors=errors)
160 name = _coerce_result(name)
161 value = nv[1].replace('+', ' ')
162 value = compat_urllib_parse_unquote(
163 value, encoding=encoding, errors=errors)
164 value = _coerce_result(value)
165 r.append((name, value))
168 def compat_parse_qs(qs, keep_blank_values=False, strict_parsing=False,
169 encoding='utf-8', errors='replace'):
171 pairs = _parse_qsl(qs, keep_blank_values, strict_parsing,
172 encoding=encoding, errors=errors)
173 for name, value in pairs:
174 if name in parsed_result:
175 parsed_result[name].append(value)
177 parsed_result[name] = [value]
181 compat_str = unicode # Python 2
186 compat_chr = unichr # Python 2
191 from xml.etree.ElementTree import ParseError as compat_xml_parse_error
192 except ImportError: # Python 2.6
193 from xml.parsers.expat import ExpatError as compat_xml_parse_error
196 if type(c) is int: return c
199 # This is not clearly defined otherwise
200 compiled_regex_type = type(re.compile(''))
203 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0 (Chrome)',
204 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
205 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
206 'Accept-Encoding': 'gzip, deflate',
207 'Accept-Language': 'en-us,en;q=0.5',
210 def preferredencoding():
211 """Get preferred encoding.
213 Returns the best encoding scheme for the system, based on
214 locale.getpreferredencoding() and some further tweaks.
217 pref = locale.getpreferredencoding()
224 if sys.version_info < (3,0):
226 print(s.encode(preferredencoding(), 'xmlcharrefreplace'))
229 assert type(s) == type(u'')
233 def write_json_file(obj, fn):
234 """ Encode obj as JSON and write it to fn, atomically """
236 # In Python 2.x, json.dump expects a bytestream.
237 # In Python 3.x, it writes to a character stream
238 if sys.version_info < (3, 0):
244 tf = tempfile.NamedTemporaryFile(
245 suffix='.tmp', prefix=os.path.basename(fn) + '.',
246 dir=os.path.dirname(fn),
252 os.rename(tf.name, fn)
261 if sys.version_info >= (2, 7):
262 def find_xpath_attr(node, xpath, key, val):
263 """ Find the xpath xpath[@key=val] """
264 assert re.match(r'^[a-zA-Z-]+$', key)
265 assert re.match(r'^[a-zA-Z0-9@\s:._-]*$', val)
266 expr = xpath + u"[@%s='%s']" % (key, val)
267 return node.find(expr)
269 def find_xpath_attr(node, xpath, key, val):
270 for f in node.findall(xpath):
271 if f.attrib.get(key) == val:
275 # On python2.6 the xml.etree.ElementTree.Element methods don't support
276 # the namespace parameter
277 def xpath_with_ns(path, ns_map):
278 components = [c.split(':') for c in path.split('/')]
282 replaced.append(c[0])
285 replaced.append('{%s}%s' % (ns_map[ns], tag))
286 return '/'.join(replaced)
288 def htmlentity_transform(matchobj):
289 """Transforms an HTML entity to a character.
291 This function receives a match object and is intended to be used with
292 the re.sub() function.
294 entity = matchobj.group(1)
296 # Known non-numeric HTML entity
297 if entity in compat_html_entities.name2codepoint:
298 return compat_chr(compat_html_entities.name2codepoint[entity])
300 mobj = re.match(u'(?u)#(x?\\d+)', entity)
302 numstr = mobj.group(1)
303 if numstr.startswith(u'x'):
305 numstr = u'0%s' % numstr
308 return compat_chr(int(numstr, base))
310 # Unknown entity in name, return its literal representation
311 return (u'&%s;' % entity)
313 compat_html_parser.locatestarttagend = re.compile(r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re.VERBOSE) # backport bugfix
314 class BaseHTMLParser(compat_html_parser.HTMLParser):
316 compat_html_parser.HTMLParser.__init__(self)
319 def loads(self, html):
324 class AttrParser(BaseHTMLParser):
325 """Modified HTMLParser that isolates a tag with the specified attribute"""
326 def __init__(self, attribute, value):
327 self.attribute = attribute
332 self.watch_startpos = False
334 BaseHTMLParser.__init__(self)
336 def error(self, message):
337 if self.error_count > 10 or self.started:
338 raise compat_html_parser.HTMLParseError(message, self.getpos())
339 self.rawdata = '\n'.join(self.html.split('\n')[self.getpos()[0]:]) # skip one line
340 self.error_count += 1
343 def handle_starttag(self, tag, attrs):
346 self.find_startpos(None)
347 if self.attribute in attrs and attrs[self.attribute] == self.value:
350 self.watch_startpos = True
352 if not tag in self.depth: self.depth[tag] = 0
355 def handle_endtag(self, tag):
357 if tag in self.depth: self.depth[tag] -= 1
358 if self.depth[self.result[0]] == 0:
360 self.result.append(self.getpos())
362 def find_startpos(self, x):
363 """Needed to put the start position of the result (self.result[1])
364 after the opening tag with the requested id"""
365 if self.watch_startpos:
366 self.watch_startpos = False
367 self.result.append(self.getpos())
368 handle_entityref = handle_charref = handle_data = handle_comment = \
369 handle_decl = handle_pi = unknown_decl = find_startpos
371 def get_result(self):
372 if self.result is None:
374 if len(self.result) != 3:
376 lines = self.html.split('\n')
377 lines = lines[self.result[1][0]-1:self.result[2][0]]
378 lines[0] = lines[0][self.result[1][1]:]
380 lines[-1] = lines[-1][:self.result[2][1]-self.result[1][1]]
381 lines[-1] = lines[-1][:self.result[2][1]]
382 return '\n'.join(lines).strip()
383 # Hack for https://github.com/rg3/youtube-dl/issues/662
384 if sys.version_info < (2, 7, 3):
385 AttrParser.parse_endtag = (lambda self, i:
386 i + len("</scr'+'ipt>")
387 if self.rawdata[i:].startswith("</scr'+'ipt>")
388 else compat_html_parser.HTMLParser.parse_endtag(self, i))
def get_element_by_id(id, html):
    """Return the content of the tag with the specified ID in the passed HTML document"""
    # An id lookup is simply an attribute lookup on the "id" attribute.
    attr_name = "id"
    return get_element_by_attribute(attr_name, id, html)
394 def get_element_by_attribute(attribute, value, html):
395 """Return the content of the tag with the specified attribute in the passed HTML document"""
396 parser = AttrParser(attribute, value)
399 except compat_html_parser.HTMLParseError:
401 return parser.get_result()
403 class MetaParser(BaseHTMLParser):
405 Modified HTMLParser that isolates a meta tag with the specified name
408 def __init__(self, name):
409 BaseHTMLParser.__init__(self)
414 def handle_starttag(self, tag, attrs):
418 if attrs.get('name') == self.name:
419 self.result = attrs.get('content')
421 def get_result(self):
424 def get_meta_content(name, html):
426 Return the content attribute from the meta tag with the given name attribute.
428 parser = MetaParser(name)
431 except compat_html_parser.HTMLParseError:
433 return parser.get_result()
436 def clean_html(html):
437 """Clean an HTML snippet into a readable string"""
439 html = html.replace('\n', ' ')
440 html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
441 html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
443 html = re.sub('<.*?>', '', html)
444 # Replace html entities
445 html = unescapeHTML(html)
449 def sanitize_open(filename, open_mode):
450 """Try to open the given filename, and slightly tweak it if this fails.
452 Attempts to open the given filename. If this fails, it tries to change
453 the filename slightly, step by step, until it's either able to open it
454 or it fails and raises a final exception, like the standard open()
457 It returns the tuple (stream, definitive_file_name).
461 if sys.platform == 'win32':
463 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
464 return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
465 stream = open(encodeFilename(filename), open_mode)
466 return (stream, filename)
467 except (IOError, OSError) as err:
468 if err.errno in (errno.EACCES,):
471 # In case of error, try to remove win32 forbidden chars
472 alt_filename = os.path.join(
473 re.sub(u'[/<>:"\\|\\\\?\\*]', u'#', path_part)
474 for path_part in os.path.split(filename)
476 if alt_filename == filename:
479 # An exception here should be caught in the caller
480 stream = open(encodeFilename(filename), open_mode)
481 return (stream, alt_filename)
484 def timeconvert(timestr):
485 """Convert RFC 2822 defined time string into system timestamp"""
487 timetuple = email.utils.parsedate_tz(timestr)
488 if timetuple is not None:
489 timestamp = email.utils.mktime_tz(timetuple)
492 def sanitize_filename(s, restricted=False, is_id=False):
493 """Sanitizes a string so it could be used as part of a filename.
494 If restricted is set, use a stricter subset of allowed characters.
495 Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
497 def replace_insane(char):
498 if char == '?' or ord(char) < 32 or ord(char) == 127:
501 return '' if restricted else '\''
503 return '_-' if restricted else ' -'
504 elif char in '\\/|*<>':
506 if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
508 if restricted and ord(char) > 127:
512 result = u''.join(map(replace_insane, s))
514 while '__' in result:
515 result = result.replace('__', '_')
516 result = result.strip('_')
517 # Common case of "Foreign band name - English song title"
518 if restricted and result.startswith('-_'):
524 def orderedSet(iterable):
525 """ Remove all duplicates from the input iterable """
536 assert type(s) == compat_str
538 result = re.sub(r'(?u)&(.+?);', htmlentity_transform, s)
542 def encodeFilename(s, for_subprocess=False):
544 @param s The name of the file
547 assert type(s) == compat_str
549 # Python 3 has a Unicode API
550 if sys.version_info >= (3, 0):
553 if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
554 # Pass u'' directly to use Unicode APIs on Windows 2000 and up
555 # (Detecting Windows NT 4 is tricky because 'major >= 4' would
556 # match Windows 9x series as well. Besides, NT 4 is obsolete.)
557 if not for_subprocess:
560 # For subprocess calls, encode with locale encoding
561 # Refer to http://stackoverflow.com/a/9951851/35070
562 encoding = preferredencoding()
564 encoding = sys.getfilesystemencoding()
567 return s.encode(encoding, 'ignore')
def encodeArgument(s):
    """Encode *s* for use as a subprocess argument.

    Byte strings handed in by legacy callers are first decoded as ASCII
    (failing loudly on anything else); the result is then encoded exactly
    like a filename with for_subprocess=True.
    """
    if isinstance(s, compat_str):
        text = s
    else:
        # Legacy code that still uses byte strings; decode strictly as
        # ASCII rather than guessing an encoding.
        text = s.decode('ascii')
    return encodeFilename(text, True)
579 def decodeOption(optval):
582 if isinstance(optval, bytes):
583 optval = optval.decode(preferredencoding())
585 assert isinstance(optval, compat_str)
588 def formatSeconds(secs):
590 return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
592 return '%d:%02d' % (secs // 60, secs % 60)
597 def make_HTTPS_handler(opts_no_check_certificate, **kwargs):
598 if sys.version_info < (3, 2):
601 class HTTPSConnectionV3(httplib.HTTPSConnection):
602 def __init__(self, *args, **kwargs):
603 httplib.HTTPSConnection.__init__(self, *args, **kwargs)
606 sock = socket.create_connection((self.host, self.port), self.timeout)
607 if getattr(self, '_tunnel_host', False):
611 self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_SSLv3)
613 self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_SSLv23)
615 class HTTPSHandlerV3(compat_urllib_request.HTTPSHandler):
616 def https_open(self, req):
617 return self.do_open(HTTPSConnectionV3, req)
618 return HTTPSHandlerV3(**kwargs)
620 context = ssl.SSLContext(ssl.PROTOCOL_SSLv3)
621 context.verify_mode = (ssl.CERT_NONE
622 if opts_no_check_certificate
623 else ssl.CERT_REQUIRED)
624 context.set_default_verify_paths()
626 context.load_default_certs()
627 except AttributeError:
629 return compat_urllib_request.HTTPSHandler(context=context, **kwargs)
631 class ExtractorError(Exception):
632 """Error during info extraction."""
633 def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
634 """ tb, if given, is the original traceback (so that it can be printed out).
635 If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
638 if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
640 if video_id is not None:
641 msg = video_id + ': ' + msg
643 msg = msg + u'; please report this issue on https://yt-dl.org/bug . Be sure to call youtube-dl with the --verbose flag and include its complete output. Make sure you are using the latest version; type youtube-dl -U to update.'
644 super(ExtractorError, self).__init__(msg)
647 self.exc_info = sys.exc_info() # preserve original exception
649 self.video_id = video_id
651 def format_traceback(self):
652 if self.traceback is None:
654 return u''.join(traceback.format_tb(self.traceback))
657 class RegexNotFoundError(ExtractorError):
658 """Error when a regex didn't match"""
662 class DownloadError(Exception):
663 """Download Error exception.
665 This exception may be thrown by FileDownloader objects if they are not
666 configured to continue on errors. They will contain the appropriate
669 def __init__(self, msg, exc_info=None):
670 """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
671 super(DownloadError, self).__init__(msg)
672 self.exc_info = exc_info
675 class SameFileError(Exception):
676 """Same File exception.
678 This exception will be thrown by FileDownloader objects if they detect
679 multiple files would have to be downloaded to the same file on disk.
684 class PostProcessingError(Exception):
685 """Post Processing exception.
687 This exception may be raised by PostProcessor's .run() method to
688 indicate an error in the postprocessing task.
690 def __init__(self, msg):
693 class MaxDownloadsReached(Exception):
694 """ --max-downloads limit has been reached. """
698 class UnavailableVideoError(Exception):
699 """Unavailable Format exception.
701 This exception will be thrown when a video is requested
702 in a format that is not available for that video.
707 class ContentTooShortError(Exception):
708 """Content Too Short exception.
710 This exception may be raised by FileDownloader objects when a file they
711 download is too small for what the server announced first, indicating
712 the connection was probably interrupted.
718 def __init__(self, downloaded, expected):
719 self.downloaded = downloaded
720 self.expected = expected
722 class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
723 """Handler for HTTP requests and responses.
725 This class, when installed with an OpenerDirector, automatically adds
726 the standard headers to every HTTP request and handles gzipped and
727 deflated responses from web servers. If compression is to be avoided in
728 a particular request, the original request in the program code only has
729 to include the HTTP header "Youtubedl-No-Compression", which will be
730 removed before making the real request.
732 Part of this code was copied from:
734 http://techknack.net/python-urllib2-handlers/
736 Andrew Rowls, the author of that code, agreed to release it to the
743 return zlib.decompress(data, -zlib.MAX_WBITS)
745 return zlib.decompress(data)
748 def addinfourl_wrapper(stream, headers, url, code):
749 if hasattr(compat_urllib_request.addinfourl, 'getcode'):
750 return compat_urllib_request.addinfourl(stream, headers, url, code)
751 ret = compat_urllib_request.addinfourl(stream, headers, url)
755 def http_request(self, req):
756 for h,v in std_headers.items():
760 if 'Youtubedl-no-compression' in req.headers:
761 if 'Accept-encoding' in req.headers:
762 del req.headers['Accept-encoding']
763 del req.headers['Youtubedl-no-compression']
764 if 'Youtubedl-user-agent' in req.headers:
765 if 'User-agent' in req.headers:
766 del req.headers['User-agent']
767 req.headers['User-agent'] = req.headers['Youtubedl-user-agent']
768 del req.headers['Youtubedl-user-agent']
771 def http_response(self, req, resp):
774 if resp.headers.get('Content-encoding', '') == 'gzip':
775 content = resp.read()
776 gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
778 uncompressed = io.BytesIO(gz.read())
779 except IOError as original_ioerror:
780 # There may be junk add the end of the file
781 # See http://stackoverflow.com/q/4928560/35070 for details
782 for i in range(1, 1024):
784 gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
785 uncompressed = io.BytesIO(gz.read())
790 raise original_ioerror
791 resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
792 resp.msg = old_resp.msg
794 if resp.headers.get('Content-encoding', '') == 'deflate':
795 gz = io.BytesIO(self.deflate(resp.read()))
796 resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
797 resp.msg = old_resp.msg
800 https_request = http_request
801 https_response = http_response
804 def parse_iso8601(date_str, delimiter='T'):
805 """ Return a UNIX timestamp from the given date """
811 r'Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$',
814 timezone = datetime.timedelta()
816 date_str = date_str[:-len(m.group(0))]
817 if not m.group('sign'):
818 timezone = datetime.timedelta()
820 sign = 1 if m.group('sign') == '+' else -1
821 timezone = datetime.timedelta(
822 hours=sign * int(m.group('hours')),
823 minutes=sign * int(m.group('minutes')))
824 date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
825 dt = datetime.datetime.strptime(date_str, date_format) - timezone
826 return calendar.timegm(dt.timetuple())
829 def unified_strdate(date_str):
830 """Return a string with the date in the format YYYYMMDD"""
837 date_str = date_str.replace(',', ' ')
838 # %z (UTC offset) is only supported in python>=3.2
839 date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str)
840 format_expressions = [
845 '%b %dst %Y %I:%M%p',
846 '%b %dnd %Y %I:%M%p',
847 '%b %dth %Y %I:%M%p',
856 '%Y-%m-%dT%H:%M:%SZ',
857 '%Y-%m-%dT%H:%M:%S.%fZ',
858 '%Y-%m-%dT%H:%M:%S.%f0Z',
860 '%Y-%m-%dT%H:%M:%S.%f',
863 for expression in format_expressions:
865 upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
868 if upload_date is None:
869 timetuple = email.utils.parsedate_tz(date_str)
871 upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
874 def determine_ext(url, default_ext=u'unknown_video'):
877 guess = url.partition(u'?')[0].rpartition(u'.')[2]
878 if re.match(r'^[A-Za-z0-9]+$', guess):
def subtitles_filename(filename, sub_lang, sub_format):
    """Build the subtitle file name: <base>.<lang>.<format>."""
    base = filename.rsplit('.', 1)[0]
    return u'.'.join((base, sub_lang, sub_format))
886 def date_from_str(date_str):
888 Return a datetime object from a string in the format YYYYMMDD or
889 (now|today)[+-][0-9](day|week|month|year)(s)?"""
890 today = datetime.date.today()
891 if date_str == 'now'or date_str == 'today':
893 match = re.match('(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
894 if match is not None:
895 sign = match.group('sign')
896 time = int(match.group('time'))
899 unit = match.group('unit')
908 delta = datetime.timedelta(**{unit: time})
910 return datetime.datetime.strptime(date_str, "%Y%m%d").date()
912 def hyphenate_date(date_str):
914 Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
915 match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
916 if match is not None:
917 return '-'.join(match.groups())
921 class DateRange(object):
922 """Represents a time interval between two dates"""
923 def __init__(self, start=None, end=None):
924 """start and end must be strings in the format accepted by date"""
925 if start is not None:
926 self.start = date_from_str(start)
928 self.start = datetime.datetime.min.date()
930 self.end = date_from_str(end)
932 self.end = datetime.datetime.max.date()
933 if self.start > self.end:
934 raise ValueError('Date range: "%s" , the start date must be before the end date' % self)
937 """Returns a range that only contains the given day"""
939 def __contains__(self, date):
940 """Check if the date is in the range"""
941 if not isinstance(date, datetime.date):
942 date = date_from_str(date)
943 return self.start <= date <= self.end
945 return '%s - %s' % ( self.start.isoformat(), self.end.isoformat())
949 """ Returns the platform name as a compat_str """
950 res = platform.platform()
951 if isinstance(res, bytes):
952 res = res.decode(preferredencoding())
954 assert isinstance(res, compat_str)
958 def _windows_write_string(s, out):
959 """ Returns True if the string was written using special methods,
960 False if it has yet to be written out."""
961 # Adapted from http://stackoverflow.com/a/3259271/35070
964 import ctypes.wintypes
972 fileno = out.fileno()
973 except AttributeError:
974 # If the output stream doesn't have a fileno, it's virtual
976 if fileno not in WIN_OUTPUT_IDS:
979 GetStdHandle = ctypes.WINFUNCTYPE(
980 ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
981 ("GetStdHandle", ctypes.windll.kernel32))
982 h = GetStdHandle(WIN_OUTPUT_IDS[fileno])
984 WriteConsoleW = ctypes.WINFUNCTYPE(
985 ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
986 ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
987 ctypes.wintypes.LPVOID)(("WriteConsoleW", ctypes.windll.kernel32))
988 written = ctypes.wintypes.DWORD(0)
990 GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)(("GetFileType", ctypes.windll.kernel32))
991 FILE_TYPE_CHAR = 0x0002
992 FILE_TYPE_REMOTE = 0x8000
993 GetConsoleMode = ctypes.WINFUNCTYPE(
994 ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
995 ctypes.POINTER(ctypes.wintypes.DWORD))(
996 ("GetConsoleMode", ctypes.windll.kernel32))
997 INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value
999 def not_a_console(handle):
1000 if handle == INVALID_HANDLE_VALUE or handle is None:
1002 return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR
1003 or GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)
1005 if not_a_console(h):
1008 def next_nonbmp_pos(s):
1010 return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
1011 except StopIteration:
1015 count = min(next_nonbmp_pos(s), 1024)
1017 ret = WriteConsoleW(
1018 h, s, count if count else 2, ctypes.byref(written), None)
1020 raise OSError('Failed to write string')
1021 if not count: # We just wrote a non-BMP character
1022 assert written.value == 2
1025 assert written.value > 0
1026 s = s[written.value:]
1030 def write_string(s, out=None, encoding=None):
1033 assert type(s) == compat_str
1035 if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
1036 if _windows_write_string(s, out):
1039 if ('b' in getattr(out, 'mode', '') or
1040 sys.version_info[0] < 3): # Python 2 lies about mode of sys.stderr
1041 byt = s.encode(encoding or preferredencoding(), 'ignore')
1043 elif hasattr(out, 'buffer'):
1044 enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
1045 byt = s.encode(enc, 'ignore')
1046 out.buffer.write(byt)
1052 def bytes_to_intlist(bs):
1055 if isinstance(bs[0], int): # Python 3
1058 return [ord(c) for c in bs]
1061 def intlist_to_bytes(xs):
1064 if isinstance(chr(0), bytes): # Python 2
1065 return ''.join([chr(x) for x in xs])
def get_cachedir(params=None):
    """Return the cache directory youtube-dl should use.

    An explicit 'cachedir' entry in *params* wins; otherwise fall back to
    $XDG_CACHE_HOME/youtube-dl (or ~/.cache/youtube-dl when the variable
    is unset).

    @param params  optional options dict; only the 'cachedir' key is read
    """
    # The old signature used a mutable default (params={}); use None to
    # avoid the shared-mutable-default pitfall while keeping callers
    # that pass no argument working unchanged.
    if params is None:
        params = {}
    cache_root = os.environ.get('XDG_CACHE_HOME',
                                os.path.expanduser('~/.cache'))
    return params.get('cachedir', os.path.join(cache_root, 'youtube-dl'))
1076 # Cross-platform file locking
1077 if sys.platform == 'win32':
1078 import ctypes.wintypes
1081 class OVERLAPPED(ctypes.Structure):
1083 ('Internal', ctypes.wintypes.LPVOID),
1084 ('InternalHigh', ctypes.wintypes.LPVOID),
1085 ('Offset', ctypes.wintypes.DWORD),
1086 ('OffsetHigh', ctypes.wintypes.DWORD),
1087 ('hEvent', ctypes.wintypes.HANDLE),
1090 kernel32 = ctypes.windll.kernel32
1091 LockFileEx = kernel32.LockFileEx
1092 LockFileEx.argtypes = [
1093 ctypes.wintypes.HANDLE, # hFile
1094 ctypes.wintypes.DWORD, # dwFlags
1095 ctypes.wintypes.DWORD, # dwReserved
1096 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
1097 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
1098 ctypes.POINTER(OVERLAPPED) # Overlapped
1100 LockFileEx.restype = ctypes.wintypes.BOOL
1101 UnlockFileEx = kernel32.UnlockFileEx
1102 UnlockFileEx.argtypes = [
1103 ctypes.wintypes.HANDLE, # hFile
1104 ctypes.wintypes.DWORD, # dwReserved
1105 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
1106 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
1107 ctypes.POINTER(OVERLAPPED) # Overlapped
1109 UnlockFileEx.restype = ctypes.wintypes.BOOL
1110 whole_low = 0xffffffff
1111 whole_high = 0x7fffffff
def _lock_file(f, exclusive):
    """Acquire a whole-file lock on *f* via LockFileEx (Windows)."""
    ov = OVERLAPPED()
    ov.Offset = 0
    ov.OffsetHigh = 0
    ov.hEvent = 0
    # Keep the OVERLAPPED pointer alive on the file object; _unlock_file
    # reuses it when releasing the lock.
    f._lock_file_overlapped_p = ctypes.pointer(ov)
    handle = msvcrt.get_osfhandle(f.fileno())
    flags = 0x2 if exclusive else 0x0  # 0x2 = exclusive-lock flag
    ok = LockFileEx(handle, flags, 0,
                    whole_low, whole_high, f._lock_file_overlapped_p)
    if not ok:
        raise OSError('Locking file failed: %r' % ctypes.FormatError())
def _unlock_file(f):
    """Release the whole-file lock previously taken by _lock_file (Windows)."""
    # _lock_file must have stashed the OVERLAPPED pointer on the object.
    assert f._lock_file_overlapped_p
    handle = msvcrt.get_osfhandle(f.fileno())
    ok = UnlockFileEx(handle, 0, whole_low, whole_high,
                      f._lock_file_overlapped_p)
    if not ok:
        raise OSError('Unlocking file failed: %r' % ctypes.FormatError())
1134 def _lock_file(f, exclusive):
1135 fcntl.lockf(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)
1137 def _unlock_file(f):
1138 fcntl.lockf(f, fcntl.LOCK_UN)
1141 class locked_file(object):
1142 def __init__(self, filename, mode, encoding=None):
1143 assert mode in ['r', 'a', 'w']
1144 self.f = io.open(filename, mode, encoding=encoding)
1147 def __enter__(self):
1148 exclusive = self.mode != 'r'
1150 _lock_file(self.f, exclusive)
1156 def __exit__(self, etype, value, traceback):
1158 _unlock_file(self.f)
1165 def write(self, *args):
1166 return self.f.write(*args)
1168 def read(self, *args):
1169 return self.f.read(*args)
1172 def shell_quote(args):
1174 encoding = sys.getfilesystemencoding()
1175 if encoding is None:
1178 if isinstance(a, bytes):
1179 # We may get a filename encoded with 'encodeFilename'
1180 a = a.decode(encoding)
1181 quoted_args.append(pipes.quote(a))
1182 return u' '.join(quoted_args)
1185 def takewhile_inclusive(pred, seq):
1186 """ Like itertools.takewhile, but include the latest evaluated element
1187 (the first element so that Not pred(e)) """
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use. """
    # Serialize the payload as JSON and hide it in the URL fragment so
    # ordinary URL handling is unaffected.
    payload = compat_urllib_parse.urlencode(
        {u'__youtubedl_smuggle': json.dumps(data)})
    return u'#'.join([url, payload])
1202 def unsmuggle_url(smug_url, default=None):
1203 if not '#__youtubedl_smuggle' in smug_url:
1204 return smug_url, default
1205 url, _, sdata = smug_url.rpartition(u'#')
1206 jsond = compat_parse_qs(sdata)[u'__youtubedl_smuggle'][0]
1207 data = json.loads(jsond)
1211 def format_bytes(bytes):
1214 if type(bytes) is str:
1215 bytes = float(bytes)
1219 exponent = int(math.log(bytes, 1024.0))
1220 suffix = [u'B', u'KiB', u'MiB', u'GiB', u'TiB', u'PiB', u'EiB', u'ZiB', u'YiB'][exponent]
1221 converted = float(bytes) / float(1024 ** exponent)
1222 return u'%.2f%s' % (converted, suffix)
1225 def get_term_width():
1226 columns = os.environ.get('COLUMNS', None)
1231 sp = subprocess.Popen(
1233 stdout=subprocess.PIPE, stderr=subprocess.PIPE)
1234 out, err = sp.communicate()
1235 return int(out.split()[1])
1241 def month_by_name(name):
1242 """ Return the number of a month by (locale-independently) English name """
1245 u'January', u'February', u'March', u'April', u'May', u'June',
1246 u'July', u'August', u'September', u'October', u'November', u'December']
1248 return ENGLISH_NAMES.index(name) + 1
1253 def fix_xml_ampersands(xml_str):
1254 """Replace all the '&' by '&' in XML"""
1256 r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
1261 def setproctitle(title):
1262 assert isinstance(title, compat_str)
1264 libc = ctypes.cdll.LoadLibrary("libc.so.6")
1267 title_bytes = title.encode('utf-8')
1268 buf = ctypes.create_string_buffer(len(title_bytes))
1269 buf.value = title_bytes
1271 libc.prctl(15, buf, 0, 0, 0)
1272 except AttributeError:
1273 return # Strange libc, just skip this
1276 def remove_start(s, start):
1277 if s.startswith(start):
1278 return s[len(start):]
def url_basename(url):
    """Return the last path component of *url* (query/fragment excluded)."""
    parsed_path = compat_urlparse.urlparse(url).path
    segments = parsed_path.strip(u'/').split(u'/')
    return segments[-1]
1287 class HEADRequest(compat_urllib_request.Request):
1288 def get_method(self):
1292 def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
1295 v = getattr(v, get_attr, None)
1298 return default if v is None else (int(v) * invscale // scale)
def str_or_none(v, default=None):
    """Coerce *v* to compat_str, or return *default* when v is None."""
    if v is None:
        return default
    return compat_str(v)
1305 def str_to_int(int_str):
1308 int_str = re.sub(r'[,\.]', u'', int_str)
def float_or_none(v, scale=1, invscale=1, default=None):
    """Convert *v* to float rescaled by invscale/scale; *default* if v is None."""
    if v is None:
        return default
    return float(v) * invscale / scale
# Parse a duration string like '12:34', '1:02:03' or '3m45s' into a
# number of seconds.  The None guard, the re.match call head and the
# final 'return res' are outside this excerpt.
1316 def parse_duration(s):
# hours and minutes are optional; an optional trailing ':NN' (e.g. a
# frame count) is matched but discarded.
1321 r'(?:(?:(?P<hours>[0-9]+)[:h])?(?P<mins>[0-9]+)[:m])?(?P<secs>[0-9]+)s?(?::[0-9]+)?$', s)
1324 res = int(m.group('secs'))
1326 res += int(m.group('mins')) * 60
1327 if m.group('hours'):
1328 res += int(m.group('hours')) * 60 * 60
def prepend_extension(filename, ext):
    """Insert *ext* in front of the file's real extension.

    Example: prepend_extension(u'clip.mp4', u'temp') -> u'clip.temp.mp4'.
    A filename without an extension simply gets '.ext' appended.
    """
    root, real_ext = os.path.splitext(filename)
    return u'{0}.{1}{2}'.format(root, ext, real_ext)
1337 def check_executable(exe, args=[]):
1338 """ Checks if the given binary is installed somewhere in PATH, and returns its name.
1339 args can be a list of arguments for a short output (like -version) """
# NOTE(review): mutable default for *args* -- harmless as long as it is
# never mutated, but worth flagging.  The enclosing try and its OSError
# handler (returning False / the name on success) are outside this
# excerpt.
1341 subprocess.Popen([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
# Lazy list over a paged data source: pagefunc(pagenum) yields the items
# of one page, pagesize items per full page.  Several lines of this
# class (the __len__ def line, 'continue' statements, the startv/endv
# assignment heads and the final 'return res') are outside this excerpt.
1347 class PagedList(object):
1348 def __init__(self, pagefunc, pagesize):
# Callable returning the contents of page *pagenum*.
1349 self._pagefunc = pagefunc
1350 self._pagesize = pagesize
1353 # This is only useful for tests
1354 return len(self.getslice())
# Materialize items [start, end) by fetching only the pages that
# overlap the requested range.
1356 def getslice(self, start=0, end=None):
# Start iterating from the page containing *start*.
1358 for pagenum in itertools.count(start // self._pagesize):
1359 firstid = pagenum * self._pagesize
1360 nextfirstid = pagenum * self._pagesize + self._pagesize
# Page lies wholly before the requested range (the skip statement is
# not visible here).
1361 if start >= nextfirstid:
1364 page_results = list(self._pagefunc(pagenum))
# Offset of *start* within this page; 0 for all later pages.
1367 start % self._pagesize
1368 if firstid <= start < nextfirstid
# Offset just past *end* within this page; None when end lies beyond it.
1372 ((end - 1) % self._pagesize) + 1
1373 if (end is not None and firstid <= end <= nextfirstid)
1376 if startv != 0 or endv is not None:
1377 page_results = page_results[startv:endv]
1378 res.extend(page_results)
1380 # A little optimization - if current page is not "full", ie. does
1381 # not contain page_size videos then we can assume that this page
1382 # is the last one - there are no more ids on further pages -
1383 # i.e. no need to query again.
1384 if len(page_results) + startv < self._pagesize:
1387 # If we got the whole page, but the next page is not interesting,
1388 # break out early as well
1389 if end == nextfirstid:
# Decode uppercase \UXXXXXXXX escape sequences embedded in *s*.  The
# 'return re.sub(' head and the closing ', s)' of the call are outside
# this excerpt.
1394 def uppercase_escape(s):
1395 unicode_escape = codecs.getdecoder('unicode_escape')
1397 r'\\U[0-9a-fA-F]{8}',
# getdecoder returns (decoded_text, consumed_length); [0] keeps the text.
1398 lambda m: unicode_escape(m.group(0))[0],
# Compatibility shims for struct: probe once whether struct.pack accepts
# a text (unicode) format spec.  The enclosing try, its 'except
# TypeError:' and the 'else:' arm are outside this excerpt.
1402 struct.pack(u'!I', 0)
1404 # In Python 2.6 (and some 2.7 versions), struct requires a bytes argument
# Wrapper that re-encodes a text spec to ASCII bytes before packing.
1405 def struct_pack(spec, *args):
1406 if isinstance(spec, compat_str):
1407 spec = spec.encode('ascii')
1408 return struct.pack(spec, *args)
# Same treatment for the unpack direction.
1410 def struct_unpack(spec, *args):
1411 if isinstance(spec, compat_str):
1412 spec = spec.encode('ascii')
1413 return struct.unpack(spec, *args)
# Modern interpreters accept text specs directly -- use struct as-is.
1415 struct_pack = struct.pack
1416 struct_unpack = struct.unpack
# Read URLs from a batch file object, normalizing each line and skipping
# comments.  The inner 'def fixup(url):' header and its 'return False' /
# 'return url' lines are outside this excerpt.
1419 def read_batch_urls(batch_fd):
# Py2 file objects yield bytes; decode leniently to text.
1421 if not isinstance(url, compat_str):
1422 url = url.decode('utf-8', 'replace')
# NOTE(review): these are the UTF-8 BOM *bytes* as code points
# U+00EF U+00BB U+00BF (a BOM surviving a latin-1-style decode), not
# u'\ufeff' -- looks intentional upstream, but worth confirming.
1423 BOM_UTF8 = u'\xef\xbb\xbf'
1424 if url.startswith(BOM_UTF8):
1425 url = url[len(BOM_UTF8):]
# Lines starting with a comment marker are dropped.
1427 if url.startswith(('#', ';', ']')):
# closing() guarantees the caller's file object is closed even on error.
1431 with contextlib.closing(batch_fd) as fd:
# Falsy fixup results (False / empty strings) are filtered out.
1432 return [url for url in map(fixup, fd) if url]
def urlencode_postdata(*args, **kargs):
    """URL-encode form data and return it as ASCII bytes.

    urllib request bodies must be bytes rather than text, so the result
    of compat_urllib_parse.urlencode is encoded before being returned.
    """
    encoded = compat_urllib_parse.urlencode(*args, **kargs)
    return encoded.encode('ascii')
# Body of parse_xml (its 'def parse_xml(s):' header at line 1439 is
# outside this excerpt): parse an XML string while ignoring doctypes.
1440 class TreeBuilder(xml.etree.ElementTree.TreeBuilder):
# Swallow doctype declarations instead of processing them.
1441 def doctype(self, name, pubid, system):
1442 pass # Ignore doctypes
1444 parser = xml.etree.ElementTree.XMLParser(target=TreeBuilder())
# The 'parser' keyword of ET.XML is only passed on Python >= 2.7; older
# interpreters fall back to the default parser.
1445 kwargs = {'parser': parser} if sys.version_info >= (2, 7) else {}
1446 return xml.etree.ElementTree.XML(s.encode('utf-8'), **kwargs)
# On Python 2 under Windows, getpass.getpass cannot handle unicode
# prompts, so encode the prompt first; on every other platform/version
# (the 'else:' line is outside this excerpt) the stdlib function is
# used unchanged.
1449 if sys.version_info < (3, 0) and sys.platform == 'win32':
1450 def compat_getpass(prompt, *args, **kwargs):
1451 if isinstance(prompt, compat_str):
1452 prompt = prompt.encode(preferredencoding())
1453 return getpass.getpass(prompt, *args, **kwargs)
1455 compat_getpass = getpass.getpass
def strip_jsonp(code):
    """Strip a JSONP callback wrapper: u'cb({...});' -> u'{...}'.

    Input without a recognizable wrapper is returned unchanged.
    """
    jsonp_wrapper = r'(?s)^[a-zA-Z0-9_]+\s*\(\s*(.*)\);?\s*?\s*$'
    return re.sub(jsonp_wrapper, r'\1', code)
# Build a quality-ranking function.  The inner 'def q(qid):' header and
# the final 'return q' are outside this excerpt; a higher index in
# *quality_ids* means better quality.
1471 def qualities(quality_ids):
1472 """ Get a numeric quality value out of a list of possible values """
# list.index raises ValueError for an id not in the list.
1475 return quality_ids.index(qid)
# Default output filename template: video title, then id, then extension.
1481 DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
# Backport of subprocess.check_output for Python 2.6 (where it does not
# exist).  The enclosing 'try:' line, the 'ret = p.returncode' /
# 'if ret:' pair and the final 'return output' are outside this excerpt.
1484 subprocess_check_output = subprocess.check_output
1485 except AttributeError:
1486 def subprocess_check_output(*args, **kwargs):
# The 'input' kwarg of the real check_output is not supported here.
1487 assert 'input' not in kwargs
1488 p = subprocess.Popen(*args, stdout=subprocess.PIPE, **kwargs)
1489 output, _ = p.communicate()
# NOTE(review): p.args only exists on Python 3 Popen; on the 2.6 path
# this shim targets, the line below would raise AttributeError --
# confirm against upstream (later fixed there).
1492 raise subprocess.CalledProcessError(ret, p.args, output=output)