youtube_dl/utils.py

   1 #!/usr/bin/env python
   2 # -*- coding: utf-8 -*-
   3
   4 import calendar
   5 import codecs
   6 import contextlib
   7 import ctypes
   8 import datetime
   9 import email.utils
  10 import errno
  11 import getpass
  12 import gzip
  13 import itertools
  14 import io
  15 import json
  16 import locale
  17 import math
  18 import os
  19 import pipes
  20 import platform
  21 import re
  22 import ssl
  23 import socket
  24 import struct
  25 import subprocess
  26 import sys
  27 import traceback
  28 import xml.etree.ElementTree
  29 import zlib
  30
  31 try:
  32     import urllib.request as compat_urllib_request
  33 except ImportError: # Python 2
  34     import urllib2 as compat_urllib_request
  35
  36 try:
  37     import urllib.error as compat_urllib_error
  38 except ImportError: # Python 2
  39     import urllib2 as compat_urllib_error
  40
  41 try:
  42     import urllib.parse as compat_urllib_parse
  43 except ImportError: # Python 2
  44     import urllib as compat_urllib_parse
  45
  46 try:
  47     from urllib.parse import urlparse as compat_urllib_parse_urlparse
  48 except ImportError: # Python 2
  49     from urlparse import urlparse as compat_urllib_parse_urlparse
  50
  51 try:
  52     import urllib.parse as compat_urlparse
  53 except ImportError: # Python 2
  54     import urlparse as compat_urlparse
  55
  56 try:
  57     import http.cookiejar as compat_cookiejar
  58 except ImportError: # Python 2
  59     import cookielib as compat_cookiejar
  60
  61 try:
  62     import html.entities as compat_html_entities
  63 except ImportError: # Python 2
  64     import htmlentitydefs as compat_html_entities
  65
  66 try:
  67     import html.parser as compat_html_parser
  68 except ImportError: # Python 2
  69     import HTMLParser as compat_html_parser
  70
  71 try:
  72     import http.client as compat_http_client
  73 except ImportError: # Python 2
  74     import httplib as compat_http_client
  75
  76 try:
  77     from urllib.error import HTTPError as compat_HTTPError
  78 except ImportError:  # Python 2
  79     from urllib2 import HTTPError as compat_HTTPError
  80
  81 try:
  82     from urllib.request import urlretrieve as compat_urlretrieve
  83 except ImportError:  # Python 2
  84     from urllib import urlretrieve as compat_urlretrieve
  85
  86
  87 try:
  88     from subprocess import DEVNULL
  89     compat_subprocess_get_DEVNULL = lambda: DEVNULL
  90 except ImportError:
  91     compat_subprocess_get_DEVNULL = lambda: open(os.path.devnull, 'w')
  92
try:
    from urllib.parse import parse_qs as compat_parse_qs
except ImportError: # Python 2
    # HACK: The following is the correct parse_qs implementation from cpython 3's stdlib.
    # Python 2's version is apparently totally broken
    def _unquote(string, encoding='utf-8', errors='replace'):
        """Replace %xx escapes in string with the characters they encode.

        Consecutive escapes are collected as raw bytes and decoded together
        with encoding/errors (None selects 'utf-8' / 'replace').
        """
        if string == '':
            return string
        res = string.split('%')
        if len(res) == 1:
            # No percent sign at all, nothing to decode
            return string
        if encoding is None:
            encoding = 'utf-8'
        if errors is None:
            errors = 'replace'
        # pct_sequence: contiguous sequence of percent-encoded bytes, decoded
        pct_sequence = b''
        string = res[0]
        for item in res[1:]:
            try:
                if not item:
                    raise ValueError
                # The 'hex' codec only exists for Python 2 byte strings, which
                # is the only interpreter this fallback is defined on
                pct_sequence += item[:2].decode('hex')
                rest = item[2:]
                if not rest:
                    # This segment was just a single percent-encoded character.
                    # May be part of a sequence of code units, so delay decoding.
                    # (Stored in pct_sequence).
                    continue
            except ValueError:
                # Not a valid escape: keep the percent sign literally
                rest = '%' + item
            # Encountered non-percent-encoded characters. Flush the current
            # pct_sequence.
            string += pct_sequence.decode(encoding, errors) + rest
            pct_sequence = b''
        if pct_sequence:
            # Flush the final pct_sequence
            string += pct_sequence.decode(encoding, errors)
        return string

    def _parse_qsl(qs, keep_blank_values=False, strict_parsing=False,
                encoding='utf-8', errors='replace'):
        """Parse a query string into a list of (name, value) pairs.

        Fields are separated by '&' or ';'. Fields with an empty value are
        dropped unless keep_blank_values is set; with strict_parsing a field
        without '=' raises ValueError.
        """
        qs, _coerce_result = qs, unicode
        pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')]
        r = []
        for name_value in pairs:
            if not name_value and not strict_parsing:
                continue
            nv = name_value.split('=', 1)
            if len(nv) != 2:
                if strict_parsing:
                    raise ValueError("bad query field: %r" % (name_value,))
                # Handle case of a control-name with no equal sign
                if keep_blank_values:
                    nv.append('')
                else:
                    continue
            if len(nv[1]) or keep_blank_values:
                # '+' stands for a space in query strings; translate it before
                # percent-decoding
                name = nv[0].replace('+', ' ')
                name = _unquote(name, encoding=encoding, errors=errors)
                name = _coerce_result(name)
                value = nv[1].replace('+', ' ')
                value = _unquote(value, encoding=encoding, errors=errors)
                value = _coerce_result(value)
                r.append((name, value))
        return r

    def compat_parse_qs(qs, keep_blank_values=False, strict_parsing=False,
                encoding='utf-8', errors='replace'):
        """Parse a query string into a dict mapping each name to the list of
        its values, in order of appearance."""
        parsed_result = {}
        pairs = _parse_qsl(qs, keep_blank_values, strict_parsing,
                        encoding=encoding, errors=errors)
        for name, value in pairs:
            if name in parsed_result:
                parsed_result[name].append(value)
            else:
                parsed_result[name] = [value]
        return parsed_result
 171
# Text string type: `unicode` where that builtin exists, `str` otherwise
try:
    compat_str = unicode # Python 2
except NameError:
    compat_str = str

# Code point -> one-character text string: `unichr` where it exists, else `chr`
try:
    compat_chr = unichr # Python 2
except NameError:
    compat_chr = chr

# Exception class for XML parse failures; falls back to expat's error class
# when ElementTree does not export ParseError
try:
    from xml.etree.ElementTree import ParseError as compat_xml_parse_error
except ImportError:  # Python 2.6
    from xml.parsers.expat import ExpatError as compat_xml_parse_error
 186
 187 def compat_ord(c):
 188     if type(c) is int: return c
 189     else: return ord(c)
 190
# This is not clearly defined otherwise
# (the type of compiled pattern objects, obtained by compiling an empty regex)
compiled_regex_type = type(re.compile(''))

# Default request headers; YoutubeDLHandler.http_request adds every entry
# to each outgoing request.
std_headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0 (Chrome)',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
}
 201
 202 def preferredencoding():
 203     """Get preferred encoding.
 204
 205     Returns the best encoding scheme for the system, based on
 206     locale.getpreferredencoding() and some further tweaks.
 207     """
 208     try:
 209         pref = locale.getpreferredencoding()
 210         u'TEST'.encode(pref)
 211     except:
 212         pref = 'UTF-8'
 213
 214     return pref
 215
 216 if sys.version_info < (3,0):
 217     def compat_print(s):
 218         print(s.encode(preferredencoding(), 'xmlcharrefreplace'))
 219 else:
 220     def compat_print(s):
 221         assert type(s) == type(u'')
 222         print(s)
 223
 224 # In Python 2.x, json.dump expects a bytestream.
 225 # In Python 3.x, it writes to a character stream
 226 if sys.version_info < (3,0):
 227     def write_json_file(obj, fn):
 228         with open(fn, 'wb') as f:
 229             json.dump(obj, f)
 230 else:
 231     def write_json_file(obj, fn):
 232         with open(fn, 'w', encoding='utf-8') as f:
 233             json.dump(obj, f)
 234
 235 if sys.version_info >= (2,7):
 236     def find_xpath_attr(node, xpath, key, val):
 237         """ Find the xpath xpath[@key=val] """
 238         assert re.match(r'^[a-zA-Z]+$', key)
 239         assert re.match(r'^[a-zA-Z0-9@\s:._]*$', val)
 240         expr = xpath + u"[@%s='%s']" % (key, val)
 241         return node.find(expr)
 242 else:
 243     def find_xpath_attr(node, xpath, key, val):
 244         for f in node.findall(xpath):
 245             if f.attrib.get(key) == val:
 246                 return f
 247         return None
 248
 249 # On python2.6 the xml.etree.ElementTree.Element methods don't support
 250 # the namespace parameter
 251 def xpath_with_ns(path, ns_map):
 252     components = [c.split(':') for c in path.split('/')]
 253     replaced = []
 254     for c in components:
 255         if len(c) == 1:
 256             replaced.append(c[0])
 257         else:
 258             ns, tag = c
 259             replaced.append('{%s}%s' % (ns_map[ns], tag))
 260     return '/'.join(replaced)
 261
 262 def htmlentity_transform(matchobj):
 263     """Transforms an HTML entity to a character.
 264
 265     This function receives a match object and is intended to be used with
 266     the re.sub() function.
 267     """
 268     entity = matchobj.group(1)
 269
 270     # Known non-numeric HTML entity
 271     if entity in compat_html_entities.name2codepoint:
 272         return compat_chr(compat_html_entities.name2codepoint[entity])
 273
 274     mobj = re.match(u'(?u)#(x?\\d+)', entity)
 275     if mobj is not None:
 276         numstr = mobj.group(1)
 277         if numstr.startswith(u'x'):
 278             base = 16
 279             numstr = u'0%s' % numstr
 280         else:
 281             base = 10
 282         return compat_chr(int(numstr, base))
 283
 284     # Unknown entity in name, return its literal representation
 285     return (u'&%s;' % entity)
 286
# Overwrite the parser module's `locatestarttagend` pattern at import time;
# the trailing note marks the replacement regex as a backported bugfix.
compat_html_parser.locatestarttagend = re.compile(r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re.VERBOSE) # backport bugfix
 288 class BaseHTMLParser(compat_html_parser.HTMLParser):
 289     def __init(self):
 290         compat_html_parser.HTMLParser.__init__(self)
 291         self.html = None
 292
 293     def loads(self, html):
 294         self.html = html
 295         self.feed(html)
 296         self.close()
 297
class AttrParser(BaseHTMLParser):
    """Modified HTMLParser that isolates a tag with the specified attribute

    While parsing, self.result grows to [tag, start_position, end_position]
    for the first tag whose `attribute` equals `value`; the positions are
    getpos() tuples. get_result() cuts the text in between out of the
    parsed document.
    """
    def __init__(self, attribute, value):
        self.attribute = attribute
        self.value = value
        self.result = None
        # True while the parser is inside the matched element
        self.started = False
        # Number of currently open tags per tag name, counted from the match on
        self.depth = {}
        # True between the matching start tag and the next parser event
        self.watch_startpos = False
        self.error_count = 0
        BaseHTMLParser.__init__(self)

    def error(self, message):
        """Handle a parse error: give up after more than 10 errors or once
        the wanted element has started, otherwise resume parsing on the
        lines following the current position."""
        if self.error_count > 10 or self.started:
            raise compat_html_parser.HTMLParseError(message, self.getpos())
        self.rawdata = '\n'.join(self.html.split('\n')[self.getpos()[0]:]) # skip one line
        self.error_count += 1
        self.goahead(1)

    def handle_starttag(self, tag, attrs):
        attrs = dict(attrs)
        if self.started:
            # A nested start tag is also the first event after the match
            self.find_startpos(None)
        if self.attribute in attrs and attrs[self.attribute] == self.value:
            self.result = [tag]
            self.started = True
            self.watch_startpos = True
        if self.started:
            if not tag in self.depth: self.depth[tag] = 0
            self.depth[tag] += 1

    def handle_endtag(self, tag):
        if self.started:
            if tag in self.depth: self.depth[tag] -= 1
            # The element is complete once all tags named like the matched
            # one are closed again
            if self.depth[self.result[0]] == 0:
                self.started = False
                self.result.append(self.getpos())

    def find_startpos(self, x):
        """Needed to put the start position of the result (self.result[1])
        after the opening tag with the requested id"""
        if self.watch_startpos:
            self.watch_startpos = False
            self.result.append(self.getpos())
    # Every other parser event is routed to find_startpos as well
    handle_entityref = handle_charref = handle_data = handle_comment = \
    handle_decl = handle_pi = unknown_decl = find_startpos

    def get_result(self):
        """Return the stripped text between the recorded start and end
        positions, or None unless both positions were recorded."""
        if self.result is None:
            return None
        if len(self.result) != 3:
            return None
        lines = self.html.split('\n')
        # The recorded line numbers are used as 1-based indices into lines
        lines = lines[self.result[1][0]-1:self.result[2][0]]
        lines[0] = lines[0][self.result[1][1]:]
        if len(lines) == 1:
            # Start and end share a line whose head was just cut off, so the
            # end offset has to be shifted by the start offset
            lines[-1] = lines[-1][:self.result[2][1]-self.result[1][1]]
        lines[-1] = lines[-1][:self.result[2][1]]
        return '\n'.join(lines).strip()
 357 # Hack for https://github.com/rg3/youtube-dl/issues/662
 358 if sys.version_info < (2, 7, 3):
 359     AttrParser.parse_endtag = (lambda self, i:
 360         i + len("</scr'+'ipt>")
 361         if self.rawdata[i:].startswith("</scr'+'ipt>")
 362         else compat_html_parser.HTMLParser.parse_endtag(self, i))
 363
 364 def get_element_by_id(id, html):
 365     """Return the content of the tag with the specified ID in the passed HTML document"""
 366     return get_element_by_attribute("id", id, html)
 367
 368 def get_element_by_attribute(attribute, value, html):
 369     """Return the content of the tag with the specified attribute in the passed HTML document"""
 370     parser = AttrParser(attribute, value)
 371     try:
 372         parser.loads(html)
 373     except compat_html_parser.HTMLParseError:
 374         pass
 375     return parser.get_result()
 376
 377 class MetaParser(BaseHTMLParser):
 378     """
 379     Modified HTMLParser that isolates a meta tag with the specified name
 380     attribute.
 381     """
 382     def __init__(self, name):
 383         BaseHTMLParser.__init__(self)
 384         self.name = name
 385         self.content = None
 386         self.result = None
 387
 388     def handle_starttag(self, tag, attrs):
 389         if tag != 'meta':
 390             return
 391         attrs = dict(attrs)
 392         if attrs.get('name') == self.name:
 393             self.result = attrs.get('content')
 394
 395     def get_result(self):
 396         return self.result
 397
 398 def get_meta_content(name, html):
 399     """
 400     Return the content attribute from the meta tag with the given name attribute.
 401     """
 402     parser = MetaParser(name)
 403     try:
 404         parser.loads(html)
 405     except compat_html_parser.HTMLParseError:
 406         pass
 407     return parser.get_result()
 408
 409
 410 def clean_html(html):
 411     """Clean an HTML snippet into a readable string"""
 412     # Newline vs <br />
 413     html = html.replace('\n', ' ')
 414     html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
 415     html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
 416     # Strip html tags
 417     html = re.sub('<.*?>', '', html)
 418     # Replace html entities
 419     html = unescapeHTML(html)
 420     return html.strip()
 421
 422
 423 def sanitize_open(filename, open_mode):
 424     """Try to open the given filename, and slightly tweak it if this fails.
 425
 426     Attempts to open the given filename. If this fails, it tries to change
 427     the filename slightly, step by step, until it's either able to open it
 428     or it fails and raises a final exception, like the standard open()
 429     function.
 430
 431     It returns the tuple (stream, definitive_file_name).
 432     """
 433     try:
 434         if filename == u'-':
 435             if sys.platform == 'win32':
 436                 import msvcrt
 437                 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
 438             return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
 439         stream = open(encodeFilename(filename), open_mode)
 440         return (stream, filename)
 441     except (IOError, OSError) as err:
 442         if err.errno in (errno.EACCES,):
 443             raise
 444
 445         # In case of error, try to remove win32 forbidden chars
 446         alt_filename = os.path.join(
 447                         re.sub(u'[/<>:"\\|\\\\?\\*]', u'#', path_part)
 448                         for path_part in os.path.split(filename)
 449                        )
 450         if alt_filename == filename:
 451             raise
 452         else:
 453             # An exception here should be caught in the caller
 454             stream = open(encodeFilename(filename), open_mode)
 455             return (stream, alt_filename)
 456
 457
 458 def timeconvert(timestr):
 459     """Convert RFC 2822 defined time string into system timestamp"""
 460     timestamp = None
 461     timetuple = email.utils.parsedate_tz(timestr)
 462     if timetuple is not None:
 463         timestamp = email.utils.mktime_tz(timetuple)
 464     return timestamp
 465
 466 def sanitize_filename(s, restricted=False, is_id=False):
 467     """Sanitizes a string so it could be used as part of a filename.
 468     If restricted is set, use a stricter subset of allowed characters.
 469     Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
 470     """
 471     def replace_insane(char):
 472         if char == '?' or ord(char) < 32 or ord(char) == 127:
 473             return ''
 474         elif char == '"':
 475             return '' if restricted else '\''
 476         elif char == ':':
 477             return '_-' if restricted else ' -'
 478         elif char in '\\/|*<>':
 479             return '_'
 480         if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
 481             return '_'
 482         if restricted and ord(char) > 127:
 483             return '_'
 484         return char
 485
 486     result = u''.join(map(replace_insane, s))
 487     if not is_id:
 488         while '__' in result:
 489             result = result.replace('__', '_')
 490         result = result.strip('_')
 491         # Common case of "Foreign band name - English song title"
 492         if restricted and result.startswith('-_'):
 493             result = result[2:]
 494         if not result:
 495             result = '_'
 496     return result
 497
 498 def orderedSet(iterable):
 499     """ Remove all duplicates from the input iterable """
 500     res = []
 501     for el in iterable:
 502         if el not in res:
 503             res.append(el)
 504     return res
 505
 506
 507 def unescapeHTML(s):
 508     if s is None:
 509         return None
 510     assert type(s) == compat_str
 511
 512     result = re.sub(r'(?u)&(.+?);', htmlentity_transform, s)
 513     return result
 514
 515
 516 def encodeFilename(s, for_subprocess=False):
 517     """
 518     @param s The name of the file
 519     """
 520
 521     assert type(s) == compat_str
 522
 523     # Python 3 has a Unicode API
 524     if sys.version_info >= (3, 0):
 525         return s
 526
 527     if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
 528         # Pass u'' directly to use Unicode APIs on Windows 2000 and up
 529         # (Detecting Windows NT 4 is tricky because 'major >= 4' would
 530         # match Windows 9x series as well. Besides, NT 4 is obsolete.)
 531         if not for_subprocess:
 532             return s
 533         else:
 534             # For subprocess calls, encode with locale encoding
 535             # Refer to http://stackoverflow.com/a/9951851/35070
 536             encoding = preferredencoding()
 537     else:
 538         encoding = sys.getfilesystemencoding()
 539     if encoding is None:
 540         encoding = 'utf-8'
 541     return s.encode(encoding, 'ignore')
 542
 543
 544 def encodeArgument(s):
 545     if not isinstance(s, compat_str):
 546         # Legacy code that uses byte strings
 547         # Uncomment the following line after fixing all post processors
 548         #assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
 549         s = s.decode('ascii')
 550     return encodeFilename(s, True)
 551
 552
 553 def decodeOption(optval):
 554     if optval is None:
 555         return optval
 556     if isinstance(optval, bytes):
 557         optval = optval.decode(preferredencoding())
 558
 559     assert isinstance(optval, compat_str)
 560     return optval
 561
 562 def formatSeconds(secs):
 563     if secs > 3600:
 564         return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
 565     elif secs > 60:
 566         return '%d:%02d' % (secs // 60, secs % 60)
 567     else:
 568         return '%d' % secs
 569
 570
 571 def make_HTTPS_handler(opts_no_check_certificate, **kwargs):
 572     if sys.version_info < (3, 2):
 573         import httplib
 574
 575         class HTTPSConnectionV3(httplib.HTTPSConnection):
 576             def __init__(self, *args, **kwargs):
 577                 httplib.HTTPSConnection.__init__(self, *args, **kwargs)
 578
 579             def connect(self):
 580                 sock = socket.create_connection((self.host, self.port), self.timeout)
 581                 if getattr(self, '_tunnel_host', False):
 582                     self.sock = sock
 583                     self._tunnel()
 584                 try:
 585                     self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_SSLv3)
 586                 except ssl.SSLError:
 587                     self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_SSLv23)
 588
 589         class HTTPSHandlerV3(compat_urllib_request.HTTPSHandler):
 590             def https_open(self, req):
 591                 return self.do_open(HTTPSConnectionV3, req)
 592         return HTTPSHandlerV3(**kwargs)
 593     else:
 594         context = ssl.SSLContext(ssl.PROTOCOL_SSLv3)
 595         context.verify_mode = (ssl.CERT_NONE
 596                                if opts_no_check_certificate
 597                                else ssl.CERT_REQUIRED)
 598         context.set_default_verify_paths()
 599         try:
 600             context.load_default_certs()
 601         except AttributeError:
 602             pass  # Python < 3.4
 603         return compat_urllib_request.HTTPSHandler(context=context, **kwargs)
 604
 605 class ExtractorError(Exception):
 606     """Error during info extraction."""
 607     def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
 608         """ tb, if given, is the original traceback (so that it can be printed out).
 609         If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
 610         """
 611
 612         if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
 613             expected = True
 614         if video_id is not None:
 615             msg = video_id + ': ' + msg
 616         if not expected:
 617             msg = msg + u'; please report this issue on https://yt-dl.org/bug . Be sure to call youtube-dl with the --verbose flag and include its complete output. Make sure you are using the latest version; type  youtube-dl -U  to update.'
 618         super(ExtractorError, self).__init__(msg)
 619
 620         self.traceback = tb
 621         self.exc_info = sys.exc_info()  # preserve original exception
 622         self.cause = cause
 623         self.video_id = video_id
 624
 625     def format_traceback(self):
 626         if self.traceback is None:
 627             return None
 628         return u''.join(traceback.format_tb(self.traceback))
 629
 630
 631 class RegexNotFoundError(ExtractorError):
 632     """Error when a regex didn't match"""
 633     pass
 634
 635
 636 class DownloadError(Exception):
 637     """Download Error exception.
 638
 639     This exception may be thrown by FileDownloader objects if they are not
 640     configured to continue on errors. They will contain the appropriate
 641     error message.
 642     """
 643     def __init__(self, msg, exc_info=None):
 644         """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
 645         super(DownloadError, self).__init__(msg)
 646         self.exc_info = exc_info
 647
 648
 649 class SameFileError(Exception):
 650     """Same File exception.
 651
 652     This exception will be thrown by FileDownloader objects if they detect
 653     multiple files would have to be downloaded to the same file on disk.
 654     """
 655     pass
 656
 657
 658 class PostProcessingError(Exception):
 659     """Post Processing exception.
 660
 661     This exception may be raised by PostProcessor's .run() method to
 662     indicate an error in the postprocessing task.
 663     """
 664     def __init__(self, msg):
 665         self.msg = msg
 666
 667 class MaxDownloadsReached(Exception):
 668     """ --max-downloads limit has been reached. """
 669     pass
 670
 671
 672 class UnavailableVideoError(Exception):
 673     """Unavailable Format exception.
 674
 675     This exception will be thrown when a video is requested
 676     in a format that is not available for that video.
 677     """
 678     pass
 679
 680
 681 class ContentTooShortError(Exception):
 682     """Content Too Short exception.
 683
 684     This exception may be raised by FileDownloader objects when a file they
 685     download is too small for what the server announced first, indicating
 686     the connection was probably interrupted.
 687     """
 688     # Both in bytes
 689     downloaded = None
 690     expected = None
 691
 692     def __init__(self, downloaded, expected):
 693         self.downloaded = downloaded
 694         self.expected = expected
 695
 696 class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
 697     """Handler for HTTP requests and responses.
 698
 699     This class, when installed with an OpenerDirector, automatically adds
 700     the standard headers to every HTTP request and handles gzipped and
 701     deflated responses from web servers. If compression is to be avoided in
 702     a particular request, the original request in the program code only has
 703     to include the HTTP header "Youtubedl-No-Compression", which will be
 704     removed before making the real request.
 705
 706     Part of this code was copied from:
 707
 708     http://techknack.net/python-urllib2-handlers/
 709
 710     Andrew Rowls, the author of that code, agreed to release it to the
 711     public domain.
 712     """
 713
 714     @staticmethod
 715     def deflate(data):
 716         try:
 717             return zlib.decompress(data, -zlib.MAX_WBITS)
 718         except zlib.error:
 719             return zlib.decompress(data)
 720
 721     @staticmethod
 722     def addinfourl_wrapper(stream, headers, url, code):
 723         if hasattr(compat_urllib_request.addinfourl, 'getcode'):
 724             return compat_urllib_request.addinfourl(stream, headers, url, code)
 725         ret = compat_urllib_request.addinfourl(stream, headers, url)
 726         ret.code = code
 727         return ret
 728
 729     def http_request(self, req):
 730         for h,v in std_headers.items():
 731             if h in req.headers:
 732                 del req.headers[h]
 733             req.add_header(h, v)
 734         if 'Youtubedl-no-compression' in req.headers:
 735             if 'Accept-encoding' in req.headers:
 736                 del req.headers['Accept-encoding']
 737             del req.headers['Youtubedl-no-compression']
 738         if 'Youtubedl-user-agent' in req.headers:
 739             if 'User-agent' in req.headers:
 740                 del req.headers['User-agent']
 741             req.headers['User-agent'] = req.headers['Youtubedl-user-agent']
 742             del req.headers['Youtubedl-user-agent']
 743         return req
 744
 745     def http_response(self, req, resp):
 746         old_resp = resp
 747         # gzip
 748         if resp.headers.get('Content-encoding', '') == 'gzip':
 749             content = resp.read()
 750             gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
 751             try:
 752                 uncompressed = io.BytesIO(gz.read())
 753             except IOError as original_ioerror:
 754                 # There may be junk add the end of the file
 755                 # See http://stackoverflow.com/q/4928560/35070 for details
 756                 for i in range(1, 1024):
 757                     try:
 758                         gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
 759                         uncompressed = io.BytesIO(gz.read())
 760                     except IOError:
 761                         continue
 762                     break
 763                 else:
 764                     raise original_ioerror
 765             resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
 766             resp.msg = old_resp.msg
 767         # deflate
 768         if resp.headers.get('Content-encoding', '') == 'deflate':
 769             gz = io.BytesIO(self.deflate(resp.read()))
 770             resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
 771             resp.msg = old_resp.msg
 772         return resp
 773
 774     https_request = http_request
 775     https_response = http_response
 776
 777
 778 def parse_iso8601(date_str, delimiter='T'):
 779     """ Return a UNIX timestamp from the given date """
 780
 781     if date_str is None:
 782         return None
 783
 784     m = re.search(
 785         r'Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$',
 786         date_str)
 787     if not m:
 788         timezone = datetime.timedelta()
 789     else:
 790         date_str = date_str[:-len(m.group(0))]
 791         if not m.group('sign'):
 792             timezone = datetime.timedelta()
 793         else:
 794             sign = 1 if m.group('sign') == '+' else -1
 795             timezone = datetime.timedelta(
 796                 hours=sign * int(m.group('hours')),
 797                 minutes=sign * int(m.group('minutes')))
 798     date_format =  '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
 799     dt = datetime.datetime.strptime(date_str, date_format) - timezone
 800     return calendar.timegm(dt.timetuple())
 801
 802
 803 def unified_strdate(date_str):
 804     """Return a string with the date in the format YYYYMMDD"""
 805
 806     if date_str is None:
 807         return None
 808
 809     upload_date = None
 810     #Replace commas
 811     date_str = date_str.replace(',', ' ')
 812     # %z (UTC offset) is only supported in python>=3.2
 813     date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str)
 814     format_expressions = [
 815         '%d %B %Y',
 816         '%d %b %Y',
 817         '%B %d %Y',
 818         '%b %d %Y',
 819         '%b %dst %Y %I:%M%p',
 820         '%b %dnd %Y %I:%M%p',
 821         '%b %dth %Y %I:%M%p',
 822         '%Y-%m-%d',
 823         '%d.%m.%Y',
 824         '%d/%m/%Y',
 825         '%Y/%m/%d %H:%M:%S',
 826         '%Y-%m-%d %H:%M:%S',
 827         '%d.%m.%Y %H:%M',
 828         '%d.%m.%Y %H.%M',
 829         '%Y-%m-%dT%H:%M:%SZ',
 830         '%Y-%m-%dT%H:%M:%S.%fZ',
 831         '%Y-%m-%dT%H:%M:%S.%f0Z',
 832         '%Y-%m-%dT%H:%M:%S',
 833         '%Y-%m-%dT%H:%M:%S.%f',
 834         '%Y-%m-%dT%H:%M',
 835     ]
 836     for expression in format_expressions:
 837         try:
 838             upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
 839         except ValueError:
 840             pass
 841     if upload_date is None:
 842         timetuple = email.utils.parsedate_tz(date_str)
 843         if timetuple:
 844             upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
 845     return upload_date
 846
 847 def determine_ext(url, default_ext=u'unknown_video'):
 848     guess = url.partition(u'?')[0].rpartition(u'.')[2]
 849     if re.match(r'^[A-Za-z0-9]+$', guess):
 850         return guess
 851     else:
 852         return default_ext
 853
 854 def subtitles_filename(filename, sub_lang, sub_format):
 855     return filename.rsplit('.', 1)[0] + u'.' + sub_lang + u'.' + sub_format
 856
 857 def date_from_str(date_str):
 858     """
 859     Return a datetime object from a string in the format YYYYMMDD or
 860     (now|today)[+-][0-9](day|week|month|year)(s)?"""
 861     today = datetime.date.today()
 862     if date_str == 'now'or date_str == 'today':
 863         return today
 864     match = re.match('(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
 865     if match is not None:
 866         sign = match.group('sign')
 867         time = int(match.group('time'))
 868         if sign == '-':
 869             time = -time
 870         unit = match.group('unit')
 871         #A bad aproximation?
 872         if unit == 'month':
 873             unit = 'day'
 874             time *= 30
 875         elif unit == 'year':
 876             unit = 'day'
 877             time *= 365
 878         unit += 's'
 879         delta = datetime.timedelta(**{unit: time})
 880         return today + delta
 881     return datetime.datetime.strptime(date_str, "%Y%m%d").date()
 882
 883 def hyphenate_date(date_str):
 884     """
 885     Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
 886     match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
 887     if match is not None:
 888         return '-'.join(match.groups())
 889     else:
 890         return date_str
 891
 892 class DateRange(object):
 893     """Represents a time interval between two dates"""
 894     def __init__(self, start=None, end=None):
 895         """start and end must be strings in the format accepted by date"""
 896         if start is not None:
 897             self.start = date_from_str(start)
 898         else:
 899             self.start = datetime.datetime.min.date()
 900         if end is not None:
 901             self.end = date_from_str(end)
 902         else:
 903             self.end = datetime.datetime.max.date()
 904         if self.start > self.end:
 905             raise ValueError('Date range: "%s" , the start date must be before the end date' % self)
 906     @classmethod
 907     def day(cls, day):
 908         """Returns a range that only contains the given day"""
 909         return cls(day,day)
 910     def __contains__(self, date):
 911         """Check if the date is in the range"""
 912         if not isinstance(date, datetime.date):
 913             date = date_from_str(date)
 914         return self.start <= date <= self.end
 915     def __str__(self):
 916         return '%s - %s' % ( self.start.isoformat(), self.end.isoformat())
 917
 918
 919 def platform_name():
 920     """ Returns the platform name as a compat_str """
 921     res = platform.platform()
 922     if isinstance(res, bytes):
 923         res = res.decode(preferredencoding())
 924
 925     assert isinstance(res, compat_str)
 926     return res
 927
 928
def _windows_write_string(s, out):
    """ Returns True if the string was written using special methods,
    False if it has yet to be written out.

    s is written through the kernel32 WriteConsoleW function when out is
    backed by file descriptor 1 or 2 and the matching standard handle is a
    console; in every other case nothing is written and False is returned
    so the caller can fall back to a regular write.
    Raises OSError if WriteConsoleW reports a failure."""
    # Adapted from http://stackoverflow.com/a/3259271/35070

    import ctypes
    import ctypes.wintypes

    # File descriptor -> argument for GetStdHandle
    # (-11 / -12 are the Win32 STD_OUTPUT_HANDLE / STD_ERROR_HANDLE ids)
    WIN_OUTPUT_IDS = {
        1: -11,
        2: -12,
    }

    try:
        fileno = out.fileno()
    except AttributeError:
        # If the output stream doesn't have a fileno, it's virtual
        return False
    if fileno not in WIN_OUTPUT_IDS:
        return False

    # Build ctypes prototypes for the kernel32 functions used below
    GetStdHandle = ctypes.WINFUNCTYPE(
        ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
        ("GetStdHandle", ctypes.windll.kernel32))
    h = GetStdHandle(WIN_OUTPUT_IDS[fileno])

    WriteConsoleW = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
        ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
        ctypes.wintypes.LPVOID)(("WriteConsoleW", ctypes.windll.kernel32))
    # Receives the number of characters written by each WriteConsoleW call
    written = ctypes.wintypes.DWORD(0)

    GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)(("GetFileType", ctypes.windll.kernel32))
    FILE_TYPE_CHAR = 0x0002
    FILE_TYPE_REMOTE = 0x8000
    GetConsoleMode = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
        ctypes.POINTER(ctypes.wintypes.DWORD))(
        ("GetConsoleMode", ctypes.windll.kernel32))
    INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value

    def not_a_console(handle):
        # True for invalid handles, for handles that are not of character
        # type (the remote flag is masked out first) and for handles on
        # which GetConsoleMode fails
        if handle == INVALID_HANDLE_VALUE or handle is None:
            return True
        return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR
                or GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)

    if not_a_console(h):
        return False

    def next_nonbmp_pos(s):
        # Index of the first character above U+FFFF, or len(s) if none
        try:
            return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
        except StopIteration:
            return len(s)

    while s:
        # Write at most 1024 characters per call, stopping before the next
        # non-BMP character; count == 0 means s starts with one
        count = min(next_nonbmp_pos(s), 1024)

        # A leading non-BMP character is written with a length of 2
        # (presumably its UTF-16 surrogate pair -- confirm)
        ret = WriteConsoleW(
            h, s, count if count else 2, ctypes.byref(written), None)
        if ret == 0:
            raise OSError('Failed to write string')
        if not count:  # We just wrote a non-BMP character
            assert written.value == 2
            s = s[1:]
        else:
            assert written.value > 0
            # Continue with whatever was not written yet
            s = s[written.value:]
    return True
 999
1000
def write_string(s, out=None, encoding=None):
    """Write the text string s to out (sys.stderr by default) and flush it.

    On Windows, when no explicit encoding is requested, the console API is
    tried first.  Otherwise s is encoded (ignoring unencodable characters)
    for byte-oriented streams, or written as text as a last resort.
    """
    if out is None:
        out = sys.stderr
    assert type(s) == compat_str

    if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
        if _windows_write_string(s, out):
            return

    # Python 2 lies about mode of sys.stderr
    wants_bytes = 'b' in getattr(out, 'mode', '') or sys.version_info[0] < 3
    if wants_bytes:
        out.write(s.encode(encoding or preferredencoding(), 'ignore'))
    elif hasattr(out, 'buffer'):
        # Text stream with an underlying binary buffer: encode ourselves
        charset = encoding or getattr(out, 'encoding', None) or preferredencoding()
        out.buffer.write(s.encode(charset, 'ignore'))
    else:
        out.write(s)
    out.flush()
1021
1022
def bytes_to_intlist(bs):
    """Convert a byte string (or any sequence of ints / 1-char strings)
    to a list of integers; empty input gives an empty list."""
    if not bs:
        return []
    # Python 3 bytes already index to ints; Python 2 str needs ord()
    already_ints = isinstance(bs[0], int)
    return list(bs) if already_ints else [ord(ch) for ch in bs]
1030
1031
def intlist_to_bytes(xs):
    """Convert a list of integers to a byte string; empty input gives b''."""
    if not xs:
        return b''
    if not isinstance(chr(0), bytes):
        # Python 3: bytes() accepts an iterable of ints directly
        return bytes(xs)
    # Python 2: chr() yields one-byte strings
    return ''.join([chr(value) for value in xs])
1039
1040
def get_cachedir(params={}):
    """Return params['cachedir'] if present, otherwise the 'youtube-dl'
    directory below $XDG_CACHE_HOME (or below ~/.cache if that is unset)."""
    home_cache = os.path.expanduser('~/.cache')
    cache_root = os.environ.get('XDG_CACHE_HOME', home_cache)
    fallback = os.path.join(cache_root, 'youtube-dl')
    return params.get('cachedir', fallback)
1045
1046
# Cross-platform file locking
# Defines _lock_file(f, exclusive) and _unlock_file(f) for the current
# platform: LockFileEx/UnlockFileEx via ctypes on Windows, fcntl.lockf
# everywhere else.
if sys.platform == 'win32':
    import ctypes.wintypes
    import msvcrt

    # ctypes structure passed as the last argument of LockFileEx/UnlockFileEx
    class OVERLAPPED(ctypes.Structure):
        _fields_ = [
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),
        ]

    kernel32 = ctypes.windll.kernel32
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    # Low/high DWORDs of the byte count to lock, starting at offset 0
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive):
        """Lock the open file f; raises OSError if LockFileEx fails."""
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        # Kept on the file object so _unlock_file can pass the same pointer
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)
        handle = msvcrt.get_osfhandle(f.fileno())
        # NOTE(review): 0x2 is presumably LOCKFILE_EXCLUSIVE_LOCK -- confirm
        if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
                          whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Locking file failed: %r' % ctypes.FormatError())

    def _unlock_file(f):
        """Release a lock taken by _lock_file; raises OSError on failure."""
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        if not UnlockFileEx(handle, 0,
                            whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())

else:
    import fcntl

    def _lock_file(f, exclusive):
        """Take an exclusive (LOCK_EX) or shared (LOCK_SH) lock on f."""
        fcntl.lockf(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)

    def _unlock_file(f):
        """Release the lock held on f."""
        fcntl.lockf(f, fcntl.LOCK_UN)
1110
1111
class locked_file(object):
    """File wrapper that holds a lock on the file while used as a context
    manager.

    The file is opened with io.open in __init__; entering the context takes
    a shared lock for mode 'r' and an exclusive lock for 'a'/'w', leaving
    it unlocks and closes the file.  Iteration, read() and write() are
    forwarded to the underlying file object.
    """

    def __init__(self, filename, mode, encoding=None):
        assert mode in ['r', 'a', 'w']
        self.f = io.open(filename, mode, encoding=encoding)
        self.mode = mode

    def __enter__(self):
        exclusive = self.mode != 'r'
        try:
            _lock_file(self.f, exclusive)
        except (IOError, OSError):
            # The Windows _lock_file raises OSError, which is not an IOError
            # on Python 2; catch both so the handle is never leaked.
            self.f.close()
            raise
        return self

    def __exit__(self, etype, value, traceback):
        try:
            _unlock_file(self.f)
        finally:
            # Close even if unlocking failed
            self.f.close()

    def __iter__(self):
        return iter(self.f)

    def write(self, *args):
        return self.f.write(*args)

    def read(self, *args):
        return self.f.read(*args)
1141
1142
def shell_quote(args):
    """Return args as a single shell-quoted, space-separated string.

    Byte string arguments are decoded with the filesystem encoding
    (utf-8 if that is unknown) before quoting.
    """
    fs_encoding = sys.getfilesystemencoding()
    if fs_encoding is None:
        fs_encoding = 'utf-8'

    def _as_text(arg):
        # We may get a filename encoded with 'encodeFilename'
        return arg.decode(fs_encoding) if isinstance(arg, bytes) else arg

    return u' '.join([pipes.quote(_as_text(arg)) for arg in args])
1154
1155
def takewhile_inclusive(pred, seq):
    """ Like itertools.takewhile, but also yield the last evaluated element
        (the first element e for which pred(e) is false) """
    for item in seq:
        yield item
        if pred(item):
            continue
        break
1163
1164
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use.

    data is JSON-encoded and appended to url as a URL-encoded
    '__youtubedl_smuggle' fragment. """
    payload = {u'__youtubedl_smuggle': json.dumps(data)}
    fragment = compat_urllib_parse.urlencode(payload)
    return url + u'#' + fragment
1171
1172
def unsmuggle_url(smug_url, default=None):
    """Split a URL built by smuggle_url into (url, data).

    URLs without smuggled data are returned unchanged together with default.
    """
    if '#__youtubedl_smuggle' not in smug_url:
        return smug_url, default
    url, _, fragment = smug_url.rpartition(u'#')
    encoded = compat_parse_qs(fragment)[u'__youtubedl_smuggle'][0]
    return url, json.loads(encoded)
1180
1181
def format_bytes(bytes):
    """Format a byte count with a binary unit suffix and two decimals
    (e.g. u'1.50KiB'); None is rendered as u'N/A'."""
    if bytes is None:
        return u'N/A'
    if type(bytes) is str:
        bytes = float(bytes)
    # log(0) is undefined, so zero is special-cased to the plain byte unit
    exponent = 0 if bytes == 0.0 else int(math.log(bytes, 1024.0))
    units = [u'B', u'KiB', u'MiB', u'GiB', u'TiB', u'PiB', u'EiB', u'ZiB', u'YiB']
    suffix = units[exponent]
    scaled = float(bytes) / float(1024 ** exponent)
    return u'%.2f%s' % (scaled, suffix)
1194
1195
def str_to_int(int_str):
    """Parse an integer written with ',' or '.' as digit group separators."""
    return int(re.sub(r'[,\.]', u'', int_str))
1199
1200
def get_term_width():
    """Return the terminal width in columns, or None if it is unknown.

    The COLUMNS environment variable is used when it holds an integer;
    otherwise the output of 'stty size' is consulted.  This is a best-effort
    helper: any failure to determine the width yields None.
    """
    columns = os.environ.get('COLUMNS', None)
    if columns:
        try:
            return int(columns)
        except ValueError:
            # Malformed COLUMNS value: fall back to asking the terminal
            pass

    try:
        sp = subprocess.Popen(
            ['stty', 'size'],
            stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        out, err = sp.communicate()
        # 'stty size' prints "<rows> <columns>"
        return int(out.split()[1])
    except Exception:
        # stty missing, not a terminal, unexpected output, ...
        # (KeyboardInterrupt and SystemExit are deliberately not swallowed)
        pass
    return None
1215
1216
def month_by_name(name):
    """ Return the number (1-12) of a month given its full English name,
    independently of the locale; None if the name is not recognized """

    ENGLISH_NAMES = [
        u'January', u'February', u'March', u'April', u'May', u'June',
        u'July', u'August', u'September', u'October', u'November', u'December']
    for number, month in enumerate(ENGLISH_NAMES, 1):
        if month == name:
            return number
    return None
1227
1228
def fix_xml_ampersands(xml_str):
    """Replace every '&' by '&amp;' in XML, except where it already starts
    one of the predefined entities or a numeric character reference"""
    bare_ampersand = r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)'
    return re.sub(bare_ampersand, u'&amp;', xml_str)
1235
1236
def setproctitle(title):
    """Best-effort: set the process name to title through libc's prctl.

    Silently does nothing if libc.so.6 cannot be loaded or does not provide
    prctl.
    """
    assert isinstance(title, compat_str)
    try:
        libc = ctypes.cdll.LoadLibrary("libc.so.6")
    except OSError:
        return
    title_bytes = title.encode('utf-8')
    # Initialising the buffer from the bytes makes it one item larger than
    # the data, so it is always NUL-terminated; a buffer of exactly
    # len(title_bytes) would leave the C string unterminated.
    buf = ctypes.create_string_buffer(title_bytes)
    try:
        # NOTE(review): 15 is presumably PR_SET_NAME -- confirm in prctl(2)
        libc.prctl(15, buf, 0, 0, 0)
    except AttributeError:
        return  # Strange libc, just skip this
1250
1251
def remove_start(s, start):
    """Return s without the prefix start; s is returned unchanged if it
    does not begin with start."""
    return s[len(start):] if s.startswith(start) else s
1256
1257
def url_basename(url):
    """Return the last component of the URL's path, ignoring leading and
    trailing slashes (u'' if the path is empty)."""
    trimmed_path = compat_urlparse.urlparse(url).path.strip(u'/')
    return trimmed_path.rpartition(u'/')[2]
1261
1262
class HEADRequest(compat_urllib_request.Request):
    """Request subclass whose HTTP method is always HEAD"""

    def get_method(self):
        return 'HEAD'
1266
1267
def int_or_none(v, scale=1, default=None, get_attr=None):
    """Return int(v) // scale, or default if v is None.

    If get_attr is given and v is not None, the attribute of that name is
    read from v first (a missing attribute counts as None).
    """
    if get_attr and v is not None:
        v = getattr(v, get_attr, None)
    if v is None:
        return default
    return int(v) // scale
1273
1274
def float_or_none(v, scale=1, default=None):
    """Return float(v) / scale, or default if v is None."""
    if v is None:
        return default
    return float(v) / scale
1277
1278
def parse_duration(s):
    """Parse a duration such as '1:02:03', '12:34' or '3h11m53s' into a
    number of seconds; returns None for None or unrecognized input."""
    if s is None:
        return None

    parsed = re.match(
        r'(?:(?:(?P<hours>[0-9]+)[:h])?(?P<mins>[0-9]+)[:m])?(?P<secs>[0-9]+)s?(?::[0-9]+)?$', s)
    if not parsed:
        return None
    hours, mins, secs = parsed.group('hours', 'mins', 'secs')
    total = int(secs)
    if mins:
        total += int(mins) * 60
        # Hours can only be present together with minutes
        if hours:
            total += int(hours) * 60 * 60
    return total
1293
1294
def prepend_extension(filename, ext):
    """Insert ext in front of filename's real extension
    ('abc.ext' + 'temp' -> 'abc.temp.ext')."""
    parts = os.path.splitext(filename)
    return u'{0}.{1}{2}'.format(parts[0], ext, parts[1])
1298
1299
def check_executable(exe, args=[]):
    """ Checks if the given binary is installed somewhere in PATH, and returns its name.
    args can be a list of arguments for a short output (like -version).
    Returns False if the binary could not be started. """
    command = [exe] + args
    try:
        process = subprocess.Popen(
            command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        process.communicate()
    except OSError:
        return False
    return exe
1308
1309
class PagedList(object):
    """List-like view over results that are fetched page by page.

    pagefunc(pagenum) must return an iterable with the entries of the given
    (zero-based) page; pagesize is the number of entries on a full page.
    """

    def __init__(self, pagefunc, pagesize):
        self._pagefunc = pagefunc
        self._pagesize = pagesize

    def __len__(self):
        # This is only useful for tests
        return len(self.getslice())

    def getslice(self, start=0, end=None):
        """Return the entries with indices in [start, end) as a list,
        fetching only the pages that are needed."""
        pagesize = self._pagesize
        collected = []
        for pagenum in itertools.count(start // pagesize):
            page_first = pagenum * pagesize
            page_limit = page_first + pagesize
            if start >= page_limit:
                continue

            entries = list(self._pagefunc(pagenum))

            # Offset of the first wanted entry within this page
            if page_first <= start < page_limit:
                lo = start % pagesize
            else:
                lo = 0

            # Offset just past the last wanted entry, if it is on this page
            if end is not None and page_first <= end <= page_limit:
                hi = ((end - 1) % pagesize) + 1
            else:
                hi = None

            if lo != 0 or hi is not None:
                entries = entries[lo:hi]
            collected.extend(entries)

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            if len(entries) + lo < pagesize:
                break

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == page_limit:
                break
        return collected
1355
1356
def uppercase_escape(s):
    """Replace every literal '\\UXXXXXXXX' escape (8 hex digits) in s by the
    character it denotes."""
    decode = codecs.getdecoder('unicode_escape')

    def _expand(match):
        return decode(match.group(0))[0]

    return re.sub(r'\\U[0-9a-fA-F]{8}', _expand, s)
1363
# struct_pack / struct_unpack: like struct.pack / struct.unpack, but always
# accept a text (compat_str) format specification.
try:
    struct.pack(u'!I', 0)
except TypeError:
    # In Python 2.6 (and some 2.7 versions), struct requires a bytes argument
    def struct_pack(spec, *args):
        bytes_spec = spec.encode('ascii') if isinstance(spec, compat_str) else spec
        return struct.pack(bytes_spec, *args)

    def struct_unpack(spec, *args):
        bytes_spec = spec.encode('ascii') if isinstance(spec, compat_str) else spec
        return struct.unpack(bytes_spec, *args)
else:
    # Text specifications work natively; use the stdlib functions directly
    struct_pack, struct_unpack = struct.pack, struct.unpack
1380
1381
def read_batch_urls(batch_fd):
    """Read the URLs from a batch file object and close it.

    Each line is decoded as UTF-8 if necessary, stripped of a leading byte
    order mark and of surrounding whitespace.  Empty lines and lines
    starting with '#', ';' or ']' are skipped.  Returns a list of URLs.
    """
    def fixup(url):
        if not isinstance(url, compat_str):
            url = url.decode('utf-8', 'replace')
        # The BOM shows up as '\xef\xbb\xbf' when the bytes were decoded as
        # latin-1 and as '\ufeff' when they were decoded as UTF-8 (which is
        # what the decode above produces); neither is removed by strip().
        BOM_UTF8 = (u'\xef\xbb\xbf', u'\ufeff')
        for bom in BOM_UTF8:
            if url.startswith(bom):
                url = url[len(bom):]
        url = url.strip()
        if url.startswith(('#', ';', ']')):
            return False
        return url

    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]
1396
1397
def urlencode_postdata(*args, **kargs):
    """URL-encode the arguments like urlencode and return the result as
    ASCII bytes, ready to be sent as POST data."""
    encoded = compat_urllib_parse.urlencode(*args, **kargs)
    return encoded.encode('ascii')
1400
1401
def parse_xml(s):
    """Parse the XML text s (encoded as UTF-8 first) and return the root
    element; doctype declarations are ignored."""
    etree = xml.etree.ElementTree

    class TreeBuilder(etree.TreeBuilder):
        def doctype(self, name, pubid, system):
            pass  # Ignore doctypes

    # The custom parser is only passed on Python >= 2.7
    if sys.version_info >= (2, 7):
        kwargs = {'parser': etree.XMLParser(target=TreeBuilder())}
    else:
        etree.XMLParser(target=TreeBuilder())
        kwargs = {}
    return etree.XML(s.encode('utf-8'), **kwargs)
1410
1411
# compat_getpass: getpass.getpass that also accepts a text prompt on
# Python 2 under Windows.
if sys.version_info >= (3, 0) or sys.platform != 'win32':
    compat_getpass = getpass.getpass
else:
    def compat_getpass(prompt, *args, **kwargs):
        # Text prompts are encoded with the preferred encoding first
        if isinstance(prompt, compat_str):
            prompt = prompt.encode(preferredencoding())
        return getpass.getpass(prompt, *args, **kwargs)
1419
1420
# Maps US film rating labels to integer values
# (presumably the minimum viewer age for each rating -- confirm with callers)
US_RATINGS = {
    'G': 0,
    'PG': 10,
    'PG-13': 13,
    'R': 16,
    'NC': 18,
}
1428
1429
def strip_jsonp(code):
    """Strip a JSONP wrapper of the form 'callback(...);' and return the
    wrapped payload; input that does not match is returned unchanged."""
    jsonp_wrapper = r'(?s)^[a-zA-Z_]+\s*\(\s*(.*)\);\s*?\s*$'
    return re.sub(jsonp_wrapper, r'\1', code)
1432
1433
def qualities(quality_ids):
    """ Get a numeric quality value out of a list of possible values

    Returns a function mapping a quality id to its index in quality_ids,
    or to -1 if the id is not listed. """
    def q(qid):
        try:
            position = quality_ids.index(qid)
        except ValueError:
            position = -1
        return position
    return q
1442
1443
# Default output template: a %-style format string with the named fields
# title, id and ext (presumably used to build output file names -- confirm)
DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
1445
# subprocess_check_output: subprocess.check_output where available,
# otherwise a minimal replacement built on subprocess.Popen.
try:
    subprocess_check_output = subprocess.check_output
except AttributeError:
    def subprocess_check_output(*args, **kwargs):
        """Run a command and return what it wrote to stdout.

        Raises subprocess.CalledProcessError if the exit status is non-zero.
        The 'input' keyword argument is not supported.
        """
        assert 'input' not in kwargs
        p = subprocess.Popen(*args, stdout=subprocess.PIPE, **kwargs)
        output, _ = p.communicate()
        ret = p.poll()
        if ret:
            # NOTE(review): Popen.args and the output= keyword may not exist
            # on interpreters old enough to lack check_output -- confirm
            raise subprocess.CalledProcessError(ret, p.args, output=output)
        return output