2 # -*- coding: utf-8 -*-
29 import xml.etree.ElementTree
41 compat_urllib_parse_urlparse,
42 compat_urllib_request,
47 # This is not clearly defined otherwise
48 compiled_regex_type = type(re.compile(''))
51 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0 (Chrome)',
52 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
53 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
54 'Accept-Encoding': 'gzip, deflate',
55 'Accept-Language': 'en-us,en;q=0.5',
58 def preferredencoding():
59 """Get preferred encoding.
61 Returns the best encoding scheme for the system, based on
62 locale.getpreferredencoding() and some further tweaks.
65 pref = locale.getpreferredencoding()
73 def write_json_file(obj, fn):
74 """ Encode obj as JSON and write it to fn, atomically """
78 'prefix': os.path.basename(fn) + '.',
79 'dir': os.path.dirname(fn),
83 # In Python 2.x, json.dump expects a bytestream.
84 # In Python 3.x, it writes to a character stream
85 if sys.version_info < (3, 0):
93 tf = tempfile.NamedTemporaryFile(**args)
98 os.rename(tf.name, fn)
107 if sys.version_info >= (2, 7):
108 def find_xpath_attr(node, xpath, key, val):
109 """ Find the xpath xpath[@key=val] """
110 assert re.match(r'^[a-zA-Z-]+$', key)
111 assert re.match(r'^[a-zA-Z0-9@\s:._-]*$', val)
112 expr = xpath + u"[@%s='%s']" % (key, val)
113 return node.find(expr)
115 def find_xpath_attr(node, xpath, key, val):
116 # Here comes the crazy part: In 2.6, if the xpath is a unicode,
117 # .//node does not match if a node is a direct child of . !
118 if isinstance(xpath, unicode):
119 xpath = xpath.encode('ascii')
121 for f in node.findall(xpath):
122 if f.attrib.get(key) == val:
126 # On python2.6 the xml.etree.ElementTree.Element methods don't support
127 # the namespace parameter
128 def xpath_with_ns(path, ns_map):
129 components = [c.split(':') for c in path.split('/')]
133 replaced.append(c[0])
136 replaced.append('{%s}%s' % (ns_map[ns], tag))
137 return '/'.join(replaced)
140 def xpath_text(node, xpath, name=None, fatal=False):
141 if sys.version_info < (2, 7): # Crazy 2.6
142 xpath = xpath.encode('ascii')
147 name = xpath if name is None else name
148 raise ExtractorError('Could not find XML element %s' % name)
154 compat_html_parser.locatestarttagend = re.compile(r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re.VERBOSE) # backport bugfix
155 class BaseHTMLParser(compat_html_parser.HTMLParser):
157 compat_html_parser.HTMLParser.__init__(self)
160 def loads(self, html):
165 class AttrParser(BaseHTMLParser):
166 """Modified HTMLParser that isolates a tag with the specified attribute"""
167 def __init__(self, attribute, value):
168 self.attribute = attribute
173 self.watch_startpos = False
175 BaseHTMLParser.__init__(self)
177 def error(self, message):
178 if self.error_count > 10 or self.started:
179 raise compat_html_parser.HTMLParseError(message, self.getpos())
180 self.rawdata = '\n'.join(self.html.split('\n')[self.getpos()[0]:]) # skip one line
181 self.error_count += 1
184 def handle_starttag(self, tag, attrs):
187 self.find_startpos(None)
188 if self.attribute in attrs and attrs[self.attribute] == self.value:
191 self.watch_startpos = True
193 if not tag in self.depth: self.depth[tag] = 0
196 def handle_endtag(self, tag):
198 if tag in self.depth: self.depth[tag] -= 1
199 if self.depth[self.result[0]] == 0:
201 self.result.append(self.getpos())
203 def find_startpos(self, x):
204 """Needed to put the start position of the result (self.result[1])
205 after the opening tag with the requested id"""
206 if self.watch_startpos:
207 self.watch_startpos = False
208 self.result.append(self.getpos())
209 handle_entityref = handle_charref = handle_data = handle_comment = \
210 handle_decl = handle_pi = unknown_decl = find_startpos
212 def get_result(self):
213 if self.result is None:
215 if len(self.result) != 3:
217 lines = self.html.split('\n')
218 lines = lines[self.result[1][0]-1:self.result[2][0]]
219 lines[0] = lines[0][self.result[1][1]:]
221 lines[-1] = lines[-1][:self.result[2][1]-self.result[1][1]]
222 lines[-1] = lines[-1][:self.result[2][1]]
223 return '\n'.join(lines).strip()
224 # Hack for https://github.com/rg3/youtube-dl/issues/662
225 if sys.version_info < (2, 7, 3):
226 AttrParser.parse_endtag = (lambda self, i:
227 i + len("</scr'+'ipt>")
228 if self.rawdata[i:].startswith("</scr'+'ipt>")
229 else compat_html_parser.HTMLParser.parse_endtag(self, i))
def get_element_by_id(id, html):
    """Return the content of the element whose id attribute equals *id*."""
    # An id lookup is just an attribute lookup with the fixed key "id".
    return get_element_by_attribute("id", id, html)
235 def get_element_by_attribute(attribute, value, html):
236 """Return the content of the tag with the specified attribute in the passed HTML document"""
237 parser = AttrParser(attribute, value)
240 except compat_html_parser.HTMLParseError:
242 return parser.get_result()
244 class MetaParser(BaseHTMLParser):
246 Modified HTMLParser that isolates a meta tag with the specified name
249 def __init__(self, name):
250 BaseHTMLParser.__init__(self)
255 def handle_starttag(self, tag, attrs):
259 if attrs.get('name') == self.name:
260 self.result = attrs.get('content')
262 def get_result(self):
265 def get_meta_content(name, html):
267 Return the content attribute from the meta tag with the given name attribute.
269 parser = MetaParser(name)
272 except compat_html_parser.HTMLParseError:
274 return parser.get_result()
277 def clean_html(html):
278 """Clean an HTML snippet into a readable string"""
280 html = html.replace('\n', ' ')
281 html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
282 html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
284 html = re.sub('<.*?>', '', html)
285 # Replace html entities
286 html = unescapeHTML(html)
290 def sanitize_open(filename, open_mode):
291 """Try to open the given filename, and slightly tweak it if this fails.
293 Attempts to open the given filename. If this fails, it tries to change
294 the filename slightly, step by step, until it's either able to open it
295 or it fails and raises a final exception, like the standard open()
298 It returns the tuple (stream, definitive_file_name).
302 if sys.platform == 'win32':
304 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
305 return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
306 stream = open(encodeFilename(filename), open_mode)
307 return (stream, filename)
308 except (IOError, OSError) as err:
309 if err.errno in (errno.EACCES,):
312 # In case of error, try to remove win32 forbidden chars
313 alt_filename = os.path.join(
314 re.sub(u'[/<>:"\\|\\\\?\\*]', u'#', path_part)
315 for path_part in os.path.split(filename)
317 if alt_filename == filename:
320 # An exception here should be caught in the caller
321 stream = open(encodeFilename(filename), open_mode)
322 return (stream, alt_filename)
325 def timeconvert(timestr):
326 """Convert RFC 2822 defined time string into system timestamp"""
328 timetuple = email.utils.parsedate_tz(timestr)
329 if timetuple is not None:
330 timestamp = email.utils.mktime_tz(timetuple)
333 def sanitize_filename(s, restricted=False, is_id=False):
334 """Sanitizes a string so it could be used as part of a filename.
335 If restricted is set, use a stricter subset of allowed characters.
336 Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
338 def replace_insane(char):
339 if char == '?' or ord(char) < 32 or ord(char) == 127:
342 return '' if restricted else '\''
344 return '_-' if restricted else ' -'
345 elif char in '\\/|*<>':
347 if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
349 if restricted and ord(char) > 127:
353 result = u''.join(map(replace_insane, s))
355 while '__' in result:
356 result = result.replace('__', '_')
357 result = result.strip('_')
358 # Common case of "Foreign band name - English song title"
359 if restricted and result.startswith('-_'):
365 def orderedSet(iterable):
366 """ Remove all duplicates from the input iterable """
374 def _htmlentity_transform(entity):
375 """Transforms an HTML entity to a character."""
376 # Known non-numeric HTML entity
377 if entity in compat_html_entities.name2codepoint:
378 return compat_chr(compat_html_entities.name2codepoint[entity])
380 mobj = re.match(r'#(x?[0-9]+)', entity)
382 numstr = mobj.group(1)
383 if numstr.startswith(u'x'):
385 numstr = u'0%s' % numstr
388 return compat_chr(int(numstr, base))
390 # Unknown entity in name, return its literal representation
391 return (u'&%s;' % entity)
397 assert type(s) == compat_str
400 r'&([^;]+);', lambda m: _htmlentity_transform(m.group(1)), s)
403 def encodeFilename(s, for_subprocess=False):
405 @param s The name of the file
408 assert type(s) == compat_str
410 # Python 3 has a Unicode API
411 if sys.version_info >= (3, 0):
414 if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
415 # Pass u'' directly to use Unicode APIs on Windows 2000 and up
416 # (Detecting Windows NT 4 is tricky because 'major >= 4' would
417 # match Windows 9x series as well. Besides, NT 4 is obsolete.)
418 if not for_subprocess:
421 # For subprocess calls, encode with locale encoding
422 # Refer to http://stackoverflow.com/a/9951851/35070
423 encoding = preferredencoding()
425 encoding = sys.getfilesystemencoding()
428 return s.encode(encoding, 'ignore')
431 def encodeArgument(s):
432 if not isinstance(s, compat_str):
433 # Legacy code that uses byte strings
434 # Uncomment the following line after fixing all post processors
435 #assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
436 s = s.decode('ascii')
437 return encodeFilename(s, True)
440 def decodeOption(optval):
443 if isinstance(optval, bytes):
444 optval = optval.decode(preferredencoding())
446 assert isinstance(optval, compat_str)
449 def formatSeconds(secs):
451 return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
453 return '%d:%02d' % (secs // 60, secs % 60)
458 def make_HTTPS_handler(opts_no_check_certificate, **kwargs):
459 if sys.version_info < (3, 2):
462 class HTTPSConnectionV3(httplib.HTTPSConnection):
463 def __init__(self, *args, **kwargs):
464 httplib.HTTPSConnection.__init__(self, *args, **kwargs)
467 sock = socket.create_connection((self.host, self.port), self.timeout)
468 if getattr(self, '_tunnel_host', False):
472 self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_TLSv1)
474 self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_SSLv23)
476 class HTTPSHandlerV3(compat_urllib_request.HTTPSHandler):
477 def https_open(self, req):
478 return self.do_open(HTTPSConnectionV3, req)
479 return HTTPSHandlerV3(**kwargs)
480 elif hasattr(ssl, 'create_default_context'): # Python >= 3.4
481 context = ssl.create_default_context(ssl.Purpose.CLIENT_AUTH)
482 context.options &= ~ssl.OP_NO_SSLv3 # Allow older, not-as-secure SSLv3
483 if opts_no_check_certificate:
484 context.verify_mode = ssl.CERT_NONE
485 return compat_urllib_request.HTTPSHandler(context=context, **kwargs)
487 context = ssl.SSLContext(ssl.PROTOCOL_SSLv23)
488 context.verify_mode = (ssl.CERT_NONE
489 if opts_no_check_certificate
490 else ssl.CERT_REQUIRED)
491 context.set_default_verify_paths()
493 context.load_default_certs()
494 except AttributeError:
496 return compat_urllib_request.HTTPSHandler(context=context, **kwargs)
498 class ExtractorError(Exception):
499 """Error during info extraction."""
500 def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
501 """ tb, if given, is the original traceback (so that it can be printed out).
502 If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
505 if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
507 if video_id is not None:
508 msg = video_id + ': ' + msg
510 msg += u' (caused by %r)' % cause
512 msg = msg + u'; please report this issue on https://yt-dl.org/bug . Be sure to call youtube-dl with the --verbose flag and include its complete output. Make sure you are using the latest version; type youtube-dl -U to update.'
513 super(ExtractorError, self).__init__(msg)
516 self.exc_info = sys.exc_info() # preserve original exception
518 self.video_id = video_id
520 def format_traceback(self):
521 if self.traceback is None:
523 return u''.join(traceback.format_tb(self.traceback))
526 class RegexNotFoundError(ExtractorError):
527 """Error when a regex didn't match"""
531 class DownloadError(Exception):
532 """Download Error exception.
534 This exception may be thrown by FileDownloader objects if they are not
535 configured to continue on errors. They will contain the appropriate
538 def __init__(self, msg, exc_info=None):
539 """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
540 super(DownloadError, self).__init__(msg)
541 self.exc_info = exc_info
544 class SameFileError(Exception):
545 """Same File exception.
547 This exception will be thrown by FileDownloader objects if they detect
548 multiple files would have to be downloaded to the same file on disk.
553 class PostProcessingError(Exception):
554 """Post Processing exception.
556 This exception may be raised by PostProcessor's .run() method to
557 indicate an error in the postprocessing task.
559 def __init__(self, msg):
562 class MaxDownloadsReached(Exception):
563 """ --max-downloads limit has been reached. """
567 class UnavailableVideoError(Exception):
568 """Unavailable Format exception.
570 This exception will be thrown when a video is requested
571 in a format that is not available for that video.
576 class ContentTooShortError(Exception):
577 """Content Too Short exception.
579 This exception may be raised by FileDownloader objects when a file they
580 download is too small for what the server announced first, indicating
581 the connection was probably interrupted.
587 def __init__(self, downloaded, expected):
588 self.downloaded = downloaded
589 self.expected = expected
591 class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
592 """Handler for HTTP requests and responses.
594 This class, when installed with an OpenerDirector, automatically adds
595 the standard headers to every HTTP request and handles gzipped and
596 deflated responses from web servers. If compression is to be avoided in
597 a particular request, the original request in the program code only has
598 to include the HTTP header "Youtubedl-No-Compression", which will be
599 removed before making the real request.
601 Part of this code was copied from:
603 http://techknack.net/python-urllib2-handlers/
605 Andrew Rowls, the author of that code, agreed to release it to the
612 return zlib.decompress(data, -zlib.MAX_WBITS)
614 return zlib.decompress(data)
617 def addinfourl_wrapper(stream, headers, url, code):
618 if hasattr(compat_urllib_request.addinfourl, 'getcode'):
619 return compat_urllib_request.addinfourl(stream, headers, url, code)
620 ret = compat_urllib_request.addinfourl(stream, headers, url)
624 def http_request(self, req):
625 for h, v in std_headers.items():
626 if h not in req.headers:
628 if 'Youtubedl-no-compression' in req.headers:
629 if 'Accept-encoding' in req.headers:
630 del req.headers['Accept-encoding']
631 del req.headers['Youtubedl-no-compression']
632 if 'Youtubedl-user-agent' in req.headers:
633 if 'User-agent' in req.headers:
634 del req.headers['User-agent']
635 req.headers['User-agent'] = req.headers['Youtubedl-user-agent']
636 del req.headers['Youtubedl-user-agent']
638 if sys.version_info < (2, 7) and '#' in req.get_full_url():
639 # Python 2.6 is brain-dead when it comes to fragments
640 req._Request__original = req._Request__original.partition('#')[0]
641 req._Request__r_type = req._Request__r_type.partition('#')[0]
645 def http_response(self, req, resp):
648 if resp.headers.get('Content-encoding', '') == 'gzip':
649 content = resp.read()
650 gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
652 uncompressed = io.BytesIO(gz.read())
653 except IOError as original_ioerror:
654 # There may be junk add the end of the file
655 # See http://stackoverflow.com/q/4928560/35070 for details
656 for i in range(1, 1024):
658 gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
659 uncompressed = io.BytesIO(gz.read())
664 raise original_ioerror
665 resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
666 resp.msg = old_resp.msg
668 if resp.headers.get('Content-encoding', '') == 'deflate':
669 gz = io.BytesIO(self.deflate(resp.read()))
670 resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
671 resp.msg = old_resp.msg
674 https_request = http_request
675 https_response = http_response
678 def parse_iso8601(date_str, delimiter='T'):
679 """ Return a UNIX timestamp from the given date """
685 r'(\.[0-9]+)?(?:Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
688 timezone = datetime.timedelta()
690 date_str = date_str[:-len(m.group(0))]
691 if not m.group('sign'):
692 timezone = datetime.timedelta()
694 sign = 1 if m.group('sign') == '+' else -1
695 timezone = datetime.timedelta(
696 hours=sign * int(m.group('hours')),
697 minutes=sign * int(m.group('minutes')))
698 date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
699 dt = datetime.datetime.strptime(date_str, date_format) - timezone
700 return calendar.timegm(dt.timetuple())
703 def unified_strdate(date_str):
704 """Return a string with the date in the format YYYYMMDD"""
711 date_str = date_str.replace(',', ' ')
712 # %z (UTC offset) is only supported in python>=3.2
713 date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str)
714 format_expressions = [
719 '%b %dst %Y %I:%M%p',
720 '%b %dnd %Y %I:%M%p',
721 '%b %dth %Y %I:%M%p',
730 '%Y-%m-%d %H:%M:%S.%f',
733 '%Y-%m-%dT%H:%M:%SZ',
734 '%Y-%m-%dT%H:%M:%S.%fZ',
735 '%Y-%m-%dT%H:%M:%S.%f0Z',
737 '%Y-%m-%dT%H:%M:%S.%f',
740 for expression in format_expressions:
742 upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
745 if upload_date is None:
746 timetuple = email.utils.parsedate_tz(date_str)
748 upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
751 def determine_ext(url, default_ext=u'unknown_video'):
754 guess = url.partition(u'?')[0].rpartition(u'.')[2]
755 if re.match(r'^[A-Za-z0-9]+$', guess):
def subtitles_filename(filename, sub_lang, sub_format):
    """Build the subtitle file name: base name + language code + subtitle format."""
    base = filename.rsplit('.', 1)[0]
    return u'.'.join((base, sub_lang, sub_format))
763 def date_from_str(date_str):
765 Return a datetime object from a string in the format YYYYMMDD or
766 (now|today)[+-][0-9](day|week|month|year)(s)?"""
767 today = datetime.date.today()
768 if date_str == 'now'or date_str == 'today':
770 match = re.match('(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
771 if match is not None:
772 sign = match.group('sign')
773 time = int(match.group('time'))
776 unit = match.group('unit')
785 delta = datetime.timedelta(**{unit: time})
787 return datetime.datetime.strptime(date_str, "%Y%m%d").date()
789 def hyphenate_date(date_str):
791 Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
792 match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
793 if match is not None:
794 return '-'.join(match.groups())
798 class DateRange(object):
799 """Represents a time interval between two dates"""
800 def __init__(self, start=None, end=None):
801 """start and end must be strings in the format accepted by date"""
802 if start is not None:
803 self.start = date_from_str(start)
805 self.start = datetime.datetime.min.date()
807 self.end = date_from_str(end)
809 self.end = datetime.datetime.max.date()
810 if self.start > self.end:
811 raise ValueError('Date range: "%s" , the start date must be before the end date' % self)
814 """Returns a range that only contains the given day"""
816 def __contains__(self, date):
817 """Check if the date is in the range"""
818 if not isinstance(date, datetime.date):
819 date = date_from_str(date)
820 return self.start <= date <= self.end
822 return '%s - %s' % ( self.start.isoformat(), self.end.isoformat())
826 """ Returns the platform name as a compat_str """
827 res = platform.platform()
828 if isinstance(res, bytes):
829 res = res.decode(preferredencoding())
831 assert isinstance(res, compat_str)
835 def _windows_write_string(s, out):
836 """ Returns True if the string was written using special methods,
837 False if it has yet to be written out."""
838 # Adapted from http://stackoverflow.com/a/3259271/35070
841 import ctypes.wintypes
849 fileno = out.fileno()
850 except AttributeError:
851 # If the output stream doesn't have a fileno, it's virtual
853 if fileno not in WIN_OUTPUT_IDS:
856 GetStdHandle = ctypes.WINFUNCTYPE(
857 ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
858 ("GetStdHandle", ctypes.windll.kernel32))
859 h = GetStdHandle(WIN_OUTPUT_IDS[fileno])
861 WriteConsoleW = ctypes.WINFUNCTYPE(
862 ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
863 ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
864 ctypes.wintypes.LPVOID)(("WriteConsoleW", ctypes.windll.kernel32))
865 written = ctypes.wintypes.DWORD(0)
867 GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)(("GetFileType", ctypes.windll.kernel32))
868 FILE_TYPE_CHAR = 0x0002
869 FILE_TYPE_REMOTE = 0x8000
870 GetConsoleMode = ctypes.WINFUNCTYPE(
871 ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
872 ctypes.POINTER(ctypes.wintypes.DWORD))(
873 ("GetConsoleMode", ctypes.windll.kernel32))
874 INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value
876 def not_a_console(handle):
877 if handle == INVALID_HANDLE_VALUE or handle is None:
879 return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR
880 or GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)
885 def next_nonbmp_pos(s):
887 return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
888 except StopIteration:
892 count = min(next_nonbmp_pos(s), 1024)
895 h, s, count if count else 2, ctypes.byref(written), None)
897 raise OSError('Failed to write string')
898 if not count: # We just wrote a non-BMP character
899 assert written.value == 2
902 assert written.value > 0
903 s = s[written.value:]
907 def write_string(s, out=None, encoding=None):
910 assert type(s) == compat_str
912 if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
913 if _windows_write_string(s, out):
916 if ('b' in getattr(out, 'mode', '') or
917 sys.version_info[0] < 3): # Python 2 lies about mode of sys.stderr
918 byt = s.encode(encoding or preferredencoding(), 'ignore')
920 elif hasattr(out, 'buffer'):
921 enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
922 byt = s.encode(enc, 'ignore')
923 out.buffer.write(byt)
929 def bytes_to_intlist(bs):
932 if isinstance(bs[0], int): # Python 3
935 return [ord(c) for c in bs]
938 def intlist_to_bytes(xs):
941 if isinstance(chr(0), bytes): # Python 2
942 return ''.join([chr(x) for x in xs])
947 # Cross-platform file locking
948 if sys.platform == 'win32':
949 import ctypes.wintypes
952 class OVERLAPPED(ctypes.Structure):
954 ('Internal', ctypes.wintypes.LPVOID),
955 ('InternalHigh', ctypes.wintypes.LPVOID),
956 ('Offset', ctypes.wintypes.DWORD),
957 ('OffsetHigh', ctypes.wintypes.DWORD),
958 ('hEvent', ctypes.wintypes.HANDLE),
961 kernel32 = ctypes.windll.kernel32
962 LockFileEx = kernel32.LockFileEx
963 LockFileEx.argtypes = [
964 ctypes.wintypes.HANDLE, # hFile
965 ctypes.wintypes.DWORD, # dwFlags
966 ctypes.wintypes.DWORD, # dwReserved
967 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
968 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
969 ctypes.POINTER(OVERLAPPED) # Overlapped
971 LockFileEx.restype = ctypes.wintypes.BOOL
972 UnlockFileEx = kernel32.UnlockFileEx
973 UnlockFileEx.argtypes = [
974 ctypes.wintypes.HANDLE, # hFile
975 ctypes.wintypes.DWORD, # dwReserved
976 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
977 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
978 ctypes.POINTER(OVERLAPPED) # Overlapped
980 UnlockFileEx.restype = ctypes.wintypes.BOOL
981 whole_low = 0xffffffff
982 whole_high = 0x7fffffff
984 def _lock_file(f, exclusive):
985 overlapped = OVERLAPPED()
986 overlapped.Offset = 0
987 overlapped.OffsetHigh = 0
988 overlapped.hEvent = 0
989 f._lock_file_overlapped_p = ctypes.pointer(overlapped)
990 handle = msvcrt.get_osfhandle(f.fileno())
991 if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
992 whole_low, whole_high, f._lock_file_overlapped_p):
993 raise OSError('Locking file failed: %r' % ctypes.FormatError())
996 assert f._lock_file_overlapped_p
997 handle = msvcrt.get_osfhandle(f.fileno())
998 if not UnlockFileEx(handle, 0,
999 whole_low, whole_high, f._lock_file_overlapped_p):
1000 raise OSError('Unlocking file failed: %r' % ctypes.FormatError())
1005 def _lock_file(f, exclusive):
1006 fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)
1008 def _unlock_file(f):
1009 fcntl.flock(f, fcntl.LOCK_UN)
1012 class locked_file(object):
1013 def __init__(self, filename, mode, encoding=None):
1014 assert mode in ['r', 'a', 'w']
1015 self.f = io.open(filename, mode, encoding=encoding)
1018 def __enter__(self):
1019 exclusive = self.mode != 'r'
1021 _lock_file(self.f, exclusive)
1027 def __exit__(self, etype, value, traceback):
1029 _unlock_file(self.f)
1036 def write(self, *args):
1037 return self.f.write(*args)
1039 def read(self, *args):
1040 return self.f.read(*args)
def get_filesystem_encoding():
    """Return the filesystem encoding, falling back to UTF-8 when unknown."""
    enc = sys.getfilesystemencoding()
    if enc is None:
        # Some platforms/interpreters report no encoding at all.
        return 'utf-8'
    return enc
1048 def shell_quote(args):
1050 encoding = get_filesystem_encoding()
1052 if isinstance(a, bytes):
1053 # We may get a filename encoded with 'encodeFilename'
1054 a = a.decode(encoding)
1055 quoted_args.append(pipes.quote(a))
1056 return u' '.join(quoted_args)
1059 def takewhile_inclusive(pred, seq):
1060 """ Like itertools.takewhile, but include the latest evaluated element
1061 (the first element so that Not pred(e)) """
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use. """
    # Serialize the payload and stash it in the fragment, which servers ignore.
    payload = json.dumps(data)
    smuggled = compat_urllib_parse.urlencode({u'__youtubedl_smuggle': payload})
    return url + u'#' + smuggled
1076 def unsmuggle_url(smug_url, default=None):
1077 if not '#__youtubedl_smuggle' in smug_url:
1078 return smug_url, default
1079 url, _, sdata = smug_url.rpartition(u'#')
1080 jsond = compat_parse_qs(sdata)[u'__youtubedl_smuggle'][0]
1081 data = json.loads(jsond)
1085 def format_bytes(bytes):
1088 if type(bytes) is str:
1089 bytes = float(bytes)
1093 exponent = int(math.log(bytes, 1024.0))
1094 suffix = [u'B', u'KiB', u'MiB', u'GiB', u'TiB', u'PiB', u'EiB', u'ZiB', u'YiB'][exponent]
1095 converted = float(bytes) / float(1024 ** exponent)
1096 return u'%.2f%s' % (converted, suffix)
1099 def get_term_width():
1100 columns = compat_getenv('COLUMNS', None)
1105 sp = subprocess.Popen(
1107 stdout=subprocess.PIPE, stderr=subprocess.PIPE)
1108 out, err = sp.communicate()
1109 return int(out.split()[1])
1115 def month_by_name(name):
1116 """ Return the number of a month by (locale-independently) English name """
1119 u'January', u'February', u'March', u'April', u'May', u'June',
1120 u'July', u'August', u'September', u'October', u'November', u'December']
1122 return ENGLISH_NAMES.index(name) + 1
1127 def fix_xml_ampersands(xml_str):
1128 """Replace all the '&' by '&' in XML"""
1130 r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
1135 def setproctitle(title):
1136 assert isinstance(title, compat_str)
1138 libc = ctypes.cdll.LoadLibrary("libc.so.6")
1141 title_bytes = title.encode('utf-8')
1142 buf = ctypes.create_string_buffer(len(title_bytes))
1143 buf.value = title_bytes
1145 libc.prctl(15, buf, 0, 0, 0)
1146 except AttributeError:
1147 return # Strange libc, just skip this
1150 def remove_start(s, start):
1151 if s.startswith(start):
1152 return s[len(start):]
1156 def remove_end(s, end):
1158 return s[:-len(end)]
def url_basename(url):
    """Return the last path segment of *url*, ignoring query and fragment."""
    url_path = compat_urlparse.urlparse(url).path
    segments = url_path.strip(u'/').split(u'/')
    return segments[-1]
1167 class HEADRequest(compat_urllib_request.Request):
1168 def get_method(self):
1172 def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
1175 v = getattr(v, get_attr, None)
1178 return default if v is None else (int(v) * invscale // scale)
def str_or_none(v, default=None):
    """Convert *v* to a string, or return *default* when *v* is None."""
    if v is None:
        return default
    return compat_str(v)
1185 def str_to_int(int_str):
1186 """ A more relaxed version of int_or_none """
1189 int_str = re.sub(r'[,\.\+]', u'', int_str)
def float_or_none(v, scale=1, invscale=1, default=None):
    """Convert *v* to a float scaled by invscale/scale; *default* when v is None."""
    if v is None:
        return default
    return float(v) * invscale / scale
1197 def parse_duration(s):
1204 r'(?i)(?:(?:(?P<hours>[0-9]+)\s*(?:[:h]|hours?)\s*)?(?P<mins>[0-9]+)\s*(?:[:m]|mins?|minutes?)\s*)?(?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*(?:s|secs?|seconds?)?$', s)
1207 res = int(m.group('secs'))
1209 res += int(m.group('mins')) * 60
1210 if m.group('hours'):
1211 res += int(m.group('hours')) * 60 * 60
1213 res += float(m.group('ms'))
1217 def prepend_extension(filename, ext):
1218 name, real_ext = os.path.splitext(filename)
1219 return u'{0}.{1}{2}'.format(name, ext, real_ext)
1222 def check_executable(exe, args=[]):
1223 """ Checks if the given binary is installed somewhere in PATH, and returns its name.
1224 args can be a list of arguments for a short output (like -version) """
1226 subprocess.Popen([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
1232 def get_exe_version(exe, args=['--version'],
1233 version_re=r'version\s+([0-9._-a-zA-Z]+)',
1234 unrecognized=u'present'):
1235 """ Returns the version of the specified executable,
1236 or False if the executable is not present """
1238 out, err = subprocess.Popen(
1240 stdout=subprocess.PIPE, stderr=subprocess.STDOUT).communicate()
1243 firstline = out.partition(b'\n')[0].decode('ascii', 'ignore')
1244 m = re.search(version_re, firstline)
1251 class PagedList(object):
1253 # This is only useful for tests
1254 return len(self.getslice())
1257 class OnDemandPagedList(PagedList):
1258 def __init__(self, pagefunc, pagesize):
1259 self._pagefunc = pagefunc
1260 self._pagesize = pagesize
1262 def getslice(self, start=0, end=None):
1264 for pagenum in itertools.count(start // self._pagesize):
1265 firstid = pagenum * self._pagesize
1266 nextfirstid = pagenum * self._pagesize + self._pagesize
1267 if start >= nextfirstid:
1270 page_results = list(self._pagefunc(pagenum))
1273 start % self._pagesize
1274 if firstid <= start < nextfirstid
1278 ((end - 1) % self._pagesize) + 1
1279 if (end is not None and firstid <= end <= nextfirstid)
1282 if startv != 0 or endv is not None:
1283 page_results = page_results[startv:endv]
1284 res.extend(page_results)
1286 # A little optimization - if current page is not "full", ie. does
1287 # not contain page_size videos then we can assume that this page
1288 # is the last one - there are no more ids on further pages -
1289 # i.e. no need to query again.
1290 if len(page_results) + startv < self._pagesize:
1293 # If we got the whole page, but the next page is not interesting,
1294 # break out early as well
1295 if end == nextfirstid:
1300 class InAdvancePagedList(PagedList):
1301 def __init__(self, pagefunc, pagecount, pagesize):
1302 self._pagefunc = pagefunc
1303 self._pagecount = pagecount
1304 self._pagesize = pagesize
1306 def getslice(self, start=0, end=None):
1308 start_page = start // self._pagesize
1310 self._pagecount if end is None else (end // self._pagesize + 1))
1311 skip_elems = start - start_page * self._pagesize
1312 only_more = None if end is None else end - start
1313 for pagenum in range(start_page, end_page):
1314 page = list(self._pagefunc(pagenum))
1316 page = page[skip_elems:]
1318 if only_more is not None:
1319 if len(page) < only_more:
1320 only_more -= len(page)
1322 page = page[:only_more]
1329 def uppercase_escape(s):
1330 unicode_escape = codecs.getdecoder('unicode_escape')
1332 r'\\U[0-9a-fA-F]{8}',
1333 lambda m: unicode_escape(m.group(0))[0],
1337 def escape_rfc3986(s):
1338 """Escape non-ASCII characters as suggested by RFC 3986"""
1339 if sys.version_info < (3, 0) and isinstance(s, unicode):
1340 s = s.encode('utf-8')
1341 return compat_urllib_parse.quote(s, "%/;:@&=+$,!~*'()?#[]")
1344 def escape_url(url):
1345 """Escape URL as suggested by RFC 3986"""
1346 url_parsed = compat_urllib_parse_urlparse(url)
1347 return url_parsed._replace(
1348 path=escape_rfc3986(url_parsed.path),
1349 params=escape_rfc3986(url_parsed.params),
1350 query=escape_rfc3986(url_parsed.query),
1351 fragment=escape_rfc3986(url_parsed.fragment)
1355 struct.pack(u'!I', 0)
1357 # In Python 2.6 (and some 2.7 versions), struct requires a bytes argument
1358 def struct_pack(spec, *args):
1359 if isinstance(spec, compat_str):
1360 spec = spec.encode('ascii')
1361 return struct.pack(spec, *args)
1363 def struct_unpack(spec, *args):
1364 if isinstance(spec, compat_str):
1365 spec = spec.encode('ascii')
1366 return struct.unpack(spec, *args)
1368 struct_pack = struct.pack
1369 struct_unpack = struct.unpack
1372 def read_batch_urls(batch_fd):
1374 if not isinstance(url, compat_str):
1375 url = url.decode('utf-8', 'replace')
1376 BOM_UTF8 = u'\xef\xbb\xbf'
1377 if url.startswith(BOM_UTF8):
1378 url = url[len(BOM_UTF8):]
1380 if url.startswith(('#', ';', ']')):
1384 with contextlib.closing(batch_fd) as fd:
1385 return [url for url in map(fixup, fd) if url]
def urlencode_postdata(*args, **kargs):
    """URL-encode POST data and return it as ASCII bytes, ready for urlopen."""
    encoded = compat_urllib_parse.urlencode(*args, **kargs)
    return encoded.encode('ascii')
1393 etree_iter = xml.etree.ElementTree.Element.iter
1394 except AttributeError: # Python <=2.6
1395 etree_iter = lambda n: n.findall('.//*')
1399 class TreeBuilder(xml.etree.ElementTree.TreeBuilder):
1400 def doctype(self, name, pubid, system):
1401 pass # Ignore doctypes
1403 parser = xml.etree.ElementTree.XMLParser(target=TreeBuilder())
1404 kwargs = {'parser': parser} if sys.version_info >= (2, 7) else {}
1405 tree = xml.etree.ElementTree.XML(s.encode('utf-8'), **kwargs)
1406 # Fix up XML parser in Python 2.x
1407 if sys.version_info < (3, 0):
1408 for n in etree_iter(tree):
1409 if n.text is not None:
1410 if not isinstance(n.text, compat_str):
1411 n.text = n.text.decode('utf-8')
1424 def parse_age_limit(s):
1427 m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
1428 return int(m.group('age')) if m else US_RATINGS.get(s, None)
def strip_jsonp(code):
    """Strip a JSONP wrapper (callback name, parentheses, trailing ';')."""
    # If the pattern does not match, re.sub leaves the input unchanged.
    return re.sub(
        r'(?s)^[a-zA-Z0-9_]+\s*\(\s*(.*)\);?\s*?\s*$', r'\1', code)
1435 def js_to_json(code):
1438 if v in ('true', 'false', 'null'):
1440 if v.startswith('"'):
1442 if v.startswith("'"):
1444 v = re.sub(r"\\\\|\\'|\"", lambda m: {
1451 res = re.sub(r'''(?x)
1452 "(?:[^"\\]*(?:\\\\|\\")?)*"|
1453 '(?:[^'\\]*(?:\\\\|\\')?)*'|
1454 [a-zA-Z_][a-zA-Z_0-9]*
1456 res = re.sub(r',(\s*\])', lambda m: m.group(1), res)
1460 def qualities(quality_ids):
1461 """ Get a numeric quality value out of a list of possible values """
1464 return quality_ids.index(qid)
1470 DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
1473 def limit_length(s, length):
1474 """ Add ellipses to overly long strings """
1479 return s[:length - len(ELLIPSES)] + ELLIPSES
def version_tuple(v):
    """Split a dotted version string into a list of ints for ordering."""
    # NOTE: despite the name this returns a list; callers compare element-wise.
    return [int(part) for part in v.split('.')]
1487 def is_outdated_version(version, limit, assume_new=True):
1489 return not assume_new
1491 return version_tuple(version) < version_tuple(limit)
1493 return not assume_new