youtube_dl/utils.py

   1 #!/usr/bin/env python
   2 # -*- coding: utf-8 -*-
   3
   4 from __future__ import unicode_literals
   5
   6 import calendar
   7 import codecs
   8 import contextlib
   9 import ctypes
  10 import datetime
  11 import email.utils
  12 import errno
  13 import gzip
  14 import itertools
  15 import io
  16 import json
  17 import locale
  18 import math
  19 import os
  20 import pipes
  21 import platform
  22 import re
  23 import ssl
  24 import socket
  25 import struct
  26 import subprocess
  27 import sys
  28 import tempfile
  29 import traceback
  30 import xml.etree.ElementTree
  31 import zlib
  32
  33 from .compat import (
  34     compat_chr,
  35     compat_getenv,
  36     compat_html_entities,
  37     compat_html_parser,
  38     compat_parse_qs,
  39     compat_str,
  40     compat_urllib_error,
  41     compat_urllib_parse,
  42     compat_urllib_parse_urlparse,
  43     compat_urllib_request,
  44     compat_urlparse,
  45 )
  46
  47
# This is not clearly defined otherwise
# (the type of compiled regular expression objects, obtained here by
# compiling an empty pattern and asking for its type)
compiled_regex_type = type(re.compile(''))

# Default HTTP headers. YoutubeDLHandler.http_request (below in this file)
# adds these to outgoing requests.
std_headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0 (Chrome)',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
}
  58
  59 def preferredencoding():
  60     """Get preferred encoding.
  61
  62     Returns the best encoding scheme for the system, based on
  63     locale.getpreferredencoding() and some further tweaks.
  64     """
  65     try:
  66         pref = locale.getpreferredencoding()
  67         u'TEST'.encode(pref)
  68     except:
  69         pref = 'UTF-8'
  70
  71     return pref
  72
  73
  74 def write_json_file(obj, fn):
  75     """ Encode obj as JSON and write it to fn, atomically """
  76
  77     args = {
  78         'suffix': '.tmp',
  79         'prefix': os.path.basename(fn) + '.',
  80         'dir': os.path.dirname(fn),
  81         'delete': False,
  82     }
  83
  84     # In Python 2.x, json.dump expects a bytestream.
  85     # In Python 3.x, it writes to a character stream
  86     if sys.version_info < (3, 0):
  87         args['mode'] = 'wb'
  88     else:
  89         args.update({
  90             'mode': 'w',
  91             'encoding': 'utf-8',
  92         })
  93
  94     tf = tempfile.NamedTemporaryFile(**args)
  95
  96     try:
  97         with tf:
  98             json.dump(obj, tf)
  99         os.rename(tf.name, fn)
 100     except:
 101         try:
 102             os.remove(tf.name)
 103         except OSError:
 104             pass
 105         raise
 106
 107
 108 if sys.version_info >= (2, 7):
 109     def find_xpath_attr(node, xpath, key, val):
 110         """ Find the xpath xpath[@key=val] """
 111         assert re.match(r'^[a-zA-Z-]+$', key)
 112         assert re.match(r'^[a-zA-Z0-9@\s:._-]*$', val)
 113         expr = xpath + u"[@%s='%s']" % (key, val)
 114         return node.find(expr)
 115 else:
 116     def find_xpath_attr(node, xpath, key, val):
 117         # Here comes the crazy part: In 2.6, if the xpath is a unicode,
 118         # .//node does not match if a node is a direct child of . !
 119         if isinstance(xpath, unicode):
 120             xpath = xpath.encode('ascii')
 121
 122         for f in node.findall(xpath):
 123             if f.attrib.get(key) == val:
 124                 return f
 125         return None
 126
 127 # On python2.6 the xml.etree.ElementTree.Element methods don't support
 128 # the namespace parameter
 129 def xpath_with_ns(path, ns_map):
 130     components = [c.split(':') for c in path.split('/')]
 131     replaced = []
 132     for c in components:
 133         if len(c) == 1:
 134             replaced.append(c[0])
 135         else:
 136             ns, tag = c
 137             replaced.append('{%s}%s' % (ns_map[ns], tag))
 138     return '/'.join(replaced)
 139
 140
 141 def xpath_text(node, xpath, name=None, fatal=False):
 142     if sys.version_info < (2, 7):  # Crazy 2.6
 143         xpath = xpath.encode('ascii')
 144
 145     n = node.find(xpath)
 146     if n is None:
 147         if fatal:
 148             name = xpath if name is None else name
 149             raise ExtractorError('Could not find XML element %s' % name)
 150         else:
 151             return None
 152     return n.text
 153
 154
# On Python older than 2.7, replace the HTML parser module's
# locatestarttagend pattern (per the trailing note, a backported bugfix).
if sys.version_info < (2, 7):
    compat_html_parser.locatestarttagend = re.compile(r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re.VERBOSE) # backport bugfix
 157
 158 class BaseHTMLParser(compat_html_parser.HTMLParser):
 159     def __init(self):
 160         compat_html_parser.HTMLParser.__init__(self)
 161         self.html = None
 162
 163     def loads(self, html):
 164         self.html = html
 165         self.feed(html)
 166         self.close()
 167
class AttrParser(BaseHTMLParser):
    """Modified HTMLParser that isolates a tag with the specified attribute

    While parsing, self.result is built up as
    [tag, position after the opening tag, position of the closing tag],
    where both positions are values returned by getpos().
    get_result() cuts the text between those positions out of the document.
    """
    def __init__(self, attribute, value):
        self.attribute = attribute
        self.value = value
        self.result = None
        # True while the parser is inside the matched element
        self.started = False
        # Per-tag-name count of currently open tags since the match started
        self.depth = {}
        # True until the position right after the matched opening tag is recorded
        self.watch_startpos = False
        self.error_count = 0
        BaseHTMLParser.__init__(self)

    def error(self, message):
        """Recover from a parse error by skipping a line, within limits.

        Raises HTMLParseError once more than 10 errors were skipped or when
        the error happens inside the matched element.
        """
        if self.error_count > 10 or self.started:
            raise compat_html_parser.HTMLParseError(message, self.getpos())
        # Continue with everything after the current line
        self.rawdata = '\n'.join(self.html.split('\n')[self.getpos()[0]:]) # skip one line
        self.error_count += 1
        self.goahead(1)

    def handle_starttag(self, tag, attrs):
        """Start recording at the first tag carrying attribute=value and track nesting."""
        attrs = dict(attrs)
        if self.started:
            # A child tag directly follows the matched opening tag
            self.find_startpos(None)
        if self.attribute in attrs and attrs[self.attribute] == self.value:
            self.result = [tag]
            self.started = True
            self.watch_startpos = True
        if self.started:
            if not tag in self.depth: self.depth[tag] = 0
            self.depth[tag] += 1

    def handle_endtag(self, tag):
        """Track nesting and record the end position when the matched tag closes."""
        if self.started:
            if tag in self.depth: self.depth[tag] -= 1
            # All tags with the matched tag's name are closed again
            if self.depth[self.result[0]] == 0:
                self.started = False
                self.result.append(self.getpos())

    def find_startpos(self, x):
        """Needed to put the start position of the result (self.result[1])
        after the opening tag with the requested id"""
        if self.watch_startpos:
            self.watch_startpos = False
            self.result.append(self.getpos())
    # Whatever comes first after the matched opening tag marks the start position
    handle_entityref = handle_charref = handle_data = handle_comment = \
    handle_decl = handle_pi = unknown_decl = find_startpos

    def get_result(self):
        """Return the stripped text between the recorded positions, or None.

        None is returned when no tag matched or when the start or end
        position was not recorded (self.result does not have 3 entries).
        """
        if self.result is None:
            return None
        if len(self.result) != 3:
            return None
        lines = self.html.split('\n')
        # Positions hold a 1-based line number and a column offset
        lines = lines[self.result[1][0]-1:self.result[2][0]]
        lines[0] = lines[0][self.result[1][1]:]
        if len(lines) == 1:
            # Start and end on the same line: the end column has to be
            # shifted by what was just cut off the front
            lines[-1] = lines[-1][:self.result[2][1]-self.result[1][1]]
        lines[-1] = lines[-1][:self.result[2][1]]
        return '\n'.join(lines).strip()
# Hack for https://github.com/rg3/youtube-dl/issues/662
# On Python older than 2.7.3, parse_endtag is overridden so that the literal
# text </scr'+'ipt> is stepped over (the index right after it is returned)
# instead of being handed to the stock HTMLParser.parse_endtag.
if sys.version_info < (2, 7, 3):
    AttrParser.parse_endtag = (lambda self, i:
        i + len("</scr'+'ipt>")
        if self.rawdata[i:].startswith("</scr'+'ipt>")
        else compat_html_parser.HTMLParser.parse_endtag(self, i))
 233
 234
 235 def get_element_by_id(id, html):
 236     """Return the content of the tag with the specified ID in the passed HTML document"""
 237     return get_element_by_attribute("id", id, html)
 238
 239
 240 def get_element_by_attribute(attribute, value, html):
 241     """Return the content of the tag with the specified attribute in the passed HTML document"""
 242     parser = AttrParser(attribute, value)
 243     try:
 244         parser.loads(html)
 245     except compat_html_parser.HTMLParseError:
 246         pass
 247     return parser.get_result()
 248
 249 class MetaParser(BaseHTMLParser):
 250     """
 251     Modified HTMLParser that isolates a meta tag with the specified name
 252     attribute.
 253     """
 254     def __init__(self, name):
 255         BaseHTMLParser.__init__(self)
 256         self.name = name
 257         self.content = None
 258         self.result = None
 259
 260     def handle_starttag(self, tag, attrs):
 261         if tag != 'meta':
 262             return
 263         attrs = dict(attrs)
 264         if attrs.get('name') == self.name:
 265             self.result = attrs.get('content')
 266
 267     def get_result(self):
 268         return self.result
 269
 270
 271
 272 def clean_html(html):
 273     """Clean an HTML snippet into a readable string"""
 274     # Newline vs <br />
 275     html = html.replace('\n', ' ')
 276     html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
 277     html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
 278     # Strip html tags
 279     html = re.sub('<.*?>', '', html)
 280     # Replace html entities
 281     html = unescapeHTML(html)
 282     return html.strip()
 283
 284
 285 def sanitize_open(filename, open_mode):
 286     """Try to open the given filename, and slightly tweak it if this fails.
 287
 288     Attempts to open the given filename. If this fails, it tries to change
 289     the filename slightly, step by step, until it's either able to open it
 290     or it fails and raises a final exception, like the standard open()
 291     function.
 292
 293     It returns the tuple (stream, definitive_file_name).
 294     """
 295     try:
 296         if filename == u'-':
 297             if sys.platform == 'win32':
 298                 import msvcrt
 299                 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
 300             return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
 301         stream = open(encodeFilename(filename), open_mode)
 302         return (stream, filename)
 303     except (IOError, OSError) as err:
 304         if err.errno in (errno.EACCES,):
 305             raise
 306
 307         # In case of error, try to remove win32 forbidden chars
 308         alt_filename = os.path.join(
 309                         re.sub(u'[/<>:"\\|\\\\?\\*]', u'#', path_part)
 310                         for path_part in os.path.split(filename)
 311                        )
 312         if alt_filename == filename:
 313             raise
 314         else:
 315             # An exception here should be caught in the caller
 316             stream = open(encodeFilename(filename), open_mode)
 317             return (stream, alt_filename)
 318
 319
 320 def timeconvert(timestr):
 321     """Convert RFC 2822 defined time string into system timestamp"""
 322     timestamp = None
 323     timetuple = email.utils.parsedate_tz(timestr)
 324     if timetuple is not None:
 325         timestamp = email.utils.mktime_tz(timetuple)
 326     return timestamp
 327
 328 def sanitize_filename(s, restricted=False, is_id=False):
 329     """Sanitizes a string so it could be used as part of a filename.
 330     If restricted is set, use a stricter subset of allowed characters.
 331     Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
 332     """
 333     def replace_insane(char):
 334         if char == '?' or ord(char) < 32 or ord(char) == 127:
 335             return ''
 336         elif char == '"':
 337             return '' if restricted else '\''
 338         elif char == ':':
 339             return '_-' if restricted else ' -'
 340         elif char in '\\/|*<>':
 341             return '_'
 342         if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
 343             return '_'
 344         if restricted and ord(char) > 127:
 345             return '_'
 346         return char
 347
 348     result = u''.join(map(replace_insane, s))
 349     if not is_id:
 350         while '__' in result:
 351             result = result.replace('__', '_')
 352         result = result.strip('_')
 353         # Common case of "Foreign band name - English song title"
 354         if restricted and result.startswith('-_'):
 355             result = result[2:]
 356         if not result:
 357             result = '_'
 358     return result
 359
 360 def orderedSet(iterable):
 361     """ Remove all duplicates from the input iterable """
 362     res = []
 363     for el in iterable:
 364         if el not in res:
 365             res.append(el)
 366     return res
 367
 368
 369 def _htmlentity_transform(entity):
 370     """Transforms an HTML entity to a character."""
 371     # Known non-numeric HTML entity
 372     if entity in compat_html_entities.name2codepoint:
 373         return compat_chr(compat_html_entities.name2codepoint[entity])
 374
 375     mobj = re.match(r'#(x?[0-9]+)', entity)
 376     if mobj is not None:
 377         numstr = mobj.group(1)
 378         if numstr.startswith(u'x'):
 379             base = 16
 380             numstr = u'0%s' % numstr
 381         else:
 382             base = 10
 383         return compat_chr(int(numstr, base))
 384
 385     # Unknown entity in name, return its literal representation
 386     return (u'&%s;' % entity)
 387
 388
 389 def unescapeHTML(s):
 390     if s is None:
 391         return None
 392     assert type(s) == compat_str
 393
 394     return re.sub(
 395         r'&([^;]+);', lambda m: _htmlentity_transform(m.group(1)), s)
 396
 397
 398 def encodeFilename(s, for_subprocess=False):
 399     """
 400     @param s The name of the file
 401     """
 402
 403     assert type(s) == compat_str
 404
 405     # Python 3 has a Unicode API
 406     if sys.version_info >= (3, 0):
 407         return s
 408
 409     if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
 410         # Pass u'' directly to use Unicode APIs on Windows 2000 and up
 411         # (Detecting Windows NT 4 is tricky because 'major >= 4' would
 412         # match Windows 9x series as well. Besides, NT 4 is obsolete.)
 413         if not for_subprocess:
 414             return s
 415         else:
 416             # For subprocess calls, encode with locale encoding
 417             # Refer to http://stackoverflow.com/a/9951851/35070
 418             encoding = preferredencoding()
 419     else:
 420         encoding = sys.getfilesystemencoding()
 421     if encoding is None:
 422         encoding = 'utf-8'
 423     return s.encode(encoding, 'ignore')
 424
 425
 426 def encodeArgument(s):
 427     if not isinstance(s, compat_str):
 428         # Legacy code that uses byte strings
 429         # Uncomment the following line after fixing all post processors
 430         #assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
 431         s = s.decode('ascii')
 432     return encodeFilename(s, True)
 433
 434
 435 def decodeOption(optval):
 436     if optval is None:
 437         return optval
 438     if isinstance(optval, bytes):
 439         optval = optval.decode(preferredencoding())
 440
 441     assert isinstance(optval, compat_str)
 442     return optval
 443
 444 def formatSeconds(secs):
 445     if secs > 3600:
 446         return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
 447     elif secs > 60:
 448         return '%d:%02d' % (secs // 60, secs % 60)
 449     else:
 450         return '%d' % secs
 451
 452
 453 def make_HTTPS_handler(opts_no_check_certificate, **kwargs):
 454     if sys.version_info < (3, 2):
 455         import httplib
 456
 457         class HTTPSConnectionV3(httplib.HTTPSConnection):
 458             def __init__(self, *args, **kwargs):
 459                 httplib.HTTPSConnection.__init__(self, *args, **kwargs)
 460
 461             def connect(self):
 462                 sock = socket.create_connection((self.host, self.port), self.timeout)
 463                 if getattr(self, '_tunnel_host', False):
 464                     self.sock = sock
 465                     self._tunnel()
 466                 try:
 467                     self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_TLSv1)
 468                 except ssl.SSLError:
 469                     self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_SSLv23)
 470
 471         class HTTPSHandlerV3(compat_urllib_request.HTTPSHandler):
 472             def https_open(self, req):
 473                 return self.do_open(HTTPSConnectionV3, req)
 474         return HTTPSHandlerV3(**kwargs)
 475     elif hasattr(ssl, 'create_default_context'):  # Python >= 3.4
 476         context = ssl.create_default_context(ssl.Purpose.CLIENT_AUTH)
 477         context.options &= ~ssl.OP_NO_SSLv3  # Allow older, not-as-secure SSLv3
 478         if opts_no_check_certificate:
 479             context.verify_mode = ssl.CERT_NONE
 480         return compat_urllib_request.HTTPSHandler(context=context, **kwargs)
 481     else:  # Python < 3.4
 482         context = ssl.SSLContext(ssl.PROTOCOL_SSLv23)
 483         context.verify_mode = (ssl.CERT_NONE
 484                                if opts_no_check_certificate
 485                                else ssl.CERT_REQUIRED)
 486         context.set_default_verify_paths()
 487         try:
 488             context.load_default_certs()
 489         except AttributeError:
 490             pass  # Python < 3.4
 491         return compat_urllib_request.HTTPSHandler(context=context, **kwargs)
 492
 493 class ExtractorError(Exception):
 494     """Error during info extraction."""
 495     def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
 496         """ tb, if given, is the original traceback (so that it can be printed out).
 497         If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
 498         """
 499
 500         if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
 501             expected = True
 502         if video_id is not None:
 503             msg = video_id + ': ' + msg
 504         if cause:
 505             msg += u' (caused by %r)' % cause
 506         if not expected:
 507             msg = msg + u'; please report this issue on https://yt-dl.org/bug . Be sure to call youtube-dl with the --verbose flag and include its complete output. Make sure you are using the latest version; type  youtube-dl -U  to update.'
 508         super(ExtractorError, self).__init__(msg)
 509
 510         self.traceback = tb
 511         self.exc_info = sys.exc_info()  # preserve original exception
 512         self.cause = cause
 513         self.video_id = video_id
 514
 515     def format_traceback(self):
 516         if self.traceback is None:
 517             return None
 518         return u''.join(traceback.format_tb(self.traceback))
 519
 520
 521 class RegexNotFoundError(ExtractorError):
 522     """Error when a regex didn't match"""
 523     pass
 524
 525
 526 class DownloadError(Exception):
 527     """Download Error exception.
 528
 529     This exception may be thrown by FileDownloader objects if they are not
 530     configured to continue on errors. They will contain the appropriate
 531     error message.
 532     """
 533     def __init__(self, msg, exc_info=None):
 534         """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
 535         super(DownloadError, self).__init__(msg)
 536         self.exc_info = exc_info
 537
 538
 539 class SameFileError(Exception):
 540     """Same File exception.
 541
 542     This exception will be thrown by FileDownloader objects if they detect
 543     multiple files would have to be downloaded to the same file on disk.
 544     """
 545     pass
 546
 547
 548 class PostProcessingError(Exception):
 549     """Post Processing exception.
 550
 551     This exception may be raised by PostProcessor's .run() method to
 552     indicate an error in the postprocessing task.
 553     """
 554     def __init__(self, msg):
 555         self.msg = msg
 556
 557 class MaxDownloadsReached(Exception):
 558     """ --max-downloads limit has been reached. """
 559     pass
 560
 561
 562 class UnavailableVideoError(Exception):
 563     """Unavailable Format exception.
 564
 565     This exception will be thrown when a video is requested
 566     in a format that is not available for that video.
 567     """
 568     pass
 569
 570
 571 class ContentTooShortError(Exception):
 572     """Content Too Short exception.
 573
 574     This exception may be raised by FileDownloader objects when a file they
 575     download is too small for what the server announced first, indicating
 576     the connection was probably interrupted.
 577     """
 578     # Both in bytes
 579     downloaded = None
 580     expected = None
 581
 582     def __init__(self, downloaded, expected):
 583         self.downloaded = downloaded
 584         self.expected = expected
 585
 586 class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
 587     """Handler for HTTP requests and responses.
 588
 589     This class, when installed with an OpenerDirector, automatically adds
 590     the standard headers to every HTTP request and handles gzipped and
 591     deflated responses from web servers. If compression is to be avoided in
 592     a particular request, the original request in the program code only has
 593     to include the HTTP header "Youtubedl-No-Compression", which will be
 594     removed before making the real request.
 595
 596     Part of this code was copied from:
 597
 598     http://techknack.net/python-urllib2-handlers/
 599
 600     Andrew Rowls, the author of that code, agreed to release it to the
 601     public domain.
 602     """
 603
 604     @staticmethod
 605     def deflate(data):
 606         try:
 607             return zlib.decompress(data, -zlib.MAX_WBITS)
 608         except zlib.error:
 609             return zlib.decompress(data)
 610
 611     @staticmethod
 612     def addinfourl_wrapper(stream, headers, url, code):
 613         if hasattr(compat_urllib_request.addinfourl, 'getcode'):
 614             return compat_urllib_request.addinfourl(stream, headers, url, code)
 615         ret = compat_urllib_request.addinfourl(stream, headers, url)
 616         ret.code = code
 617         return ret
 618
 619     def http_request(self, req):
 620         for h, v in std_headers.items():
 621             if h not in req.headers:
 622                 req.add_header(h, v)
 623         if 'Youtubedl-no-compression' in req.headers:
 624             if 'Accept-encoding' in req.headers:
 625                 del req.headers['Accept-encoding']
 626             del req.headers['Youtubedl-no-compression']
 627         if 'Youtubedl-user-agent' in req.headers:
 628             if 'User-agent' in req.headers:
 629                 del req.headers['User-agent']
 630             req.headers['User-agent'] = req.headers['Youtubedl-user-agent']
 631             del req.headers['Youtubedl-user-agent']
 632
 633         if sys.version_info < (2, 7) and '#' in req.get_full_url():
 634             # Python 2.6 is brain-dead when it comes to fragments
 635             req._Request__original = req._Request__original.partition('#')[0]
 636             req._Request__r_type = req._Request__r_type.partition('#')[0]
 637
 638         return req
 639
 640     def http_response(self, req, resp):
 641         old_resp = resp
 642         # gzip
 643         if resp.headers.get('Content-encoding', '') == 'gzip':
 644             content = resp.read()
 645             gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
 646             try:
 647                 uncompressed = io.BytesIO(gz.read())
 648             except IOError as original_ioerror:
 649                 # There may be junk add the end of the file
 650                 # See http://stackoverflow.com/q/4928560/35070 for details
 651                 for i in range(1, 1024):
 652                     try:
 653                         gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
 654                         uncompressed = io.BytesIO(gz.read())
 655                     except IOError:
 656                         continue
 657                     break
 658                 else:
 659                     raise original_ioerror
 660             resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
 661             resp.msg = old_resp.msg
 662         # deflate
 663         if resp.headers.get('Content-encoding', '') == 'deflate':
 664             gz = io.BytesIO(self.deflate(resp.read()))
 665             resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
 666             resp.msg = old_resp.msg
 667         return resp
 668
 669     https_request = http_request
 670     https_response = http_response
 671
 672
 673 def parse_iso8601(date_str, delimiter='T'):
 674     """ Return a UNIX timestamp from the given date """
 675
 676     if date_str is None:
 677         return None
 678
 679     m = re.search(
 680         r'(\.[0-9]+)?(?:Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
 681         date_str)
 682     if not m:
 683         timezone = datetime.timedelta()
 684     else:
 685         date_str = date_str[:-len(m.group(0))]
 686         if not m.group('sign'):
 687             timezone = datetime.timedelta()
 688         else:
 689             sign = 1 if m.group('sign') == '+' else -1
 690             timezone = datetime.timedelta(
 691                 hours=sign * int(m.group('hours')),
 692                 minutes=sign * int(m.group('minutes')))
 693     date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
 694     dt = datetime.datetime.strptime(date_str, date_format) - timezone
 695     return calendar.timegm(dt.timetuple())
 696
 697
 698 def unified_strdate(date_str):
 699     """Return a string with the date in the format YYYYMMDD"""
 700
 701     if date_str is None:
 702         return None
 703
 704     upload_date = None
 705     #Replace commas
 706     date_str = date_str.replace(',', ' ')
 707     # %z (UTC offset) is only supported in python>=3.2
 708     date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str)
 709     format_expressions = [
 710         '%d %B %Y',
 711         '%d %b %Y',
 712         '%B %d %Y',
 713         '%b %d %Y',
 714         '%b %dst %Y %I:%M%p',
 715         '%b %dnd %Y %I:%M%p',
 716         '%b %dth %Y %I:%M%p',
 717         '%Y-%m-%d',
 718         '%Y/%m/%d',
 719         '%d.%m.%Y',
 720         '%d/%m/%Y',
 721         '%d/%m/%y',
 722         '%Y/%m/%d %H:%M:%S',
 723         '%d/%m/%Y %H:%M:%S',
 724         '%Y-%m-%d %H:%M:%S',
 725         '%Y-%m-%d %H:%M:%S.%f',
 726         '%d.%m.%Y %H:%M',
 727         '%d.%m.%Y %H.%M',
 728         '%Y-%m-%dT%H:%M:%SZ',
 729         '%Y-%m-%dT%H:%M:%S.%fZ',
 730         '%Y-%m-%dT%H:%M:%S.%f0Z',
 731         '%Y-%m-%dT%H:%M:%S',
 732         '%Y-%m-%dT%H:%M:%S.%f',
 733         '%Y-%m-%dT%H:%M',
 734     ]
 735     for expression in format_expressions:
 736         try:
 737             upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
 738         except ValueError:
 739             pass
 740     if upload_date is None:
 741         timetuple = email.utils.parsedate_tz(date_str)
 742         if timetuple:
 743             upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
 744     return upload_date
 745
 746 def determine_ext(url, default_ext=u'unknown_video'):
 747     if url is None:
 748         return default_ext
 749     guess = url.partition(u'?')[0].rpartition(u'.')[2]
 750     if re.match(r'^[A-Za-z0-9]+$', guess):
 751         return guess
 752     else:
 753         return default_ext
 754
 755 def subtitles_filename(filename, sub_lang, sub_format):
 756     return filename.rsplit('.', 1)[0] + u'.' + sub_lang + u'.' + sub_format
 757
 758 def date_from_str(date_str):
 759     """
 760     Return a datetime object from a string in the format YYYYMMDD or
 761     (now|today)[+-][0-9](day|week|month|year)(s)?"""
 762     today = datetime.date.today()
 763     if date_str == 'now'or date_str == 'today':
 764         return today
 765     match = re.match('(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
 766     if match is not None:
 767         sign = match.group('sign')
 768         time = int(match.group('time'))
 769         if sign == '-':
 770             time = -time
 771         unit = match.group('unit')
 772         #A bad aproximation?
 773         if unit == 'month':
 774             unit = 'day'
 775             time *= 30
 776         elif unit == 'year':
 777             unit = 'day'
 778             time *= 365
 779         unit += 's'
 780         delta = datetime.timedelta(**{unit: time})
 781         return today + delta
 782     return datetime.datetime.strptime(date_str, "%Y%m%d").date()
 783
 784 def hyphenate_date(date_str):
 785     """
 786     Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
 787     match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
 788     if match is not None:
 789         return '-'.join(match.groups())
 790     else:
 791         return date_str
 792
 793 class DateRange(object):
 794     """Represents a time interval between two dates"""
 795     def __init__(self, start=None, end=None):
 796         """start and end must be strings in the format accepted by date"""
 797         if start is not None:
 798             self.start = date_from_str(start)
 799         else:
 800             self.start = datetime.datetime.min.date()
 801         if end is not None:
 802             self.end = date_from_str(end)
 803         else:
 804             self.end = datetime.datetime.max.date()
 805         if self.start > self.end:
 806             raise ValueError('Date range: "%s" , the start date must be before the end date' % self)
 807     @classmethod
 808     def day(cls, day):
 809         """Returns a range that only contains the given day"""
 810         return cls(day,day)
 811     def __contains__(self, date):
 812         """Check if the date is in the range"""
 813         if not isinstance(date, datetime.date):
 814             date = date_from_str(date)
 815         return self.start <= date <= self.end
 816     def __str__(self):
 817         return '%s - %s' % ( self.start.isoformat(), self.end.isoformat())
 818
 819
 820 def platform_name():
 821     """ Returns the platform name as a compat_str """
 822     res = platform.platform()
 823     if isinstance(res, bytes):
 824         res = res.decode(preferredencoding())
 825
 826     assert isinstance(res, compat_str)
 827     return res
 828
 829
def _windows_write_string(s, out):
    """ Try to write the text s straight to the Windows console behind out.

    Returns True if the string was written using special methods
    (WriteConsoleW), False if it has yet to be written out by the caller.
    Raises OSError when WriteConsoleW reports a failure."""
    # Adapted from http://stackoverflow.com/a/3259271/35070

    import ctypes
    import ctypes.wintypes

    # Maps the file descriptors handled here (1 and 2) to the identifier
    # passed to GetStdHandle; presumably STD_OUTPUT_HANDLE (-11) and
    # STD_ERROR_HANDLE (-12) - confirm against the Win32 documentation.
    WIN_OUTPUT_IDS = {
        1: -11,
        2: -12,
    }

    try:
        fileno = out.fileno()
    except AttributeError:
        # If the output stream doesn't have a fileno, it's virtual
        return False
    if fileno not in WIN_OUTPUT_IDS:
        # Only descriptors 1 and 2 are handled
        return False

    # Prototype kernel32!GetStdHandle and fetch the handle for this stream
    GetStdHandle = ctypes.WINFUNCTYPE(
        ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
        ("GetStdHandle", ctypes.windll.kernel32))
    h = GetStdHandle(WIN_OUTPUT_IDS[fileno])

    # Prototype kernel32!WriteConsoleW; `written` receives the number of
    # characters written by each call
    WriteConsoleW = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
        ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
        ctypes.wintypes.LPVOID)(("WriteConsoleW", ctypes.windll.kernel32))
    written = ctypes.wintypes.DWORD(0)

    # Helpers used by not_a_console() below to decide whether the handle
    # really is a console
    GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)(("GetFileType", ctypes.windll.kernel32))
    FILE_TYPE_CHAR = 0x0002
    FILE_TYPE_REMOTE = 0x8000
    GetConsoleMode = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
        ctypes.POINTER(ctypes.wintypes.DWORD))(
        ("GetConsoleMode", ctypes.windll.kernel32))
    INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value

    def not_a_console(handle):
        # True when the handle is missing/invalid, when its file type
        # (ignoring the FILE_TYPE_REMOTE bit) is not FILE_TYPE_CHAR, or
        # when GetConsoleMode fails on it
        if handle == INVALID_HANDLE_VALUE or handle is None:
            return True
        return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR
                or GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)

    if not_a_console(h):
        return False

    def next_nonbmp_pos(s):
        # Index of the first character above U+FFFF, or len(s) if none
        try:
            return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
        except StopIteration:
            return len(s)

    # Write in chunks of at most 1024 characters that stop before the next
    # non-BMP character; a leading non-BMP character is written on its own
    # with a length of 2
    while s:
        count = min(next_nonbmp_pos(s), 1024)

        ret = WriteConsoleW(
            h, s, count if count else 2, ctypes.byref(written), None)
        if ret == 0:
            raise OSError('Failed to write string')
        if not count:  # We just wrote a non-BMP character
            assert written.value == 2
            s = s[1:]
        else:
            assert written.value > 0
            # Continue after whatever part the console accepted
            s = s[written.value:]
    return True
 900
 901
 902 def write_string(s, out=None, encoding=None):
 903     if out is None:
 904         out = sys.stderr
 905     assert type(s) == compat_str
 906
 907     if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
 908         if _windows_write_string(s, out):
 909             return
 910
 911     if ('b' in getattr(out, 'mode', '') or
 912             sys.version_info[0] < 3):  # Python 2 lies about mode of sys.stderr
 913         byt = s.encode(encoding or preferredencoding(), 'ignore')
 914         out.write(byt)
 915     elif hasattr(out, 'buffer'):
 916         enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
 917         byt = s.encode(enc, 'ignore')
 918         out.buffer.write(byt)
 919     else:
 920         out.write(s)
 921     out.flush()
 922
 923
 924 def bytes_to_intlist(bs):
 925     if not bs:
 926         return []
 927     if isinstance(bs[0], int):  # Python 3
 928         return list(bs)
 929     else:
 930         return [ord(c) for c in bs]
 931
 932
 933 def intlist_to_bytes(xs):
 934     if not xs:
 935         return b''
 936     if isinstance(chr(0), bytes):  # Python 2
 937         return ''.join([chr(x) for x in xs])
 938     else:
 939         return bytes(xs)
 940
 941
# Cross-platform file locking
# Both branches define the same pair of helpers:
#   _lock_file(f, exclusive) - lock the open file f (exclusive or shared)
#   _unlock_file(f)          - release the lock taken by _lock_file
if sys.platform == 'win32':
    import ctypes.wintypes
    import msvcrt

    # ctypes layout of the structure passed as last argument to
    # LockFileEx / UnlockFileEx
    class OVERLAPPED(ctypes.Structure):
        _fields_ = [
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),
        ]

    # Declare argument and return types of the two kernel32 functions used
    kernel32 = ctypes.windll.kernel32
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    # Low and high halves of the byte count handed to both calls
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive):
        # Lock from offset 0; raises OSError if LockFileEx fails.
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        # The pointer is stored on the file object because _unlock_file
        # passes the very same structure back to UnlockFileEx
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)
        handle = msvcrt.get_osfhandle(f.fileno())
        # Flag 0x2 requests an exclusive lock (presumably
        # LOCKFILE_EXCLUSIVE_LOCK - confirm against the Win32 docs)
        if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
                          whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Locking file failed: %r' % ctypes.FormatError())

    def _unlock_file(f):
        # Requires a previous _lock_file(f, ...); raises OSError if
        # UnlockFileEx fails.
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        if not UnlockFileEx(handle, 0,
                            whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())

else:
    import fcntl

    def _lock_file(f, exclusive):
        # flock() with LOCK_EX for exclusive access, LOCK_SH otherwise
        fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)

    def _unlock_file(f):
        fcntl.flock(f, fcntl.LOCK_UN)
1005
1006
class locked_file(object):
    """ File opened with io.open and locked while used as a context manager.

    Entering the context locks the file through _lock_file (a shared lock
    for mode 'r', an exclusive one for 'a' and 'w'); leaving it unlocks and
    closes the file.  Iteration, write() and read() are forwarded to the
    underlying file object. """

    def __init__(self, filename, mode, encoding=None):
        assert mode in ['r', 'a', 'w']
        self.f = io.open(filename, mode, encoding=encoding)
        self.mode = mode

    def __enter__(self):
        exclusive = self.mode != 'r'
        try:
            _lock_file(self.f, exclusive)
        except (IOError, OSError):
            # The win32 _lock_file raises OSError, which is not an IOError
            # on Python 2; catch both so the file never stays open when
            # locking fails.
            self.f.close()
            raise
        return self

    def __exit__(self, etype, value, traceback):
        try:
            _unlock_file(self.f)
        finally:
            # Close even if unlocking raised
            self.f.close()

    def __iter__(self):
        return iter(self.f)

    def write(self, *args):
        return self.f.write(*args)

    def read(self, *args):
        return self.f.read(*args)
1036
1037
def get_filesystem_encoding():
    """ Return sys.getfilesystemencoding(), or 'utf-8' when that is None """
    fs_encoding = sys.getfilesystemencoding()
    if fs_encoding is None:
        return 'utf-8'
    return fs_encoding
1041
1042
def shell_quote(args):
    """ Quote every argument with pipes.quote and join them with spaces.

    Arguments given as bytes are decoded with the filesystem encoding
    before being quoted. """
    fs_encoding = get_filesystem_encoding()

    def as_text(arg):
        # We may get a filename encoded with 'encodeFilename'
        return arg.decode(fs_encoding) if isinstance(arg, bytes) else arg

    return u' '.join([pipes.quote(as_text(arg)) for arg in args])
1052
1053
def takewhile_inclusive(pred, seq):
    """ Yield the elements of seq up to and including the first one for
        which pred is false (itertools.takewhile would drop that one) """
    for item in seq:
        yield item
        if pred(item):
            continue
        break
1061
1062
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use.

    data is serialised as JSON, url-encoded under the key
    '__youtubedl_smuggle' and appended to url after a '#'. """
    payload = {u'__youtubedl_smuggle': json.dumps(data)}
    fragment = compat_urllib_parse.urlencode(payload)
    return url + u'#' + fragment
1069
1070
def unsmuggle_url(smug_url, default=None):
    """ Split a URL carrying smuggled data into (url, data).

    A URL without the '#__youtubedl_smuggle' marker is returned unchanged
    together with default. """
    if '#__youtubedl_smuggle' in smug_url:
        url, _sep, fragment = smug_url.rpartition(u'#')
        encoded = compat_parse_qs(fragment)[u'__youtubedl_smuggle'][0]
        return url, json.loads(encoded)
    return smug_url, default
1078
1079
def format_bytes(bytes):
    """ Format a byte count with a binary unit suffix, e.g. '1.50MiB'.

    bytes may be None (gives 'N/A'), a number, or a str holding a number.
    The value is printed with two decimals in the largest unit from
    B ... YiB that keeps it at 1 or above; values below one byte are shown
    in B and values beyond the YiB range stay in YiB.  A negative value
    raises ValueError (from math.log). """
    SUFFIXES = [u'B', u'KiB', u'MiB', u'GiB', u'TiB', u'PiB', u'EiB', u'ZiB', u'YiB']
    if bytes is None:
        return u'N/A'
    if isinstance(bytes, str):
        bytes = float(bytes)
    if bytes == 0.0:
        exponent = 0
    else:
        exponent = int(math.log(bytes, 1024.0))
        # Keep the index inside the suffix table: tiny fractions would
        # otherwise go negative (and pick 'YiB'), huge values would raise
        # IndexError.
        exponent = max(0, min(exponent, len(SUFFIXES) - 1))
    suffix = SUFFIXES[exponent]
    converted = float(bytes) / float(1024 ** exponent)
    return u'%.2f%s' % (converted, suffix)
1092
1093
def get_term_width():
    """ Return the terminal width in columns, or None if it is unknown.

    The COLUMNS environment variable is used when it holds an integer;
    otherwise the second field printed by `stty size` is tried. """
    columns = compat_getenv('COLUMNS', None)
    if columns:
        try:
            return int(columns)
        except ValueError:
            pass  # Malformed COLUMNS value, fall back to asking stty

    try:
        sp = subprocess.Popen(
            ['stty', 'size'],
            stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        out, _ = sp.communicate()
        return int(out.split()[1])
    except Exception:
        # Best effort only (stty missing, no tty, unexpected output...),
        # but do not swallow KeyboardInterrupt / SystemExit
        return None
1108
1109
def month_by_name(name):
    """ Return the number (1-12) of the month with the given full English
    name, independently of the locale; None if name is not one of them """

    ENGLISH_NAMES = [
        u'January', u'February', u'March', u'April', u'May', u'June',
        u'July', u'August', u'September', u'October', u'November', u'December']
    for number, month in enumerate(ENGLISH_NAMES, 1):
        if month == name:
            return number
    return None
1120
1121
def fix_xml_ampersands(xml_str):
    """Replace by '&amp;' every '&' that does not start one of the five
    named XML entities or a short numeric character reference"""
    bare_ampersand = r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)'
    return re.sub(bare_ampersand, u'&amp;', xml_str)
1128
1129
def setproctitle(title):
    """ Best-effort attempt to set the process name to title via prctl.

    title must be a compat_str.  Nothing happens when libc.so.6 cannot be
    loaded or the loaded libc has no prctl. """
    assert isinstance(title, compat_str)
    try:
        libc = ctypes.cdll.LoadLibrary("libc.so.6")
    except OSError:
        return
    title_bytes = title.encode('utf-8')
    # Initialising the buffer from the bytes makes it one byte larger than
    # the title and NUL-terminated; prctl reads the name as a C string, so
    # a buffer sized to exactly len(title_bytes) could be read past its end.
    buf = ctypes.create_string_buffer(title_bytes)
    try:
        # 15 is presumably PR_SET_NAME (see linux/prctl.h)
        libc.prctl(15, buf, 0, 0, 0)
    except AttributeError:
        return  # Strange libc, just skip this
1143
1144
def remove_start(s, start):
    """ Return s without its leading start; s itself if it does not
    begin with start """
    return s[len(start):] if s.startswith(start) else s
1149
1150
def remove_end(s, end):
    """ Return s without its trailing end; s itself if it does not finish
    with end (or if end is empty) """
    # The emptiness check matters: s.endswith('') is always true and
    # s[:-0] would be the empty string rather than s.
    if end and s.endswith(end):
        return s[:-len(end)]
    return s
1155
1156
def url_basename(url):
    """ Return the last '/'-separated segment of the path of url, after
    stripping leading and trailing slashes from that path """
    segments = compat_urlparse.urlparse(url).path.strip(u'/').split(u'/')
    return segments[-1]
1160
1161
class HEADRequest(compat_urllib_request.Request):
    """ Request whose get_method() always answers HEAD """

    def get_method(self):
        return 'HEAD'
1165
1166
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    """ Convert v to int, multiply by invscale and floor-divide by scale.

    default is returned when v is None or the empty string.  With
    get_attr set, the attribute of that name is read from v first (a
    missing attribute counts as None). """
    if get_attr and v is not None:
        v = getattr(v, get_attr, None)
    if v is None or v == '':
        return default
    return int(v) * invscale // scale
1174
1175
def str_or_none(v, default=None):
    """ Return compat_str(v), or default when v is None """
    if v is None:
        return default
    return compat_str(v)
1178
1179
def str_to_int(int_str):
    """ A more relaxed version of int_or_none: commas, dots and plus
    signs are removed from the string before converting; None stays None """
    if int_str is None:
        return None
    return int(re.sub(r'[,\.\+]', u'', int_str))
1186
1187
def float_or_none(v, scale=1, invscale=1, default=None):
    """ Return float(v) * invscale / scale, or default when v is None """
    if v is None:
        return default
    return float(v) * invscale / scale
1190
1191
def parse_duration(s):
    """ Parse a duration such as '1:02:03', '2m 5s' or '3.5 secs' into a
    number of seconds.

    Returns None for None and for text that does not match; the result
    is a float when a fractional part is present, an int otherwise. """
    if s is None:
        return None

    m = re.match(
        r'(?i)(?:(?:(?P<hours>[0-9]+)\s*(?:[:h]|hours?)\s*)?(?P<mins>[0-9]+)\s*(?:[:m]|mins?|minutes?)\s*)?(?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*(?:s|secs?|seconds?)?$',
        s.strip())
    if not m:
        return None

    hours, mins, secs, ms = m.group('hours', 'mins', 'secs', 'ms')
    res = int(secs)
    if mins:
        res += int(mins) * 60
        if hours:
            res += int(hours) * 60 * 60
    if ms:
        res += float(ms)
    return res
1210
1211
def prepend_extension(filename, ext):
    """ Insert ext in front of the extension of filename
    ('a.mp4', 'temp' -> 'a.temp.mp4') """
    parts = os.path.splitext(filename)
    return u'{0}.{1}{2}'.format(parts[0], ext, parts[1])
1215
1216
def check_executable(exe, args=[]):
    """ Run exe with args and report whether it could be started.

    Returns exe when the process could be launched, False when launching
    raised OSError.  args can be a list of arguments giving a short
    output (like -version). """
    command = [exe] + args
    try:
        process = subprocess.Popen(
            command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        process.communicate()
    except OSError:
        return False
    return exe
1225
1226
def get_exe_version(exe, args=['--version'],
                    version_re=r'version\s+([-0-9._a-zA-Z]+)',
                    unrecognized=u'present'):
    """ Returns the version of the specified executable,
    or False if the executable is not present.

    exe is run with args; version_re is searched in the first line of its
    combined stdout/stderr and group 1 of the match is returned.  When the
    pattern does not match, unrecognized is returned instead. """
    # Note on the default pattern: the '-' comes first in the character
    # class so that it is a literal; written as '_-a' it would form a range
    # and most lowercase letters would not be matched.
    try:
        out, err = subprocess.Popen(
            [exe] + args,
            stdout=subprocess.PIPE, stderr=subprocess.STDOUT).communicate()
    except OSError:
        return False
    firstline = out.partition(b'\n')[0].decode('ascii', 'ignore')
    m = re.search(version_re, firstline)
    if m:
        return m.group(1)
    return unrecognized
1244
1245
class PagedList(object):
    """ Base class of the paged lists; relies on a getslice() method,
    which this class itself does not define """

    def __len__(self):
        # This is only useful for tests
        everything = self.getslice()
        return len(everything)
1250
1251
class OnDemandPagedList(PagedList):
    """ Paged list that fetches pages one by one, only as far as needed.

    pagefunc(pagenum) must return an iterable with the entries of the
    (0-based) page pagenum; pagesize is the number of entries of a full
    page. """

    def __init__(self, pagefunc, pagesize):
        self._pagefunc = pagefunc
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        """ Return the entries with index in [start, end) as a list;
        end=None means up to the last available entry """
        res = []
        # Start with the page containing `start`
        for pagenum in itertools.count(start // self._pagesize):
            # Index of the first entry of this page and of the next one
            firstid = pagenum * self._pagesize
            nextfirstid = pagenum * self._pagesize + self._pagesize
            # NOTE(review): looks unreachable, as counting begins at
            # start // pagesize - confirm before removing
            if start >= nextfirstid:
                continue

            page_results = list(self._pagefunc(pagenum))

            # Offset of `start` inside this page (0 on later pages)
            startv = (
                start % self._pagesize
                if firstid <= start < nextfirstid
                else 0)

            # Offset just past the last wanted entry when `end` falls on
            # this page, None otherwise.
            # NOTE(review): for end == firstid (e.g. start == end on a page
            # boundary) this evaluates to pagesize, so the whole page is
            # kept - confirm this is intended
            endv = (
                ((end - 1) % self._pagesize) + 1
                if (end is not None and firstid <= end <= nextfirstid)
                else None)

            if startv != 0 or endv is not None:
                page_results = page_results[startv:endv]
            res.extend(page_results)

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            if len(page_results) + startv < self._pagesize:
                break

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == nextfirstid:
                break
        return res
1293
1294
class InAdvancePagedList(PagedList):
    """ Paged list whose number of pages is known in advance.

    pagefunc(pagenum) must return an iterable with the entries of the
    (0-based) page pagenum, pagecount is the total number of pages and
    pagesize the number of entries of a full page. """

    def __init__(self, pagefunc, pagecount, pagesize):
        self._pagefunc = pagefunc
        self._pagecount = pagecount
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        """ Return the entries with index in [start, end) as a list;
        end=None means up to the last page """
        res = []
        start_page = start // self._pagesize
        # Never go past the last known page, even if `end` lies beyond it
        end_page = (
            self._pagecount if end is None
            else min(self._pagecount, end // self._pagesize + 1))
        # Entries to drop at the beginning of the first fetched page
        skip_elems = start - start_page * self._pagesize
        # Number of entries still wanted (None: no limit)
        only_more = None if end is None else end - start
        for pagenum in range(start_page, end_page):
            page = list(self._pagefunc(pagenum))
            if skip_elems:
                page = page[skip_elems:]
                skip_elems = None
            if only_more is not None:
                if len(page) < only_more:
                    only_more -= len(page)
                else:
                    # This page completes the request
                    page = page[:only_more]
                    res.extend(page)
                    break
            res.extend(page)
        return res
1322
1323
def uppercase_escape(s):
    """ Replace each backslash + 'U' + eight hex digits sequence found in
    s by the character it encodes; everything else is left untouched """
    decode = codecs.getdecoder('unicode_escape')

    def expand(match):
        return decode(match.group(0))[0]

    return re.sub(r'\\U[0-9a-fA-F]{8}', expand, s)
1330
1331
def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986

    On Python 2 a unicode string is encoded as UTF-8 before quoting."""
    py2_unicode = sys.version_info < (3, 0) and isinstance(s, unicode)
    if py2_unicode:
        s = s.encode('utf-8')
    return compat_urllib_parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")
1337
1338
def escape_url(url):
    """Escape URL as suggested by RFC 3986

    Path, params, query and fragment are each passed through
    escape_rfc3986; the other components are kept as parsed."""
    parts = compat_urllib_parse_urlparse(url)
    path = escape_rfc3986(parts.path)
    params = escape_rfc3986(parts.params)
    query = escape_rfc3986(parts.query)
    fragment = escape_rfc3986(parts.fragment)
    escaped = parts._replace(
        path=path, params=params, query=query, fragment=fragment)
    return escaped.geturl()
1348
# struct_pack / struct_unpack: struct.pack / struct.unpack accepting a text
# format specification on every supported Python version
try:
    struct.pack(u'!I', 0)
except TypeError:
    # In Python 2.6 (and some 2.7 versions), struct requires a bytes argument
    def struct_pack(spec, *args):
        fmt = spec.encode('ascii') if isinstance(spec, compat_str) else spec
        return struct.pack(fmt, *args)

    def struct_unpack(spec, *args):
        fmt = spec.encode('ascii') if isinstance(spec, compat_str) else spec
        return struct.unpack(fmt, *args)
else:
    struct_pack, struct_unpack = struct.pack, struct.unpack
1365
1366
def read_batch_urls(batch_fd):
    """ Read the URLs listed one per line in batch_fd and close it.

    Lines may be bytes (decoded as UTF-8) or text.  A leading byte order
    mark is dropped, surrounding whitespace is stripped, and empty lines
    as well as lines starting with '#', ';' or ']' are skipped.
    Returns the list of remaining URLs. """
    # The BOM as it appears when its three UTF-8 bytes were decoded one
    # character per byte, and as the single character U+FEFF obtained from
    # a proper UTF-8 decoding
    BOMS = (u'\xef\xbb\xbf', u'\ufeff')

    def fixup(url):
        if isinstance(url, bytes):
            url = url.decode('utf-8', 'replace')
        for bom in BOMS:
            if url.startswith(bom):
                url = url[len(bom):]
        url = url.strip()
        if url.startswith(('#', ';', ']')):
            return False
        return url

    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]
1381
1382
def urlencode_postdata(*args, **kargs):
    """ Like compat_urllib_parse.urlencode, but return the result encoded
    to bytes (ASCII) """
    encoded = compat_urllib_parse.urlencode(*args, **kargs)
    return encoded.encode('ascii')
1385
1386
# etree_iter(node): Element.iter where it exists, a findall() based
# replacement otherwise
try:
    etree_iter = xml.etree.ElementTree.Element.iter
except AttributeError:  # Python <=2.6
    def etree_iter(n):
        return n.findall('.//*')
1391
1392
def parse_xml(s):
    """ Parse the text s as XML and return the root element.

    Doctype declarations are ignored.  On Python 2, element texts that
    are not compat_str are decoded as UTF-8. """
    class DoctypeIgnoringBuilder(xml.etree.ElementTree.TreeBuilder):
        def doctype(self, name, pubid, system):
            pass  # Ignore doctypes

    parser = xml.etree.ElementTree.XMLParser(target=DoctypeIgnoringBuilder())
    kwargs = {}
    if sys.version_info >= (2, 7):
        kwargs['parser'] = parser
    tree = xml.etree.ElementTree.XML(s.encode('utf-8'), **kwargs)
    # Fix up XML parser in Python 2.x
    if sys.version_info < (3, 0):
        for node in etree_iter(tree):
            if node.text is not None and not isinstance(node.text, compat_str):
                node.text = node.text.decode('utf-8')
    return tree
1408
1409
# Age limit associated with each US rating label; parse_age_limit falls
# back to this table for values that are not plain numbers
US_RATINGS = {
    'G': 0,
    'PG': 10,
    'PG-13': 13,
    'R': 16,
    'NC': 18,
}
1417
1418
def parse_age_limit(s):
    """ Turn an age limit given as text into an int.

    One or two digits, optionally followed by '+', give that number; any
    other value is looked up in US_RATINGS (None if unknown).  None stays
    None. """
    if s is None:
        return None
    m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
    if m:
        return int(m.group('age'))
    return US_RATINGS.get(s, None)
1424
1425
def strip_jsonp(code):
    """ Remove a JSONP wrapper 'callback( ... );' around code and return
    what was between the parentheses; other input is returned as is """
    jsonp_call = r'(?s)^[a-zA-Z0-9_]+\s*\(\s*(.*)\);?\s*?\s*$'
    return re.sub(jsonp_call, r'\1', code)
1428
1429
def js_to_json(code):
    """ Convert a JavaScript object literal into JSON text.

    Single-quoted strings and bare identifiers become double-quoted
    strings (true, false and null are kept as they are), and a comma
    placed right before a closing ']' is dropped. """
    # How the escapes met inside a single-quoted string are rewritten
    requote = {
        '\\\\': '\\\\',
        "\\'": "'",
        '"': '\\"',
    }

    def fix_kv(m):
        token = m.group(0)
        if token in ('true', 'false', 'null') or token.startswith('"'):
            return token
        if token.startswith("'"):
            token = re.sub(
                r"\\\\|\\'|\"", lambda esc: requote[esc.group(0)],
                token[1:-1])
        return '"%s"' % token

    def drop_comma(m):
        return m.group(1)

    res = re.sub(r'''(?x)
        "(?:[^"\\]*(?:\\\\|\\")?)*"|
        '(?:[^'\\]*(?:\\\\|\\')?)*'|
        [a-zA-Z_][a-zA-Z_0-9]*
        ''', fix_kv, code)
    return re.sub(r',(\s*\])', drop_comma, res)
1453
1454
def qualities(quality_ids):
    """ Return a function mapping a quality id to its position in
    quality_ids, or to -1 if the id is not listed """
    def q(qid):
        try:
            position = quality_ids.index(qid)
        except ValueError:
            position = -1
        return position
    return q
1463
1464
# Default output template built from the title, id and ext fields
# (presumably used for output filenames - its users are outside this part
# of the file)
DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
1466
1467
def limit_length(s, length):
    """ Cut s down to length characters, the last three being '...',
    when it is longer than that; None stays None """
    if s is None:
        return None
    ELLIPSES = '...'
    if len(s) <= length:
        return s
    return s[:length - len(ELLIPSES)] + ELLIPSES
1476
1477
def version_tuple(v):
    """ Split a dotted version string into the list of its integer parts """
    return list(map(int, v.split('.')))
1480
1481
def is_outdated_version(version, limit, assume_new=True):
    """ Tell whether version is older than limit.

    Both are compared as version_tuple() lists.  When version is empty
    or one of the two cannot be parsed, the answer is `not assume_new`. """
    fallback = not assume_new
    if not version:
        return fallback
    try:
        outdated = version_tuple(version) < version_tuple(limit)
    except ValueError:
        return fallback
    return outdated