2 # -*- coding: utf-8 -*-
29 import xml.etree.ElementTree
33 import urllib.request as compat_urllib_request
34 except ImportError: # Python 2
35 import urllib2 as compat_urllib_request
38 import urllib.error as compat_urllib_error
39 except ImportError: # Python 2
40 import urllib2 as compat_urllib_error
43 import urllib.parse as compat_urllib_parse
44 except ImportError: # Python 2
45 import urllib as compat_urllib_parse
48 from urllib.parse import urlparse as compat_urllib_parse_urlparse
49 except ImportError: # Python 2
50 from urlparse import urlparse as compat_urllib_parse_urlparse
53 import urllib.parse as compat_urlparse
54 except ImportError: # Python 2
55 import urlparse as compat_urlparse
58 import http.cookiejar as compat_cookiejar
59 except ImportError: # Python 2
60 import cookielib as compat_cookiejar
63 import html.entities as compat_html_entities
64 except ImportError: # Python 2
65 import htmlentitydefs as compat_html_entities
68 import html.parser as compat_html_parser
69 except ImportError: # Python 2
70 import HTMLParser as compat_html_parser
73 import http.client as compat_http_client
74 except ImportError: # Python 2
75 import httplib as compat_http_client
78 from urllib.error import HTTPError as compat_HTTPError
79 except ImportError: # Python 2
80 from urllib2 import HTTPError as compat_HTTPError
83 from urllib.request import urlretrieve as compat_urlretrieve
84 except ImportError: # Python 2
85 from urllib import urlretrieve as compat_urlretrieve
89 from subprocess import DEVNULL
90 compat_subprocess_get_DEVNULL = lambda: DEVNULL
92 compat_subprocess_get_DEVNULL = lambda: open(os.path.devnull, 'w')
95 from urllib.parse import unquote as compat_urllib_parse_unquote
97 def compat_urllib_parse_unquote(string, encoding='utf-8', errors='replace'):
100 res = string.split('%')
107 # pct_sequence: contiguous sequence of percent-encoded bytes, decoded
114 pct_sequence += item[:2].decode('hex')
117 # This segment was just a single percent-encoded character.
118 # May be part of a sequence of code units, so delay decoding.
119 # (Stored in pct_sequence).
123 # Encountered non-percent-encoded characters. Flush the current
125 string += pct_sequence.decode(encoding, errors) + rest
128 # Flush the final pct_sequence
129 string += pct_sequence.decode(encoding, errors)
134 from urllib.parse import parse_qs as compat_parse_qs
135 except ImportError: # Python 2
136 # HACK: The following is the correct parse_qs implementation from cpython 3's stdlib.
137 # Python 2's version is apparently totally broken
139 def _parse_qsl(qs, keep_blank_values=False, strict_parsing=False,
140 encoding='utf-8', errors='replace'):
141 qs, _coerce_result = qs, unicode
142 pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')]
144 for name_value in pairs:
145 if not name_value and not strict_parsing:
147 nv = name_value.split('=', 1)
150 raise ValueError("bad query field: %r" % (name_value,))
151 # Handle case of a control-name with no equal sign
152 if keep_blank_values:
156 if len(nv[1]) or keep_blank_values:
157 name = nv[0].replace('+', ' ')
158 name = compat_urllib_parse_unquote(
159 name, encoding=encoding, errors=errors)
160 name = _coerce_result(name)
161 value = nv[1].replace('+', ' ')
162 value = compat_urllib_parse_unquote(
163 value, encoding=encoding, errors=errors)
164 value = _coerce_result(value)
165 r.append((name, value))
168 def compat_parse_qs(qs, keep_blank_values=False, strict_parsing=False,
169 encoding='utf-8', errors='replace'):
171 pairs = _parse_qsl(qs, keep_blank_values, strict_parsing,
172 encoding=encoding, errors=errors)
173 for name, value in pairs:
174 if name in parsed_result:
175 parsed_result[name].append(value)
177 parsed_result[name] = [value]
181 compat_str = unicode # Python 2
186 compat_chr = unichr # Python 2
191 from xml.etree.ElementTree import ParseError as compat_xml_parse_error
192 except ImportError: # Python 2.6
193 from xml.parsers.expat import ExpatError as compat_xml_parse_error
196 from shlex import quote as shlex_quote
197 except ImportError: # Python < 3.3
199 return "'" + s.replace("'", "'\"'\"'") + "'"
203 if type(c) is int: return c
206 # This is not clearly defined otherwise
207 compiled_regex_type = type(re.compile(''))
210 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0 (Chrome)',
211 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
212 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
213 'Accept-Encoding': 'gzip, deflate',
214 'Accept-Language': 'en-us,en;q=0.5',
217 def preferredencoding():
218 """Get preferred encoding.
220 Returns the best encoding scheme for the system, based on
221 locale.getpreferredencoding() and some further tweaks.
224 pref = locale.getpreferredencoding()
231 if sys.version_info < (3,0):
233 print(s.encode(preferredencoding(), 'xmlcharrefreplace'))
236 assert type(s) == type(u'')
240 def write_json_file(obj, fn):
241 """ Encode obj as JSON and write it to fn, atomically """
245 'prefix': os.path.basename(fn) + '.',
246 'dir': os.path.dirname(fn),
250 # In Python 2.x, json.dump expects a bytestream.
251 # In Python 3.x, it writes to a character stream
252 if sys.version_info < (3, 0):
260 tf = tempfile.NamedTemporaryFile(**args)
265 os.rename(tf.name, fn)
274 if sys.version_info >= (2, 7):
def find_xpath_attr(node, xpath, key, val):
    """Return the first element matching xpath[@key=val], or None."""
    # key and val are interpolated verbatim into the XPath expression,
    # so restrict them to characters that cannot break its syntax.
    assert re.match(r'^[a-zA-Z-]+$', key)
    assert re.match(r'^[a-zA-Z0-9@\s:._-]*$', val)
    query = xpath + u"[@%s='%s']" % (key, val)
    return node.find(query)
282 def find_xpath_attr(node, xpath, key, val):
283 # Here comes the crazy part: In 2.6, if the xpath is a unicode,
284 # .//node does not match if a node is a direct child of . !
285 if isinstance(xpath, unicode):
286 xpath = xpath.encode('ascii')
288 for f in node.findall(xpath):
289 if f.attrib.get(key) == val:
293 # On python2.6 the xml.etree.ElementTree.Element methods don't support
294 # the namespace parameter
295 def xpath_with_ns(path, ns_map):
296 components = [c.split(':') for c in path.split('/')]
300 replaced.append(c[0])
303 replaced.append('{%s}%s' % (ns_map[ns], tag))
304 return '/'.join(replaced)
307 def xpath_text(node, xpath, name=None, fatal=False):
308 if sys.version_info < (2, 7): # Crazy 2.6
309 xpath = xpath.encode('ascii')
314 name = xpath if name is None else name
315 raise ExtractorError('Could not find XML element %s' % name)
321 compat_html_parser.locatestarttagend = re.compile(r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re.VERBOSE) # backport bugfix
322 class BaseHTMLParser(compat_html_parser.HTMLParser):
324 compat_html_parser.HTMLParser.__init__(self)
327 def loads(self, html):
332 class AttrParser(BaseHTMLParser):
333 """Modified HTMLParser that isolates a tag with the specified attribute"""
334 def __init__(self, attribute, value):
335 self.attribute = attribute
340 self.watch_startpos = False
342 BaseHTMLParser.__init__(self)
344 def error(self, message):
345 if self.error_count > 10 or self.started:
346 raise compat_html_parser.HTMLParseError(message, self.getpos())
347 self.rawdata = '\n'.join(self.html.split('\n')[self.getpos()[0]:]) # skip one line
348 self.error_count += 1
351 def handle_starttag(self, tag, attrs):
354 self.find_startpos(None)
355 if self.attribute in attrs and attrs[self.attribute] == self.value:
358 self.watch_startpos = True
360 if not tag in self.depth: self.depth[tag] = 0
363 def handle_endtag(self, tag):
365 if tag in self.depth: self.depth[tag] -= 1
366 if self.depth[self.result[0]] == 0:
368 self.result.append(self.getpos())
def find_startpos(self, x):
    """Needed to put the start position of the result (self.result[1])
    after the opening tag with the requested id"""
    # watch_startpos is set when the wanted start tag was seen; the first
    # event after it means getpos() is now just past that opening tag,
    # i.e. where the captured content begins. The x argument is ignored.
    if self.watch_startpos:
        self.watch_startpos = False
        self.result.append(self.getpos())
# Any parser event following the opening tag marks where its content
# starts, so every handler funnels into find_startpos.
handle_entityref = handle_charref = handle_data = handle_comment = \
    handle_decl = handle_pi = unknown_decl = find_startpos
379 def get_result(self):
380 if self.result is None:
382 if len(self.result) != 3:
384 lines = self.html.split('\n')
385 lines = lines[self.result[1][0]-1:self.result[2][0]]
386 lines[0] = lines[0][self.result[1][1]:]
388 lines[-1] = lines[-1][:self.result[2][1]-self.result[1][1]]
389 lines[-1] = lines[-1][:self.result[2][1]]
390 return '\n'.join(lines).strip()
391 # Hack for https://github.com/rg3/youtube-dl/issues/662
392 if sys.version_info < (2, 7, 3):
393 AttrParser.parse_endtag = (lambda self, i:
394 i + len("</scr'+'ipt>")
395 if self.rawdata[i:].startswith("</scr'+'ipt>")
396 else compat_html_parser.HTMLParser.parse_endtag(self, i))
def get_element_by_id(id, html):
    """Extract the content of the element whose id attribute equals *id*."""
    # An id lookup is just an attribute lookup on the "id" attribute.
    return get_element_by_attribute("id", id, html)
402 def get_element_by_attribute(attribute, value, html):
403 """Return the content of the tag with the specified attribute in the passed HTML document"""
404 parser = AttrParser(attribute, value)
407 except compat_html_parser.HTMLParseError:
409 return parser.get_result()
411 class MetaParser(BaseHTMLParser):
413 Modified HTMLParser that isolates a meta tag with the specified name
416 def __init__(self, name):
417 BaseHTMLParser.__init__(self)
422 def handle_starttag(self, tag, attrs):
426 if attrs.get('name') == self.name:
427 self.result = attrs.get('content')
429 def get_result(self):
432 def get_meta_content(name, html):
434 Return the content attribute from the meta tag with the given name attribute.
436 parser = MetaParser(name)
439 except compat_html_parser.HTMLParseError:
441 return parser.get_result()
444 def clean_html(html):
445 """Clean an HTML snippet into a readable string"""
447 html = html.replace('\n', ' ')
448 html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
449 html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
451 html = re.sub('<.*?>', '', html)
452 # Replace html entities
453 html = unescapeHTML(html)
457 def sanitize_open(filename, open_mode):
458 """Try to open the given filename, and slightly tweak it if this fails.
460 Attempts to open the given filename. If this fails, it tries to change
461 the filename slightly, step by step, until it's either able to open it
462 or it fails and raises a final exception, like the standard open()
465 It returns the tuple (stream, definitive_file_name).
469 if sys.platform == 'win32':
471 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
472 return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
473 stream = open(encodeFilename(filename), open_mode)
474 return (stream, filename)
475 except (IOError, OSError) as err:
476 if err.errno in (errno.EACCES,):
479 # In case of error, try to remove win32 forbidden chars
480 alt_filename = os.path.join(
481 re.sub(u'[/<>:"\\|\\\\?\\*]', u'#', path_part)
482 for path_part in os.path.split(filename)
484 if alt_filename == filename:
487 # An exception here should be caught in the caller
488 stream = open(encodeFilename(filename), open_mode)
489 return (stream, alt_filename)
492 def timeconvert(timestr):
493 """Convert RFC 2822 defined time string into system timestamp"""
495 timetuple = email.utils.parsedate_tz(timestr)
496 if timetuple is not None:
497 timestamp = email.utils.mktime_tz(timetuple)
500 def sanitize_filename(s, restricted=False, is_id=False):
501 """Sanitizes a string so it could be used as part of a filename.
502 If restricted is set, use a stricter subset of allowed characters.
503 Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
505 def replace_insane(char):
506 if char == '?' or ord(char) < 32 or ord(char) == 127:
509 return '' if restricted else '\''
511 return '_-' if restricted else ' -'
512 elif char in '\\/|*<>':
514 if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
516 if restricted and ord(char) > 127:
520 result = u''.join(map(replace_insane, s))
522 while '__' in result:
523 result = result.replace('__', '_')
524 result = result.strip('_')
525 # Common case of "Foreign band name - English song title"
526 if restricted and result.startswith('-_'):
532 def orderedSet(iterable):
533 """ Remove all duplicates from the input iterable """
541 def _htmlentity_transform(entity):
542 """Transforms an HTML entity to a character."""
543 # Known non-numeric HTML entity
544 if entity in compat_html_entities.name2codepoint:
545 return compat_chr(compat_html_entities.name2codepoint[entity])
547 mobj = re.match(r'#(x?[0-9]+)', entity)
549 numstr = mobj.group(1)
550 if numstr.startswith(u'x'):
552 numstr = u'0%s' % numstr
555 return compat_chr(int(numstr, base))
557 # Unknown entity in name, return its literal representation
558 return (u'&%s;' % entity)
564 assert type(s) == compat_str
567 r'&([^;]+);', lambda m: _htmlentity_transform(m.group(1)), s)
570 def encodeFilename(s, for_subprocess=False):
572 @param s The name of the file
575 assert type(s) == compat_str
577 # Python 3 has a Unicode API
578 if sys.version_info >= (3, 0):
581 if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
582 # Pass u'' directly to use Unicode APIs on Windows 2000 and up
583 # (Detecting Windows NT 4 is tricky because 'major >= 4' would
584 # match Windows 9x series as well. Besides, NT 4 is obsolete.)
585 if not for_subprocess:
588 # For subprocess calls, encode with locale encoding
589 # Refer to http://stackoverflow.com/a/9951851/35070
590 encoding = preferredencoding()
592 encoding = sys.getfilesystemencoding()
595 return s.encode(encoding, 'ignore')
598 def encodeArgument(s):
599 if not isinstance(s, compat_str):
600 # Legacy code that uses byte strings
601 # Uncomment the following line after fixing all post processors
602 #assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
603 s = s.decode('ascii')
604 return encodeFilename(s, True)
607 def decodeOption(optval):
610 if isinstance(optval, bytes):
611 optval = optval.decode(preferredencoding())
613 assert isinstance(optval, compat_str)
616 def formatSeconds(secs):
618 return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
620 return '%d:%02d' % (secs // 60, secs % 60)
625 def make_HTTPS_handler(opts_no_check_certificate, **kwargs):
626 if sys.version_info < (3, 2):
629 class HTTPSConnectionV3(httplib.HTTPSConnection):
630 def __init__(self, *args, **kwargs):
631 httplib.HTTPSConnection.__init__(self, *args, **kwargs)
634 sock = socket.create_connection((self.host, self.port), self.timeout)
635 if getattr(self, '_tunnel_host', False):
639 self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_TLSv1)
641 self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_SSLv23)
643 class HTTPSHandlerV3(compat_urllib_request.HTTPSHandler):
644 def https_open(self, req):
645 return self.do_open(HTTPSConnectionV3, req)
646 return HTTPSHandlerV3(**kwargs)
647 elif hasattr(ssl, 'create_default_context'): # Python >= 3.4
648 context = ssl.create_default_context(ssl.Purpose.CLIENT_AUTH)
649 context.options &= ~ssl.OP_NO_SSLv3 # Allow older, not-as-secure SSLv3
650 if opts_no_check_certificate:
651 context.verify_mode = ssl.CERT_NONE
652 return compat_urllib_request.HTTPSHandler(context=context, **kwargs)
654 context = ssl.SSLContext(ssl.PROTOCOL_SSLv23)
655 context.verify_mode = (ssl.CERT_NONE
656 if opts_no_check_certificate
657 else ssl.CERT_REQUIRED)
658 context.set_default_verify_paths()
660 context.load_default_certs()
661 except AttributeError:
663 return compat_urllib_request.HTTPSHandler(context=context, **kwargs)
665 class ExtractorError(Exception):
666 """Error during info extraction."""
667 def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
668 """ tb, if given, is the original traceback (so that it can be printed out).
669 If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
672 if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
674 if video_id is not None:
675 msg = video_id + ': ' + msg
677 msg = msg + u'; please report this issue on https://yt-dl.org/bug . Be sure to call youtube-dl with the --verbose flag and include its complete output. Make sure you are using the latest version; type youtube-dl -U to update.'
678 super(ExtractorError, self).__init__(msg)
681 self.exc_info = sys.exc_info() # preserve original exception
683 self.video_id = video_id
685 def format_traceback(self):
686 if self.traceback is None:
688 return u''.join(traceback.format_tb(self.traceback))
691 class RegexNotFoundError(ExtractorError):
692 """Error when a regex didn't match"""
696 class DownloadError(Exception):
697 """Download Error exception.
699 This exception may be thrown by FileDownloader objects if they are not
700 configured to continue on errors. They will contain the appropriate
703 def __init__(self, msg, exc_info=None):
704 """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
705 super(DownloadError, self).__init__(msg)
706 self.exc_info = exc_info
709 class SameFileError(Exception):
710 """Same File exception.
712 This exception will be thrown by FileDownloader objects if they detect
713 multiple files would have to be downloaded to the same file on disk.
718 class PostProcessingError(Exception):
719 """Post Processing exception.
721 This exception may be raised by PostProcessor's .run() method to
722 indicate an error in the postprocessing task.
724 def __init__(self, msg):
727 class MaxDownloadsReached(Exception):
728 """ --max-downloads limit has been reached. """
732 class UnavailableVideoError(Exception):
733 """Unavailable Format exception.
735 This exception will be thrown when a video is requested
736 in a format that is not available for that video.
741 class ContentTooShortError(Exception):
742 """Content Too Short exception.
744 This exception may be raised by FileDownloader objects when a file they
745 download is too small for what the server announced first, indicating
746 the connection was probably interrupted.
def __init__(self, downloaded, expected):
    # downloaded: amount actually received; expected: amount the server
    # announced up front (presumably byte counts — TODO confirm units).
    self.downloaded = downloaded
    self.expected = expected
756 class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
757 """Handler for HTTP requests and responses.
759 This class, when installed with an OpenerDirector, automatically adds
760 the standard headers to every HTTP request and handles gzipped and
761 deflated responses from web servers. If compression is to be avoided in
762 a particular request, the original request in the program code only has
763 to include the HTTP header "Youtubedl-No-Compression", which will be
764 removed before making the real request.
766 Part of this code was copied from:
768 http://techknack.net/python-urllib2-handlers/
770 Andrew Rowls, the author of that code, agreed to release it to the
777 return zlib.decompress(data, -zlib.MAX_WBITS)
779 return zlib.decompress(data)
782 def addinfourl_wrapper(stream, headers, url, code):
783 if hasattr(compat_urllib_request.addinfourl, 'getcode'):
784 return compat_urllib_request.addinfourl(stream, headers, url, code)
785 ret = compat_urllib_request.addinfourl(stream, headers, url)
789 def http_request(self, req):
790 for h, v in std_headers.items():
791 if h not in req.headers:
793 if 'Youtubedl-no-compression' in req.headers:
794 if 'Accept-encoding' in req.headers:
795 del req.headers['Accept-encoding']
796 del req.headers['Youtubedl-no-compression']
797 if 'Youtubedl-user-agent' in req.headers:
798 if 'User-agent' in req.headers:
799 del req.headers['User-agent']
800 req.headers['User-agent'] = req.headers['Youtubedl-user-agent']
801 del req.headers['Youtubedl-user-agent']
803 if sys.version_info < (2, 7) and '#' in req.get_full_url():
804 # Python 2.6 is brain-dead when it comes to fragments
805 req._Request__original = req._Request__original.partition('#')[0]
806 req._Request__r_type = req._Request__r_type.partition('#')[0]
810 def http_response(self, req, resp):
813 if resp.headers.get('Content-encoding', '') == 'gzip':
814 content = resp.read()
815 gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
817 uncompressed = io.BytesIO(gz.read())
818 except IOError as original_ioerror:
819 # There may be junk add the end of the file
820 # See http://stackoverflow.com/q/4928560/35070 for details
821 for i in range(1, 1024):
823 gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
824 uncompressed = io.BytesIO(gz.read())
829 raise original_ioerror
830 resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
831 resp.msg = old_resp.msg
833 if resp.headers.get('Content-encoding', '') == 'deflate':
834 gz = io.BytesIO(self.deflate(resp.read()))
835 resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
836 resp.msg = old_resp.msg
839 https_request = http_request
840 https_response = http_response
843 def parse_iso8601(date_str, delimiter='T'):
844 """ Return a UNIX timestamp from the given date """
850 r'Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$',
853 timezone = datetime.timedelta()
855 date_str = date_str[:-len(m.group(0))]
856 if not m.group('sign'):
857 timezone = datetime.timedelta()
859 sign = 1 if m.group('sign') == '+' else -1
860 timezone = datetime.timedelta(
861 hours=sign * int(m.group('hours')),
862 minutes=sign * int(m.group('minutes')))
863 date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
864 dt = datetime.datetime.strptime(date_str, date_format) - timezone
865 return calendar.timegm(dt.timetuple())
868 def unified_strdate(date_str):
869 """Return a string with the date in the format YYYYMMDD"""
876 date_str = date_str.replace(',', ' ')
877 # %z (UTC offset) is only supported in python>=3.2
878 date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str)
879 format_expressions = [
884 '%b %dst %Y %I:%M%p',
885 '%b %dnd %Y %I:%M%p',
886 '%b %dth %Y %I:%M%p',
897 '%Y-%m-%dT%H:%M:%SZ',
898 '%Y-%m-%dT%H:%M:%S.%fZ',
899 '%Y-%m-%dT%H:%M:%S.%f0Z',
901 '%Y-%m-%dT%H:%M:%S.%f',
904 for expression in format_expressions:
906 upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
909 if upload_date is None:
910 timetuple = email.utils.parsedate_tz(date_str)
912 upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
915 def determine_ext(url, default_ext=u'unknown_video'):
918 guess = url.partition(u'?')[0].rpartition(u'.')[2]
919 if re.match(r'^[A-Za-z0-9]+$', guess):
def subtitles_filename(filename, sub_lang, sub_format):
    """Build a subtitle filename: strip the media extension, then append
    the language code and the subtitle format."""
    base = filename.rsplit('.', 1)[0]
    return u'.'.join([base, sub_lang, sub_format])
927 def date_from_str(date_str):
929 Return a datetime object from a string in the format YYYYMMDD or
930 (now|today)[+-][0-9](day|week|month|year)(s)?"""
931 today = datetime.date.today()
932 if date_str == 'now'or date_str == 'today':
934 match = re.match('(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
935 if match is not None:
936 sign = match.group('sign')
937 time = int(match.group('time'))
940 unit = match.group('unit')
949 delta = datetime.timedelta(**{unit: time})
951 return datetime.datetime.strptime(date_str, "%Y%m%d").date()
953 def hyphenate_date(date_str):
955 Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
956 match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
957 if match is not None:
958 return '-'.join(match.groups())
962 class DateRange(object):
963 """Represents a time interval between two dates"""
964 def __init__(self, start=None, end=None):
965 """start and end must be strings in the format accepted by date"""
966 if start is not None:
967 self.start = date_from_str(start)
969 self.start = datetime.datetime.min.date()
971 self.end = date_from_str(end)
973 self.end = datetime.datetime.max.date()
974 if self.start > self.end:
975 raise ValueError('Date range: "%s" , the start date must be before the end date' % self)
978 """Returns a range that only contains the given day"""
980 def __contains__(self, date):
981 """Check if the date is in the range"""
982 if not isinstance(date, datetime.date):
983 date = date_from_str(date)
984 return self.start <= date <= self.end
986 return '%s - %s' % ( self.start.isoformat(), self.end.isoformat())
990 """ Returns the platform name as a compat_str """
991 res = platform.platform()
992 if isinstance(res, bytes):
993 res = res.decode(preferredencoding())
995 assert isinstance(res, compat_str)
999 def _windows_write_string(s, out):
1000 """ Returns True if the string was written using special methods,
1001 False if it has yet to be written out."""
1002 # Adapted from http://stackoverflow.com/a/3259271/35070
1005 import ctypes.wintypes
1013 fileno = out.fileno()
1014 except AttributeError:
1015 # If the output stream doesn't have a fileno, it's virtual
1017 if fileno not in WIN_OUTPUT_IDS:
1020 GetStdHandle = ctypes.WINFUNCTYPE(
1021 ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
1022 ("GetStdHandle", ctypes.windll.kernel32))
1023 h = GetStdHandle(WIN_OUTPUT_IDS[fileno])
1025 WriteConsoleW = ctypes.WINFUNCTYPE(
1026 ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
1027 ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
1028 ctypes.wintypes.LPVOID)(("WriteConsoleW", ctypes.windll.kernel32))
1029 written = ctypes.wintypes.DWORD(0)
1031 GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)(("GetFileType", ctypes.windll.kernel32))
1032 FILE_TYPE_CHAR = 0x0002
1033 FILE_TYPE_REMOTE = 0x8000
1034 GetConsoleMode = ctypes.WINFUNCTYPE(
1035 ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
1036 ctypes.POINTER(ctypes.wintypes.DWORD))(
1037 ("GetConsoleMode", ctypes.windll.kernel32))
1038 INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value
1040 def not_a_console(handle):
1041 if handle == INVALID_HANDLE_VALUE or handle is None:
1043 return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR
1044 or GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)
1046 if not_a_console(h):
1049 def next_nonbmp_pos(s):
1051 return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
1052 except StopIteration:
1056 count = min(next_nonbmp_pos(s), 1024)
1058 ret = WriteConsoleW(
1059 h, s, count if count else 2, ctypes.byref(written), None)
1061 raise OSError('Failed to write string')
1062 if not count: # We just wrote a non-BMP character
1063 assert written.value == 2
1066 assert written.value > 0
1067 s = s[written.value:]
1071 def write_string(s, out=None, encoding=None):
1074 assert type(s) == compat_str
1076 if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
1077 if _windows_write_string(s, out):
1080 if ('b' in getattr(out, 'mode', '') or
1081 sys.version_info[0] < 3): # Python 2 lies about mode of sys.stderr
1082 byt = s.encode(encoding or preferredencoding(), 'ignore')
1084 elif hasattr(out, 'buffer'):
1085 enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
1086 byt = s.encode(enc, 'ignore')
1087 out.buffer.write(byt)
1093 def bytes_to_intlist(bs):
1096 if isinstance(bs[0], int): # Python 3
1099 return [ord(c) for c in bs]
1102 def intlist_to_bytes(xs):
1105 if isinstance(chr(0), bytes): # Python 2
1106 return ''.join([chr(x) for x in xs])
1111 # Cross-platform file locking
1112 if sys.platform == 'win32':
1113 import ctypes.wintypes
1116 class OVERLAPPED(ctypes.Structure):
1118 ('Internal', ctypes.wintypes.LPVOID),
1119 ('InternalHigh', ctypes.wintypes.LPVOID),
1120 ('Offset', ctypes.wintypes.DWORD),
1121 ('OffsetHigh', ctypes.wintypes.DWORD),
1122 ('hEvent', ctypes.wintypes.HANDLE),
1125 kernel32 = ctypes.windll.kernel32
1126 LockFileEx = kernel32.LockFileEx
1127 LockFileEx.argtypes = [
1128 ctypes.wintypes.HANDLE, # hFile
1129 ctypes.wintypes.DWORD, # dwFlags
1130 ctypes.wintypes.DWORD, # dwReserved
1131 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
1132 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
1133 ctypes.POINTER(OVERLAPPED) # Overlapped
1135 LockFileEx.restype = ctypes.wintypes.BOOL
1136 UnlockFileEx = kernel32.UnlockFileEx
1137 UnlockFileEx.argtypes = [
1138 ctypes.wintypes.HANDLE, # hFile
1139 ctypes.wintypes.DWORD, # dwReserved
1140 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
1141 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
1142 ctypes.POINTER(OVERLAPPED) # Overlapped
1144 UnlockFileEx.restype = ctypes.wintypes.BOOL
1145 whole_low = 0xffffffff
1146 whole_high = 0x7fffffff
def _lock_file(f, exclusive):
    """Lock the whole of file *f* via the Windows LockFileEx API."""
    overlapped = OVERLAPPED()
    overlapped.Offset = 0
    overlapped.OffsetHigh = 0
    overlapped.hEvent = 0
    # Keep the OVERLAPPED struct alive on the file object: the matching
    # UnlockFileEx call must pass the same range/offset information.
    f._lock_file_overlapped_p = ctypes.pointer(overlapped)
    handle = msvcrt.get_osfhandle(f.fileno())
    # dwFlags: 0x2 = LOCKFILE_EXCLUSIVE_LOCK, 0x0 requests a shared lock.
    if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
                      whole_low, whole_high, f._lock_file_overlapped_p):
        raise OSError('Locking file failed: %r' % ctypes.FormatError())

def _unlock_file(f):
    """Release the range locked by _lock_file via UnlockFileEx."""
    assert f._lock_file_overlapped_p
    handle = msvcrt.get_osfhandle(f.fileno())
    if not UnlockFileEx(handle, 0,
                        whole_low, whole_high, f._lock_file_overlapped_p):
        raise OSError('Unlocking file failed: %r' % ctypes.FormatError())
def _lock_file(f, exclusive):
    """Take an advisory flock on *f*: exclusive when writing, shared otherwise."""
    if exclusive:
        fcntl.flock(f, fcntl.LOCK_EX)
    else:
        fcntl.flock(f, fcntl.LOCK_SH)

def _unlock_file(f):
    """Release the advisory flock held on *f*."""
    fcntl.flock(f, fcntl.LOCK_UN)
1176 class locked_file(object):
1177 def __init__(self, filename, mode, encoding=None):
1178 assert mode in ['r', 'a', 'w']
1179 self.f = io.open(filename, mode, encoding=encoding)
1182 def __enter__(self):
1183 exclusive = self.mode != 'r'
1185 _lock_file(self.f, exclusive)
1191 def __exit__(self, etype, value, traceback):
1193 _unlock_file(self.f)
1200 def write(self, *args):
1201 return self.f.write(*args)
1203 def read(self, *args):
1204 return self.f.read(*args)
1207 def shell_quote(args):
1209 encoding = sys.getfilesystemencoding()
1210 if encoding is None:
1213 if isinstance(a, bytes):
1214 # We may get a filename encoded with 'encodeFilename'
1215 a = a.decode(encoding)
1216 quoted_args.append(pipes.quote(a))
1217 return u' '.join(quoted_args)
1220 def takewhile_inclusive(pred, seq):
1221 """ Like itertools.takewhile, but include the latest evaluated element
1222 (the first element so that Not pred(e)) """
def smuggle_url(url, data):
    """Embed *data* (JSON-encoded) in the fragment of *url* for internal use."""
    payload = {u'__youtubedl_smuggle': json.dumps(data)}
    return u'#'.join([url, compat_urllib_parse.urlencode(payload)])
1237 def unsmuggle_url(smug_url, default=None):
1238 if not '#__youtubedl_smuggle' in smug_url:
1239 return smug_url, default
1240 url, _, sdata = smug_url.rpartition(u'#')
1241 jsond = compat_parse_qs(sdata)[u'__youtubedl_smuggle'][0]
1242 data = json.loads(jsond)
1246 def format_bytes(bytes):
1249 if type(bytes) is str:
1250 bytes = float(bytes)
1254 exponent = int(math.log(bytes, 1024.0))
1255 suffix = [u'B', u'KiB', u'MiB', u'GiB', u'TiB', u'PiB', u'EiB', u'ZiB', u'YiB'][exponent]
1256 converted = float(bytes) / float(1024 ** exponent)
1257 return u'%.2f%s' % (converted, suffix)
1260 def get_term_width():
1261 columns = os.environ.get('COLUMNS', None)
1266 sp = subprocess.Popen(
1268 stdout=subprocess.PIPE, stderr=subprocess.PIPE)
1269 out, err = sp.communicate()
1270 return int(out.split()[1])
1276 def month_by_name(name):
1277 """ Return the number of a month by (locale-independently) English name """
1280 u'January', u'February', u'March', u'April', u'May', u'June',
1281 u'July', u'August', u'September', u'October', u'November', u'December']
1283 return ENGLISH_NAMES.index(name) + 1
1288 def fix_xml_ampersands(xml_str):
1289 """Replace all the '&' by '&' in XML"""
1291 r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
# Set the process title via libc prctl() on glibc systems; silently a no-op
# when libc lacks prctl.  The try: around LoadLibrary/prctl and its OSError
# handling are partly elided in this excerpt.
1296 def setproctitle(title):
1297 assert isinstance(title, compat_str)
1299 libc = ctypes.cdll.LoadLibrary("libc.so.6")
1302 title_bytes = title.encode('utf-8')
1303 buf = ctypes.create_string_buffer(len(title_bytes))
1304 buf.value = title_bytes
# 15 == PR_SET_NAME
1306 libc.prctl(15, buf, 0, 0, 0)
1307 except AttributeError:
1308 return # Strange libc, just skip this
def remove_start(s, start):
    """Return *s* with the prefix *start* removed.

    Fix: the visible code only returned on a match, so a string NOT
    beginning with *start* fell through and yielded None; now the original
    string is returned unchanged in that case.
    """
    if s.startswith(start):
        return s[len(start):]
    return s
def remove_end(s, end):
    """Return *s* with the suffix *end* removed.

    Fixes two defects in the visible code: it sliced unconditionally (so a
    non-matching suffix was still chopped off), and an empty *end* produced
    s[:-0] == '' instead of leaving the string intact.
    """
    if end and s.endswith(end):
        return s[:-len(end)]
    return s
def url_basename(url):
    """Return the last path segment of *url* ('' when the path is empty)."""
    parsed_path = compat_urlparse.urlparse(url).path
    segments = parsed_path.strip(u'/').split(u'/')
    return segments[-1]
# Request subclass whose get_method() forces the HTTP HEAD verb; the
# 'return "HEAD"' body line is elided in this excerpt.
1328 class HEADRequest(compat_urllib_request.Request):
1329 def get_method(self):
# Lenient int conversion: optionally dereference get_attr on v first (the
# guard lines around getattr are elided in this excerpt), then scale the
# result by invscale/scale; returns default when v ends up None.
1333 def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
1336 v = getattr(v, get_attr, None)
# Floor division keeps the result an int.
1339 return default if v is None else (int(v) * invscale // scale)
def str_or_none(v, default=None):
    """Stringify *v* via compat_str, or return *default* when it is None."""
    if v is None:
        return default
    return compat_str(v)
1346 def str_to_int(int_str):
1347 """ A more relaxed version of int_or_none """
# Strip thousands separators (commas, dots) and '+' signs before conversion;
# the None-guard and the final int() return are elided in this excerpt.
1350 int_str = re.sub(r'[,\.\+]', u'', int_str)
def float_or_none(v, scale=1, invscale=1, default=None):
    """Convert *v* to float scaled by invscale/scale; *default* if v is None."""
    if v is None:
        return default
    return float(v) * invscale / scale
# Parse durations like '1:23:45', '12 mins 3 s' or '90s' into seconds; the
# None/no-match guards and the final return are elided in this excerpt.
1358 def parse_duration(s):
1365 r'(?i)(?:(?:(?P<hours>[0-9]+)\s*(?:[:h]|hours?)\s*)?(?P<mins>[0-9]+)\s*(?:[:m]|mins?|minutes?)\s*)?(?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*(?:s|secs?|seconds?)?$', s)
# Accumulate: seconds first, then minutes and hours scaled into seconds.
1368 res = int(m.group('secs'))
1370 res += int(m.group('mins')) * 60
1371 if m.group('hours'):
1372 res += int(m.group('hours')) * 60 * 60
# Fractional seconds (the '.xxx' group) make the result a float.
1374 res += float(m.group('ms'))
def prepend_extension(filename, ext):
    """Insert *ext* before the real extension: 'a.mp4' + 'temp' -> 'a.temp.mp4'."""
    base, real_ext = os.path.splitext(filename)
    return base + u'.' + ext + real_ext
1383 def check_executable(exe, args=[]):
1384 """ Checks if the given binary is installed somewhere in PATH, and returns its name.
1385 args can be a list of arguments for a short output (like -version) """
# NOTE(review): the mutable default `args=[]` is only read, never mutated, so
# it is harmless here.  The try/except around Popen (returning False when the
# binary cannot be launched) is elided in this excerpt.
1387 subprocess.Popen([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
# Abstract base for lazily-fetched paged collections; subclasses implement
# getslice().  The `def __len__(self):` line is elided in this excerpt.
1393 class PagedList(object):
1395 # This is only useful for tests
1396 return len(self.getslice())
# PagedList backed by a pagefunc(pagenum) callable that is queried on demand,
# page by page.  Several scaffolding lines of getslice() (res = [], break
# statements, startv/endv assignment heads) are elided in this excerpt.
1399 class OnDemandPagedList(PagedList):
1400 def __init__(self, pagefunc, pagesize):
1401 self._pagefunc = pagefunc
1402 self._pagesize = pagesize
# Collect the half-open element range [start, end) across pages.
1404 def getslice(self, start=0, end=None):
1406 for pagenum in itertools.count(start // self._pagesize):
1407 firstid = pagenum * self._pagesize
1408 nextfirstid = pagenum * self._pagesize + self._pagesize
1409 if start >= nextfirstid:
1412 page_results = list(self._pagefunc(pagenum))
# Offsets of the requested range within the current page.
1415 start % self._pagesize
1416 if firstid <= start < nextfirstid
1420 ((end - 1) % self._pagesize) + 1
1421 if (end is not None and firstid <= end <= nextfirstid)
1424 if startv != 0 or endv is not None:
1425 page_results = page_results[startv:endv]
1426 res.extend(page_results)
1428 # A little optimization - if current page is not "full", ie. does
1429 # not contain page_size videos then we can assume that this page
1430 # is the last one - there are no more ids on further pages -
1431 # i.e. no need to query again.
1432 if len(page_results) + startv < self._pagesize:
1435 # If we got the whole page, but the next page is not interesting,
1436 # break out early as well
1437 if end == nextfirstid:
# PagedList variant where the total page count is known up front, so only the
# pages overlapping the requested range are fetched.  Some lines of
# getslice() (res = [], end_page head, break/yield scaffolding) are elided in
# this excerpt.
1442 class InAdvancePagedList(PagedList):
1443 def __init__(self, pagefunc, pagecount, pagesize):
1444 self._pagefunc = pagefunc
1445 self._pagecount = pagecount
1446 self._pagesize = pagesize
1448 def getslice(self, start=0, end=None):
1450 start_page = start // self._pagesize
1452 self._pagecount if end is None else (end // self._pagesize + 1))
# Elements to drop from the first page, and how many more are still wanted.
1453 skip_elems = start - start_page * self._pagesize
1454 only_more = None if end is None else end - start
1455 for pagenum in range(start_page, end_page):
1456 page = list(self._pagefunc(pagenum))
1458 page = page[skip_elems:]
1460 if only_more is not None:
1461 if len(page) < only_more:
1462 only_more -= len(page)
1464 page = page[:only_more]
# Decode literal '\UXXXXXXXX' escape sequences inside s; the enclosing
# re.sub(...) call head and the trailing `s)` argument are elided in this
# excerpt.
1471 def uppercase_escape(s):
1472 unicode_escape = codecs.getdecoder('unicode_escape')
1474 r'\\U[0-9a-fA-F]{8}',
1475 lambda m: unicode_escape(m.group(0))[0],
def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986"""
    # On Python 2, quote() needs bytes.  The outer version check must stay
    # first so the name `unicode` is never evaluated on Python 3.
    if sys.version_info < (3, 0):
        if isinstance(s, unicode):
            s = s.encode('utf-8')
    return compat_urllib_parse.quote(s, "%/;:@&=+$,!~*'()?#[]")
1486 def escape_url(url):
1487 """Escape URL as suggested by RFC 3986"""
1488 url_parsed = compat_urllib_parse_urlparse(url)
# Escape each component separately so structural separators ('/', '?', '#')
# survive; the closing ').geturl()' is elided in this excerpt.
1489 return url_parsed._replace(
1490 path=escape_rfc3986(url_parsed.path),
1491 params=escape_rfc3986(url_parsed.params),
1492 query=escape_rfc3986(url_parsed.query),
1493 fragment=escape_rfc3986(url_parsed.fragment)
# Feature probe: on Python 2.6 (and some 2.7 builds) struct.pack rejects
# unicode format strings, so wrapped versions encode the spec to bytes first.
# The try:/except TypeError:/else: scaffolding is partly elided in this
# excerpt.
1497 struct.pack(u'!I', 0)
1499 # In Python 2.6 (and some 2.7 versions), struct requires a bytes argument
1500 def struct_pack(spec, *args):
1501 if isinstance(spec, compat_str):
1502 spec = spec.encode('ascii')
1503 return struct.pack(spec, *args)
1505 def struct_unpack(spec, *args):
1506 if isinstance(spec, compat_str):
1507 spec = spec.encode('ascii')
1508 return struct.unpack(spec, *args)
# Modern interpreters: alias the struct module functions directly.
1510 struct_pack = struct.pack
1511 struct_unpack = struct.unpack
# Read URLs from a file-like object (--batch-file): decode bytes, strip a
# BOM, and drop comment lines.  The inner `def fixup(url):` line, the strip()
# call and the comment-line return are elided in this excerpt.
1514 def read_batch_urls(batch_fd):
1516 if not isinstance(url, compat_str):
1517 url = url.decode('utf-8', 'replace')
# NOTE(review): this is the UTF-8 BOM as mis-decoded single-byte characters
# (u'\xef\xbb\xbf'), i.e. what a wrongly-decoded BOM-prefixed line starts with.
1518 BOM_UTF8 = u'\xef\xbb\xbf'
1519 if url.startswith(BOM_UTF8):
1520 url = url[len(BOM_UTF8):]
# Lines starting with '#', ';' or ']' are treated as comments.
1522 if url.startswith(('#', ';', ']')):
1526 with contextlib.closing(batch_fd) as fd:
1527 return [url for url in map(fixup, fd) if url]
def urlencode_postdata(*args, **kargs):
    """URL-encode POST data and return it as ASCII bytes for urlopen()."""
    encoded = compat_urllib_parse.urlencode(*args, **kargs)
    return encoded.encode('ascii')
# Element.iter does not exist on Python <= 2.6; fall back to findall('.//*').
# The opening try: line is elided in this excerpt.
1535 etree_iter = xml.etree.ElementTree.Element.iter
1536 except AttributeError: # Python <=2.6
1537 etree_iter = lambda n: n.findall('.//*')
# Interior of parse_xml(s) — the `def parse_xml(s):` line is elided in this
# excerpt.  Parses an XML string while ignoring doctypes and normalising
# element text to unicode on Python 2.x.
1541 class TreeBuilder(xml.etree.ElementTree.TreeBuilder):
1542 def doctype(self, name, pubid, system):
1543 pass # Ignore doctypes
1545 parser = xml.etree.ElementTree.XMLParser(target=TreeBuilder())
# Passing a custom parser to XML() only works on >= 2.7.
1546 kwargs = {'parser': parser} if sys.version_info >= (2, 7) else {}
1547 tree = xml.etree.ElementTree.XML(s.encode('utf-8'), **kwargs)
1548 # Fix up XML parser in Python 2.x
1549 if sys.version_info < (3, 0):
1550 for n in etree_iter(tree):
1551 if n.text is not None:
1552 if not isinstance(n.text, compat_str):
1553 n.text = n.text.decode('utf-8')
# On Windows/Python 2 getpass chokes on unicode prompts, so encode the prompt
# first; everywhere else alias getpass directly.  The `else:` line pairing
# the plain alias is elided in this excerpt.
1557 if sys.version_info < (3, 0) and sys.platform == 'win32':
1558 def compat_getpass(prompt, *args, **kwargs):
1559 if isinstance(prompt, compat_str):
1560 prompt = prompt.encode(preferredencoding())
1561 return getpass.getpass(prompt, *args, **kwargs)
1563 compat_getpass = getpass.getpass
def strip_jsonp(code):
    """Strip a JSONP callback wrapper, returning the inner payload unchanged
    when no wrapper is present."""
    callback_re = r'(?s)^[a-zA-Z0-9_]+\s*\(\s*(.*)\);?\s*?\s*$'
    return re.sub(callback_re, r'\1', code)
# Convert a JavaScript object literal into strict JSON: requote
# single-quoted keys/values and drop trailing commas.  Several lines (the
# fix_kv def head, key/value group extraction, parts of the verbose regex and
# the final return) are elided in this excerpt.
1579 def js_to_json(code):
1582 if key.startswith("'"):
1583 assert key.endswith("'")
1584 assert '"' not in key
# Requote a single-quoted key with double quotes.
1585 key = '"%s"' % key[1:-1]
1586 elif not key.startswith('"'):
1590 if value.startswith("'"):
1591 assert value.endswith("'")
1592 assert '"' not in value
1593 value = '"%s"' % value[1:-1]
1595 return m.group(1) + key + m.group(3) + value
1597 res = re.sub(r'''(?x)
1599 ("[^"]*"|\'[^\']*\'|[a-z0-9A-Z]+)
1601 ([0-9.]+|true|false|"[^"]*"|\'[^\']*\'|\[|\{)
# Remove a trailing comma before a closing bracket.
1603 res = re.sub(r',(\s*\])', lambda m: m.group(1), res)
1607 def qualities(quality_ids):
1608 """ Get a numeric quality value out of a list of possible values """
# Returns a closure mapping a quality id to its position in quality_ids; the
# inner `def q(qid):` line and the final `return q` are elided in this
# excerpt.
1611 return quality_ids.index(qid)
# Default output filename template used when the user supplies no -o option.
1617 DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
# Backport of subprocess.check_output for Python 2.6.  The opening try:, the
# `ret = p.wait()` / `if ret:` lines and the final `return output` are elided
# in this excerpt.
1620 subprocess_check_output = subprocess.check_output
1621 except AttributeError:
1622 def subprocess_check_output(*args, **kwargs):
# The backport does not support check_output's `input` keyword.
1623 assert 'input' not in kwargs
1624 p = subprocess.Popen(*args, stdout=subprocess.PIPE, **kwargs)
1625 output, _ = p.communicate()
1628 raise subprocess.CalledProcessError(ret, p.args, output=output)
1632 def limit_length(s, length):
1633 """ Add ellipses to overly long strings """
# Truncate so the result including the ellipsis marker fits within `length`
# characters; the None/short-string guards and the ELLIPSES constant are
# elided in this excerpt.
1638 return s[:length - len(ELLIPSES)] + ELLIPSES