youtube_dl/utils.py

   1 #!/usr/bin/env python
   2 # -*- coding: utf-8 -*-
   3
   4 import contextlib
   5 import ctypes
   6 import datetime
   7 import email.utils
   8 import errno
   9 import getpass
  10 import gzip
  11 import itertools
  12 import io
  13 import json
  14 import locale
  15 import math
  16 import os
  17 import pipes
  18 import platform
  19 import re
  20 import ssl
  21 import socket
  22 import struct
  23 import subprocess
  24 import sys
  25 import traceback
  26 import xml.etree.ElementTree
  27 import zlib
  28
  29 try:
  30     import urllib.request as compat_urllib_request
  31 except ImportError: # Python 2
  32     import urllib2 as compat_urllib_request
  33
  34 try:
  35     import urllib.error as compat_urllib_error
  36 except ImportError: # Python 2
  37     import urllib2 as compat_urllib_error
  38
  39 try:
  40     import urllib.parse as compat_urllib_parse
  41 except ImportError: # Python 2
  42     import urllib as compat_urllib_parse
  43
  44 try:
  45     from urllib.parse import urlparse as compat_urllib_parse_urlparse
  46 except ImportError: # Python 2
  47     from urlparse import urlparse as compat_urllib_parse_urlparse
  48
  49 try:
  50     import urllib.parse as compat_urlparse
  51 except ImportError: # Python 2
  52     import urlparse as compat_urlparse
  53
  54 try:
  55     import http.cookiejar as compat_cookiejar
  56 except ImportError: # Python 2
  57     import cookielib as compat_cookiejar
  58
  59 try:
  60     import html.entities as compat_html_entities
  61 except ImportError: # Python 2
  62     import htmlentitydefs as compat_html_entities
  63
  64 try:
  65     import html.parser as compat_html_parser
  66 except ImportError: # Python 2
  67     import HTMLParser as compat_html_parser
  68
  69 try:
  70     import http.client as compat_http_client
  71 except ImportError: # Python 2
  72     import httplib as compat_http_client
  73
  74 try:
  75     from urllib.error import HTTPError as compat_HTTPError
  76 except ImportError:  # Python 2
  77     from urllib2 import HTTPError as compat_HTTPError
  78
  79 try:
  80     from urllib.request import urlretrieve as compat_urlretrieve
  81 except ImportError:  # Python 2
  82     from urllib import urlretrieve as compat_urlretrieve
  83
  84
# compat_subprocess_get_DEVNULL() returns an object usable as a "discard"
# stream for subprocess calls. subprocess.DEVNULL is used when it can be
# imported; otherwise the null device is opened for writing.
try:
    from subprocess import DEVNULL
    compat_subprocess_get_DEVNULL = lambda: DEVNULL
except ImportError:
    # NOTE: every call of this fallback opens a new file object on os.path.devnull
    compat_subprocess_get_DEVNULL = lambda: open(os.path.devnull, 'w')
  90
# compat_parse_qs: use the standard library's parse_qs when urllib.parse is
# importable, otherwise fall back to the backported implementation below.
try:
    from urllib.parse import parse_qs as compat_parse_qs
except ImportError: # Python 2
    # HACK: The following is the correct parse_qs implementation from cpython 3's stdlib.
    # Python 2's version is apparently totally broken
    def _unquote(string, encoding='utf-8', errors='replace'):
        """Replace %xx escapes in string by the characters they encode.

        Runs of consecutive escapes are collected as bytes and decoded
        together with the given encoding/errors (defaults: utf-8/replace).
        """
        if string == '':
            return string
        res = string.split('%')
        if len(res) == 1:
            # No percent sign at all: nothing to decode
            return string
        if encoding is None:
            encoding = 'utf-8'
        if errors is None:
            errors = 'replace'
        # pct_sequence: contiguous sequence of percent-encoded bytes, decoded
        pct_sequence = b''
        string = res[0]
        for item in res[1:]:
            try:
                if not item:
                    raise ValueError
                # The first two characters after '%' are the hex value of one byte
                pct_sequence += item[:2].decode('hex')
                rest = item[2:]
                if not rest:
                    # This segment was just a single percent-encoded character.
                    # May be part of a sequence of code units, so delay decoding.
                    # (Stored in pct_sequence).
                    continue
            except ValueError:
                # Not a valid escape: keep the text (including the '%') verbatim
                rest = '%' + item
            # Encountered non-percent-encoded characters. Flush the current
            # pct_sequence.
            string += pct_sequence.decode(encoding, errors) + rest
            pct_sequence = b''
        if pct_sequence:
            # Flush the final pct_sequence
            string += pct_sequence.decode(encoding, errors)
        return string

    def _parse_qsl(qs, keep_blank_values=False, strict_parsing=False,
                encoding='utf-8', errors='replace'):
        """Parse a query string into a list of (name, value) pairs.

        Fields are separated by '&' or ';'. Fields with an empty value are
        dropped unless keep_blank_values is true; with strict_parsing a field
        without '=' raises ValueError instead of being skipped.
        """
        qs, _coerce_result = qs, unicode
        pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')]
        r = []
        for name_value in pairs:
            if not name_value and not strict_parsing:
                continue
            nv = name_value.split('=', 1)
            if len(nv) != 2:
                if strict_parsing:
                    raise ValueError("bad query field: %r" % (name_value,))
                # Handle case of a control-name with no equal sign
                if keep_blank_values:
                    nv.append('')
                else:
                    continue
            if len(nv[1]) or keep_blank_values:
                # '+' encodes a space and must be replaced before unquoting
                name = nv[0].replace('+', ' ')
                name = _unquote(name, encoding=encoding, errors=errors)
                name = _coerce_result(name)
                value = nv[1].replace('+', ' ')
                value = _unquote(value, encoding=encoding, errors=errors)
                value = _coerce_result(value)
                r.append((name, value))
        return r

    def compat_parse_qs(qs, keep_blank_values=False, strict_parsing=False,
                encoding='utf-8', errors='replace'):
        """Parse a query string into a dict mapping each name to a list of values.

        Values of a repeated name are collected in order of appearance.
        """
        parsed_result = {}
        pairs = _parse_qsl(qs, keep_blank_values, strict_parsing,
                        encoding=encoding, errors=errors)
        for name, value in pairs:
            if name in parsed_result:
                parsed_result[name].append(value)
            else:
                parsed_result[name] = [value]
        return parsed_result
 169
# Text type: `unicode` where that builtin exists, `str` otherwise
try:
    compat_str = unicode # Python 2
except NameError:
    compat_str = str

# Code point -> one-character text string: `unichr` where it exists, else `chr`
try:
    compat_chr = unichr # Python 2
except NameError:
    compat_chr = chr

# Exception type for XML parse errors: ElementTree's ParseError when it can
# be imported, otherwise expat's ExpatError
try:
    from xml.etree.ElementTree import ParseError as compat_xml_parse_error
except ImportError:  # Python 2.6
    from xml.parsers.expat import ExpatError as compat_xml_parse_error
 184
 185 def compat_ord(c):
 186     if type(c) is int: return c
 187     else: return ord(c)
 188
# This is not clearly defined otherwise
# (the type of compiled pattern objects, obtained by compiling an empty pattern)
compiled_regex_type = type(re.compile(''))

# Default HTTP request headers; YoutubeDLHandler.http_request() sets each of
# them on every request it processes.
std_headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0 (Chrome)',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
}
 199
 200 def preferredencoding():
 201     """Get preferred encoding.
 202
 203     Returns the best encoding scheme for the system, based on
 204     locale.getpreferredencoding() and some further tweaks.
 205     """
 206     try:
 207         pref = locale.getpreferredencoding()
 208         u'TEST'.encode(pref)
 209     except:
 210         pref = 'UTF-8'
 211
 212     return pref
 213
 214 if sys.version_info < (3,0):
 215     def compat_print(s):
 216         print(s.encode(preferredencoding(), 'xmlcharrefreplace'))
 217 else:
 218     def compat_print(s):
 219         assert type(s) == type(u'')
 220         print(s)
 221
 222 # In Python 2.x, json.dump expects a bytestream.
 223 # In Python 3.x, it writes to a character stream
 224 if sys.version_info < (3,0):
 225     def write_json_file(obj, fn):
 226         with open(fn, 'wb') as f:
 227             json.dump(obj, f)
 228 else:
 229     def write_json_file(obj, fn):
 230         with open(fn, 'w', encoding='utf-8') as f:
 231             json.dump(obj, f)
 232
 233 if sys.version_info >= (2,7):
 234     def find_xpath_attr(node, xpath, key, val):
 235         """ Find the xpath xpath[@key=val] """
 236         assert re.match(r'^[a-zA-Z]+$', key)
 237         assert re.match(r'^[a-zA-Z0-9@\s:._]*$', val)
 238         expr = xpath + u"[@%s='%s']" % (key, val)
 239         return node.find(expr)
 240 else:
 241     def find_xpath_attr(node, xpath, key, val):
 242         for f in node.findall(xpath):
 243             if f.attrib.get(key) == val:
 244                 return f
 245         return None
 246
 247 # On python2.6 the xml.etree.ElementTree.Element methods don't support
 248 # the namespace parameter
 249 def xpath_with_ns(path, ns_map):
 250     components = [c.split(':') for c in path.split('/')]
 251     replaced = []
 252     for c in components:
 253         if len(c) == 1:
 254             replaced.append(c[0])
 255         else:
 256             ns, tag = c
 257             replaced.append('{%s}%s' % (ns_map[ns], tag))
 258     return '/'.join(replaced)
 259
 260 def htmlentity_transform(matchobj):
 261     """Transforms an HTML entity to a character.
 262
 263     This function receives a match object and is intended to be used with
 264     the re.sub() function.
 265     """
 266     entity = matchobj.group(1)
 267
 268     # Known non-numeric HTML entity
 269     if entity in compat_html_entities.name2codepoint:
 270         return compat_chr(compat_html_entities.name2codepoint[entity])
 271
 272     mobj = re.match(u'(?u)#(x?\\d+)', entity)
 273     if mobj is not None:
 274         numstr = mobj.group(1)
 275         if numstr.startswith(u'x'):
 276             base = 16
 277             numstr = u'0%s' % numstr
 278         else:
 279             base = 10
 280         return compat_chr(int(numstr, base))
 281
 282     # Unknown entity in name, return its literal representation
 283     return (u'&%s;' % entity)
 284
# Replace the start-tag locating pattern of the HTML parser module with the
# version below (module-level monkeypatch, applied at import time).
compat_html_parser.locatestarttagend = re.compile(r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re.VERBOSE) # backport bugfix
 286 class BaseHTMLParser(compat_html_parser.HTMLParser):
 287     def __init(self):
 288         compat_html_parser.HTMLParser.__init__(self)
 289         self.html = None
 290
 291     def loads(self, html):
 292         self.html = html
 293         self.feed(html)
 294         self.close()
 295
class AttrParser(BaseHTMLParser):
    """Modified HTMLParser that isolates a tag with the specified attribute"""
    def __init__(self, attribute, value):
        # Attribute name/value identifying the tag to isolate
        self.attribute = attribute
        self.value = value
        # Grows to [tag, start_position, end_position]; get_result() only
        # produces output once all three entries are present
        self.result = None
        # True from the matching start tag until its end tag has been seen
        self.started = False
        # Number of currently open tags per tag name (tracked while started)
        self.depth = {}
        # True while the position right after the matching start tag is
        # still to be recorded
        self.watch_startpos = False
        self.error_count = 0
        BaseHTMLParser.__init__(self)

    def error(self, message):
        """Try to recover from a parse error by skipping ahead in the input.

        Gives up (raises HTMLParseError) after more than 10 errors, or when
        the error happens inside the element being extracted.
        """
        if self.error_count > 10 or self.started:
            raise compat_html_parser.HTMLParseError(message, self.getpos())
        self.rawdata = '\n'.join(self.html.split('\n')[self.getpos()[0]:]) # skip one line
        self.error_count += 1
        self.goahead(1)

    def handle_starttag(self, tag, attrs):
        attrs = dict(attrs)
        if self.started:
            # A nested start tag is the first event after the matching tag:
            # record the start position now if that is still pending
            self.find_startpos(None)
        if self.attribute in attrs and attrs[self.attribute] == self.value:
            self.result = [tag]
            self.started = True
            self.watch_startpos = True
        if self.started:
            if not tag in self.depth: self.depth[tag] = 0
            self.depth[tag] += 1

    def handle_endtag(self, tag):
        if self.started:
            if tag in self.depth: self.depth[tag] -= 1
            # The element is complete once every tag with the same name as
            # the matching tag has been closed again
            if self.depth[self.result[0]] == 0:
                self.started = False
                self.result.append(self.getpos())

    def find_startpos(self, x):
        """Needed to put the start position of the result (self.result[1])
        after the opening tag with the requested id"""
        if self.watch_startpos:
            self.watch_startpos = False
            self.result.append(self.getpos())
    # Every other parser event also records the pending start position
    handle_entityref = handle_charref = handle_data = handle_comment = \
    handle_decl = handle_pi = unknown_decl = find_startpos

    def get_result(self):
        """Return the stripped markup between the recorded start and end
        positions, or None if no complete element was found."""
        if self.result is None:
            return None
        if len(self.result) != 3:
            return None
        # Positions come from getpos(); they are used here as
        # (1-based line number, column offset) pairs
        lines = self.html.split('\n')
        lines = lines[self.result[1][0]-1:self.result[2][0]]
        lines[0] = lines[0][self.result[1][1]:]
        if len(lines) == 1:
            # Start and end are on the same line: the end column has to be
            # made relative to the already shortened line
            lines[-1] = lines[-1][:self.result[2][1]-self.result[1][1]]
        lines[-1] = lines[-1][:self.result[2][1]]
        return '\n'.join(lines).strip()
# Hack for https://github.com/rg3/youtube-dl/issues/662
# On interpreters older than 2.7.3, override parse_endtag so that the literal
# text "</scr'+'ipt>" is skipped over as a whole instead of being handed to
# the regular end tag parsing.
if sys.version_info < (2, 7, 3):
    AttrParser.parse_endtag = (lambda self, i:
        i + len("</scr'+'ipt>")
        if self.rawdata[i:].startswith("</scr'+'ipt>")
        else compat_html_parser.HTMLParser.parse_endtag(self, i))
 361
 362 def get_element_by_id(id, html):
 363     """Return the content of the tag with the specified ID in the passed HTML document"""
 364     return get_element_by_attribute("id", id, html)
 365
 366 def get_element_by_attribute(attribute, value, html):
 367     """Return the content of the tag with the specified attribute in the passed HTML document"""
 368     parser = AttrParser(attribute, value)
 369     try:
 370         parser.loads(html)
 371     except compat_html_parser.HTMLParseError:
 372         pass
 373     return parser.get_result()
 374
 375 class MetaParser(BaseHTMLParser):
 376     """
 377     Modified HTMLParser that isolates a meta tag with the specified name
 378     attribute.
 379     """
 380     def __init__(self, name):
 381         BaseHTMLParser.__init__(self)
 382         self.name = name
 383         self.content = None
 384         self.result = None
 385
 386     def handle_starttag(self, tag, attrs):
 387         if tag != 'meta':
 388             return
 389         attrs = dict(attrs)
 390         if attrs.get('name') == self.name:
 391             self.result = attrs.get('content')
 392
 393     def get_result(self):
 394         return self.result
 395
 396 def get_meta_content(name, html):
 397     """
 398     Return the content attribute from the meta tag with the given name attribute.
 399     """
 400     parser = MetaParser(name)
 401     try:
 402         parser.loads(html)
 403     except compat_html_parser.HTMLParseError:
 404         pass
 405     return parser.get_result()
 406
 407
 408 def clean_html(html):
 409     """Clean an HTML snippet into a readable string"""
 410     # Newline vs <br />
 411     html = html.replace('\n', ' ')
 412     html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
 413     html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
 414     # Strip html tags
 415     html = re.sub('<.*?>', '', html)
 416     # Replace html entities
 417     html = unescapeHTML(html)
 418     return html.strip()
 419
 420
 421 def sanitize_open(filename, open_mode):
 422     """Try to open the given filename, and slightly tweak it if this fails.
 423
 424     Attempts to open the given filename. If this fails, it tries to change
 425     the filename slightly, step by step, until it's either able to open it
 426     or it fails and raises a final exception, like the standard open()
 427     function.
 428
 429     It returns the tuple (stream, definitive_file_name).
 430     """
 431     try:
 432         if filename == u'-':
 433             if sys.platform == 'win32':
 434                 import msvcrt
 435                 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
 436             return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
 437         stream = open(encodeFilename(filename), open_mode)
 438         return (stream, filename)
 439     except (IOError, OSError) as err:
 440         if err.errno in (errno.EACCES,):
 441             raise
 442
 443         # In case of error, try to remove win32 forbidden chars
 444         alt_filename = os.path.join(
 445                         re.sub(u'[/<>:"\\|\\\\?\\*]', u'#', path_part)
 446                         for path_part in os.path.split(filename)
 447                        )
 448         if alt_filename == filename:
 449             raise
 450         else:
 451             # An exception here should be caught in the caller
 452             stream = open(encodeFilename(filename), open_mode)
 453             return (stream, alt_filename)
 454
 455
 456 def timeconvert(timestr):
 457     """Convert RFC 2822 defined time string into system timestamp"""
 458     timestamp = None
 459     timetuple = email.utils.parsedate_tz(timestr)
 460     if timetuple is not None:
 461         timestamp = email.utils.mktime_tz(timetuple)
 462     return timestamp
 463
 464 def sanitize_filename(s, restricted=False, is_id=False):
 465     """Sanitizes a string so it could be used as part of a filename.
 466     If restricted is set, use a stricter subset of allowed characters.
 467     Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
 468     """
 469     def replace_insane(char):
 470         if char == '?' or ord(char) < 32 or ord(char) == 127:
 471             return ''
 472         elif char == '"':
 473             return '' if restricted else '\''
 474         elif char == ':':
 475             return '_-' if restricted else ' -'
 476         elif char in '\\/|*<>':
 477             return '_'
 478         if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
 479             return '_'
 480         if restricted and ord(char) > 127:
 481             return '_'
 482         return char
 483
 484     result = u''.join(map(replace_insane, s))
 485     if not is_id:
 486         while '__' in result:
 487             result = result.replace('__', '_')
 488         result = result.strip('_')
 489         # Common case of "Foreign band name - English song title"
 490         if restricted and result.startswith('-_'):
 491             result = result[2:]
 492         if not result:
 493             result = '_'
 494     return result
 495
 496 def orderedSet(iterable):
 497     """ Remove all duplicates from the input iterable """
 498     res = []
 499     for el in iterable:
 500         if el not in res:
 501             res.append(el)
 502     return res
 503
 504 def unescapeHTML(s):
 505     """
 506     @param s a string
 507     """
 508     assert type(s) == type(u'')
 509
 510     result = re.sub(u'(?u)&(.+?);', htmlentity_transform, s)
 511     return result
 512
 513
 514 def encodeFilename(s, for_subprocess=False):
 515     """
 516     @param s The name of the file
 517     """
 518
 519     assert type(s) == compat_str
 520
 521     # Python 3 has a Unicode API
 522     if sys.version_info >= (3, 0):
 523         return s
 524
 525     if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
 526         # Pass u'' directly to use Unicode APIs on Windows 2000 and up
 527         # (Detecting Windows NT 4 is tricky because 'major >= 4' would
 528         # match Windows 9x series as well. Besides, NT 4 is obsolete.)
 529         if not for_subprocess:
 530             return s
 531         else:
 532             # For subprocess calls, encode with locale encoding
 533             # Refer to http://stackoverflow.com/a/9951851/35070
 534             encoding = preferredencoding()
 535     else:
 536         encoding = sys.getfilesystemencoding()
 537     if encoding is None:
 538         encoding = 'utf-8'
 539     return s.encode(encoding, 'ignore')
 540
 541
 542 def decodeOption(optval):
 543     if optval is None:
 544         return optval
 545     if isinstance(optval, bytes):
 546         optval = optval.decode(preferredencoding())
 547
 548     assert isinstance(optval, compat_str)
 549     return optval
 550
 551 def formatSeconds(secs):
 552     if secs > 3600:
 553         return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
 554     elif secs > 60:
 555         return '%d:%02d' % (secs // 60, secs % 60)
 556     else:
 557         return '%d' % secs
 558
 559
 560 def make_HTTPS_handler(opts_no_check_certificate, **kwargs):
 561     if sys.version_info < (3, 2):
 562         import httplib
 563
 564         class HTTPSConnectionV3(httplib.HTTPSConnection):
 565             def __init__(self, *args, **kwargs):
 566                 httplib.HTTPSConnection.__init__(self, *args, **kwargs)
 567
 568             def connect(self):
 569                 sock = socket.create_connection((self.host, self.port), self.timeout)
 570                 if getattr(self, '_tunnel_host', False):
 571                     self.sock = sock
 572                     self._tunnel()
 573                 try:
 574                     self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_SSLv3)
 575                 except ssl.SSLError:
 576                     self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_SSLv23)
 577
 578         class HTTPSHandlerV3(compat_urllib_request.HTTPSHandler):
 579             def https_open(self, req):
 580                 return self.do_open(HTTPSConnectionV3, req)
 581         return HTTPSHandlerV3(**kwargs)
 582     else:
 583         context = ssl.SSLContext(ssl.PROTOCOL_SSLv3)
 584         context.verify_mode = (ssl.CERT_NONE
 585                                if opts_no_check_certificate
 586                                else ssl.CERT_REQUIRED)
 587         context.set_default_verify_paths()
 588         try:
 589             context.load_default_certs()
 590         except AttributeError:
 591             pass  # Python < 3.4
 592         return compat_urllib_request.HTTPSHandler(context=context, **kwargs)
 593
 594 class ExtractorError(Exception):
 595     """Error during info extraction."""
 596     def __init__(self, msg, tb=None, expected=False, cause=None):
 597         """ tb, if given, is the original traceback (so that it can be printed out).
 598         If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
 599         """
 600
 601         if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
 602             expected = True
 603         if not expected:
 604             msg = msg + u'; please report this issue on https://yt-dl.org/bug . Be sure to call youtube-dl with the --verbose flag and include its complete output. Make sure you are using the latest version; type  youtube-dl -U  to update.'
 605         super(ExtractorError, self).__init__(msg)
 606
 607         self.traceback = tb
 608         self.exc_info = sys.exc_info()  # preserve original exception
 609         self.cause = cause
 610
 611     def format_traceback(self):
 612         if self.traceback is None:
 613             return None
 614         return u''.join(traceback.format_tb(self.traceback))
 615
 616
 617 class RegexNotFoundError(ExtractorError):
 618     """Error when a regex didn't match"""
 619     pass
 620
 621
 622 class DownloadError(Exception):
 623     """Download Error exception.
 624
 625     This exception may be thrown by FileDownloader objects if they are not
 626     configured to continue on errors. They will contain the appropriate
 627     error message.
 628     """
 629     def __init__(self, msg, exc_info=None):
 630         """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
 631         super(DownloadError, self).__init__(msg)
 632         self.exc_info = exc_info
 633
 634
 635 class SameFileError(Exception):
 636     """Same File exception.
 637
 638     This exception will be thrown by FileDownloader objects if they detect
 639     multiple files would have to be downloaded to the same file on disk.
 640     """
 641     pass
 642
 643
 644 class PostProcessingError(Exception):
 645     """Post Processing exception.
 646
 647     This exception may be raised by PostProcessor's .run() method to
 648     indicate an error in the postprocessing task.
 649     """
 650     def __init__(self, msg):
 651         self.msg = msg
 652
 653 class MaxDownloadsReached(Exception):
 654     """ --max-downloads limit has been reached. """
 655     pass
 656
 657
 658 class UnavailableVideoError(Exception):
 659     """Unavailable Format exception.
 660
 661     This exception will be thrown when a video is requested
 662     in a format that is not available for that video.
 663     """
 664     pass
 665
 666
 667 class ContentTooShortError(Exception):
 668     """Content Too Short exception.
 669
 670     This exception may be raised by FileDownloader objects when a file they
 671     download is too small for what the server announced first, indicating
 672     the connection was probably interrupted.
 673     """
 674     # Both in bytes
 675     downloaded = None
 676     expected = None
 677
 678     def __init__(self, downloaded, expected):
 679         self.downloaded = downloaded
 680         self.expected = expected
 681
class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-No-Compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    @staticmethod
    def deflate(data):
        """Decompress deflate-encoded data.

        The data is first treated as a raw deflate stream (negative window
        bits); if that fails, it is decompressed with zlib's defaults.
        """
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def addinfourl_wrapper(stream, headers, url, code):
        """Create an addinfourl response object carrying the status code.

        When addinfourl has no 'getcode' attribute its constructor is called
        without the code, which is then set as an attribute afterwards.
        """
        if hasattr(compat_urllib_request.addinfourl, 'getcode'):
            return compat_urllib_request.addinfourl(stream, headers, url, code)
        ret = compat_urllib_request.addinfourl(stream, headers, url)
        ret.code = code
        return ret

    def http_request(self, req):
        """Set the standard headers on req and process the internal
        "Youtubedl-*" pseudo headers; returns the modified request."""
        for h,v in std_headers.items():
            if h in req.headers:
                del req.headers[h]
            req.add_header(h, v)
        # Internal header: drop Accept-encoding, i.e. do not ask for compression
        if 'Youtubedl-no-compression' in req.headers:
            if 'Accept-encoding' in req.headers:
                del req.headers['Accept-encoding']
            del req.headers['Youtubedl-no-compression']
        # Internal header: its value replaces the User-agent header
        if 'Youtubedl-user-agent' in req.headers:
            if 'User-agent' in req.headers:
                del req.headers['User-agent']
            req.headers['User-agent'] = req.headers['Youtubedl-user-agent']
            del req.headers['Youtubedl-user-agent']
        return req

    def http_response(self, req, resp):
        """Return resp with a gzip or deflate encoded body replaced by the
        decompressed data; other responses are returned unchanged."""
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
            try:
                uncompressed = io.BytesIO(gz.read())
            except IOError as original_ioerror:
                # There may be junk at the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                # Retry with 1 to 1023 trailing bytes cut off and keep the
                # first attempt that decompresses
                for i in range(1, 1024):
                    try:
                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
                        uncompressed = io.BytesIO(gz.read())
                    except IOError:
                        continue
                    break
                else:
                    # No attempt succeeded: report the original error
                    raise original_ioerror
            resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = io.BytesIO(self.deflate(resp.read()))
            resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        return resp

    # HTTPS traffic gets the same treatment
    https_request = http_request
    https_response = http_response
 762
 763
 764 def unified_strdate(date_str):
 765     """Return a string with the date in the format YYYYMMDD"""
 766     upload_date = None
 767     #Replace commas
 768     date_str = date_str.replace(',', ' ')
 769     # %z (UTC offset) is only supported in python>=3.2
 770     date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str)
 771     format_expressions = [
 772         '%d %B %Y',
 773         '%d %b %Y',
 774         '%B %d %Y',
 775         '%b %d %Y',
 776         '%Y-%m-%d',
 777         '%d.%m.%Y',
 778         '%d/%m/%Y',
 779         '%Y/%m/%d %H:%M:%S',
 780         '%Y-%m-%d %H:%M:%S',
 781         '%d.%m.%Y %H:%M',
 782         '%d.%m.%Y %H.%M',
 783         '%Y-%m-%dT%H:%M:%SZ',
 784         '%Y-%m-%dT%H:%M:%S.%fZ',
 785         '%Y-%m-%dT%H:%M:%S.%f0Z',
 786         '%Y-%m-%dT%H:%M:%S',
 787         '%Y-%m-%dT%H:%M:%S.%f',
 788         '%Y-%m-%dT%H:%M',
 789     ]
 790     for expression in format_expressions:
 791         try:
 792             upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
 793         except ValueError:
 794             pass
 795     if upload_date is None:
 796         timetuple = email.utils.parsedate_tz(date_str)
 797         if timetuple:
 798             upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
 799     return upload_date
 800
 801 def determine_ext(url, default_ext=u'unknown_video'):
 802     guess = url.partition(u'?')[0].rpartition(u'.')[2]
 803     if re.match(r'^[A-Za-z0-9]+$', guess):
 804         return guess
 805     else:
 806         return default_ext
 807
 808 def subtitles_filename(filename, sub_lang, sub_format):
 809     return filename.rsplit('.', 1)[0] + u'.' + sub_lang + u'.' + sub_format
 810
 811 def date_from_str(date_str):
 812     """
 813     Return a datetime object from a string in the format YYYYMMDD or
 814     (now|today)[+-][0-9](day|week|month|year)(s)?"""
 815     today = datetime.date.today()
 816     if date_str == 'now'or date_str == 'today':
 817         return today
 818     match = re.match('(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
 819     if match is not None:
 820         sign = match.group('sign')
 821         time = int(match.group('time'))
 822         if sign == '-':
 823             time = -time
 824         unit = match.group('unit')
 825         #A bad aproximation?
 826         if unit == 'month':
 827             unit = 'day'
 828             time *= 30
 829         elif unit == 'year':
 830             unit = 'day'
 831             time *= 365
 832         unit += 's'
 833         delta = datetime.timedelta(**{unit: time})
 834         return today + delta
 835     return datetime.datetime.strptime(date_str, "%Y%m%d").date()
 836
 837 def hyphenate_date(date_str):
 838     """
 839     Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
 840     match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
 841     if match is not None:
 842         return '-'.join(match.groups())
 843     else:
 844         return date_str
 845
 846 class DateRange(object):
 847     """Represents a time interval between two dates"""
 848     def __init__(self, start=None, end=None):
 849         """start and end must be strings in the format accepted by date"""
 850         if start is not None:
 851             self.start = date_from_str(start)
 852         else:
 853             self.start = datetime.datetime.min.date()
 854         if end is not None:
 855             self.end = date_from_str(end)
 856         else:
 857             self.end = datetime.datetime.max.date()
 858         if self.start > self.end:
 859             raise ValueError('Date range: "%s" , the start date must be before the end date' % self)
 860     @classmethod
 861     def day(cls, day):
 862         """Returns a range that only contains the given day"""
 863         return cls(day,day)
 864     def __contains__(self, date):
 865         """Check if the date is in the range"""
 866         if not isinstance(date, datetime.date):
 867             date = date_from_str(date)
 868         return self.start <= date <= self.end
 869     def __str__(self):
 870         return '%s - %s' % ( self.start.isoformat(), self.end.isoformat())
 871
 872
 873 def platform_name():
 874     """ Returns the platform name as a compat_str """
 875     res = platform.platform()
 876     if isinstance(res, bytes):
 877         res = res.decode(preferredencoding())
 878
 879     assert isinstance(res, compat_str)
 880     return res
 881
 882
 883 def write_string(s, out=None):
 884     if out is None:
 885         out = sys.stderr
 886     assert type(s) == compat_str
 887
 888     if ('b' in getattr(out, 'mode', '') or
 889             sys.version_info[0] < 3):  # Python 2 lies about mode of sys.stderr
 890         s = s.encode(preferredencoding(), 'ignore')
 891     try:
 892         out.write(s)
 893     except UnicodeEncodeError:
 894         # In Windows shells, this can fail even when the codec is just charmap!?
 895         # See https://wiki.python.org/moin/PrintFails#Issue
 896         if sys.platform == 'win32' and hasattr(out, 'encoding'):
 897             s = s.encode(out.encoding, 'ignore').decode(out.encoding)
 898             out.write(s)
 899         else:
 900             raise
 901
 902     out.flush()
 903
 904
 905 def bytes_to_intlist(bs):
 906     if not bs:
 907         return []
 908     if isinstance(bs[0], int):  # Python 3
 909         return list(bs)
 910     else:
 911         return [ord(c) for c in bs]
 912
 913
 914 def intlist_to_bytes(xs):
 915     if not xs:
 916         return b''
 917     if isinstance(chr(0), bytes):  # Python 2
 918         return ''.join([chr(x) for x in xs])
 919     else:
 920         return bytes(xs)
 921
 922
 923 def get_cachedir(params={}):
 924     cache_root = os.environ.get('XDG_CACHE_HOME',
 925                                 os.path.expanduser('~/.cache'))
 926     return params.get('cachedir', os.path.join(cache_root, 'youtube-dl'))
 927
 928
# Cross-platform file locking
#
# Both branches define the same two module-level helpers:
#   _lock_file(f, exclusive)  - lock the open file object f
#   _unlock_file(f)           - release a lock taken by _lock_file
# On win32 they call LockFileEx/UnlockFileEx from kernel32 through ctypes;
# everywhere else they use fcntl.lockf.
if sys.platform == 'win32':
    import ctypes.wintypes
    import msvcrt

    # ctypes mirror of the OVERLAPPED structure that LockFileEx and
    # UnlockFileEx receive as their last argument.
    class OVERLAPPED(ctypes.Structure):
        _fields_ = [
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),
        ]

    kernel32 = ctypes.windll.kernel32
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToUnlockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToUnlockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    # Low and high DWORDs of the byte count passed to both calls; together
    # with a zero offset they are meant to cover the whole file.
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive):
        """Lock f via LockFileEx; raises OSError if the call fails."""
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        # Stored on the file object so that _unlock_file can pass the same
        # OVERLAPPED pointer back to UnlockFileEx.
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)
        handle = msvcrt.get_osfhandle(f.fileno())
        # dwFlags 0x2 is LOCKFILE_EXCLUSIVE_LOCK in the Win32 API; 0x0
        # requests a shared lock.
        if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
                          whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Locking file failed: %r' % ctypes.FormatError())

    def _unlock_file(f):
        """Unlock f via UnlockFileEx; raises OSError if the call fails.

        f must have been locked with _lock_file before.
        """
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        if not UnlockFileEx(handle, 0,
                            whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())

else:
    import fcntl

    def _lock_file(f, exclusive):
        """Take an exclusive (LOCK_EX) or shared (LOCK_SH) lock on f."""
        fcntl.lockf(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)

    def _unlock_file(f):
        """Release the lock held on f."""
        fcntl.lockf(f, fcntl.LOCK_UN)
 992
 993
 994 class locked_file(object):
 995     def __init__(self, filename, mode, encoding=None):
 996         assert mode in ['r', 'a', 'w']
 997         self.f = io.open(filename, mode, encoding=encoding)
 998         self.mode = mode
 999
1000     def __enter__(self):
1001         exclusive = self.mode != 'r'
1002         try:
1003             _lock_file(self.f, exclusive)
1004         except IOError:
1005             self.f.close()
1006             raise
1007         return self
1008
1009     def __exit__(self, etype, value, traceback):
1010         try:
1011             _unlock_file(self.f)
1012         finally:
1013             self.f.close()
1014
1015     def __iter__(self):
1016         return iter(self.f)
1017
1018     def write(self, *args):
1019         return self.f.write(*args)
1020
1021     def read(self, *args):
1022         return self.f.read(*args)
1023
1024
def shell_quote(args):
    """Return args as one shell-escaped command line string.

    Byte-string arguments are decoded with the filesystem encoding (utf-8
    when that is unknown) before quoting; the quoted arguments are joined
    with single spaces.
    """
    # The pipes module is deprecated and removed in Python 3.13; shlex.quote
    # is the same function on Python 3, pipes.quote the Python 2 fallback.
    try:
        from shlex import quote
    except ImportError:  # Python 2
        from pipes import quote

    encoding = sys.getfilesystemencoding()
    if encoding is None:
        encoding = 'utf-8'

    quoted_args = []
    for a in args:
        if isinstance(a, bytes):
            # We may get a filename encoded with 'encodeFilename'
            a = a.decode(encoding)
        quoted_args.append(quote(a))
    return u' '.join(quoted_args)
1036
1037
def takewhile_inclusive(pred, seq):
    """ Like itertools.takewhile, but also yields the first element for
        which pred is false, and stops right after it """
    for item in seq:
        yield item
        if pred(item):
            continue
        break
1045
1046
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use.

    data is JSON-encoded, URL-encoded under the key '__youtubedl_smuggle'
    and appended to url after a '#'. """
    payload = {u'__youtubedl_smuggle': json.dumps(data)}
    fragment = compat_urllib_parse.urlencode(payload)
    return url + u'#' + fragment
1053
1054
def unsmuggle_url(smug_url, default=None):
    """ Split a URL built by smuggle_url into (url, data).

    URLs without smuggled data yield (smug_url, default). """
    if '#__youtubedl_smuggle' not in smug_url:
        return smug_url, default
    # The smuggled payload follows the last '#'
    url, _, fragment = smug_url.rpartition(u'#')
    payload = compat_parse_qs(fragment)[u'__youtubedl_smuggle'][0]
    return url, json.loads(payload)
1062
1063
def format_bytes(bytes):
    """Format a byte count as a human-readable string such as u'1.50MiB'.

    bytes may be a number, a numeric str, or None (which yields u'N/A').
    Binary (1024-based) prefixes up to YiB are used; larger values are
    expressed in YiB and values below one byte in B.  Negative values make
    math.log raise ValueError.
    """
    if bytes is None:
        return u'N/A'
    if isinstance(bytes, str):
        bytes = float(bytes)
    suffixes = [u'B', u'KiB', u'MiB', u'GiB', u'TiB', u'PiB', u'EiB', u'ZiB', u'YiB']
    if bytes == 0.0:
        exponent = 0
    else:
        exponent = int(math.log(bytes, 1024.0))
        # Clamp to the available suffixes: very large values would index
        # past the end of the list, and tiny fractions produce a negative
        # exponent that would silently pick a suffix from the end.
        exponent = max(0, min(exponent, len(suffixes) - 1))
    converted = float(bytes) / float(1024 ** exponent)
    return u'%.2f%s' % (converted, suffixes[exponent])
1076
1077
def str_to_int(int_str):
    """Parse an integer written with ',' or '.' as digit group separators."""
    digits_only = re.sub(r'[,\.]', u'', int_str)
    return int(digits_only)
1081
1082
def get_term_width():
    """Return the terminal width in columns, or None if it is unknown.

    The COLUMNS environment variable takes precedence (a non-numeric value
    raises ValueError); otherwise the output of `stty size` is parsed.
    """
    columns = os.environ.get('COLUMNS', None)
    if columns:
        return int(columns)

    try:
        sp = subprocess.Popen(
            ['stty', 'size'],
            stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        out, err = sp.communicate()
        return int(out.split()[1])
    except Exception:
        # Best effort only (stty missing, no tty, unparsable output), but
        # do not swallow KeyboardInterrupt/SystemExit like a bare except.
        pass
    return None
1097
1098
def month_by_name(name):
    """ Return the number (1-12) of a month given its English name,
    independently of the locale; None if name is not a month name """

    ENGLISH_NAMES = [
        u'January', u'February', u'March', u'April', u'May', u'June',
        u'July', u'August', u'September', u'October', u'November', u'December']
    if name not in ENGLISH_NAMES:
        return None
    return ENGLISH_NAMES.index(name) + 1
1109
1110
def fix_xml_ampersands(xml_str):
    """Replace every '&' that does not start a predefined or numeric
    entity by '&amp;' in XML"""
    bare_ampersand = r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)'
    return re.sub(bare_ampersand, u'&amp;', xml_str)
1117
1118
def setproctitle(title):
    """Try to set the process name to title through libc's prctl.

    title must be a compat_str.  Silently does nothing when libc.so.6
    cannot be loaded or does not provide prctl.
    """
    assert isinstance(title, compat_str)
    try:
        libc = ctypes.cdll.LoadLibrary("libc.so.6")
    except OSError:
        return
    # Size the buffer from the encoded bytes, not the character count:
    # non-ASCII characters take more than one byte in UTF-8 and would
    # otherwise overflow the buffer (ValueError on assignment).
    title_bytes = title.encode('utf-8')
    buf = ctypes.create_string_buffer(len(title_bytes) + 1)
    buf.value = title_bytes
    try:
        # Option 15 is PR_SET_NAME in Linux's prctl(2)
        libc.prctl(15, ctypes.byref(buf), 0, 0, 0)
    except AttributeError:
        return  # Strange libc, just skip this
1132
1133
def remove_start(s, start):
    """Return s without the prefix start; s unchanged if it lacks it."""
    return s[len(start):] if s.startswith(start) else s
1138
1139
def url_basename(url):
    """Return the last component of the URL's path, ignoring leading and
    trailing slashes (an empty string if the path has no component)."""
    parsed = compat_urlparse.urlparse(url)
    trimmed_path = parsed.path.strip(u'/')
    return trimmed_path.rpartition(u'/')[2]
1143
1144
class HEADRequest(compat_urllib_request.Request):
    """Request subclass whose HTTP method is always HEAD."""

    def get_method(self):
        method = "HEAD"
        return method
1148
1149
def int_or_none(v, scale=1):
    """Return int(v) floor-divided by scale, or None when v is None."""
    if v is None:
        return v
    return int(v) // scale
1152
1153
def parse_duration(s):
    """Parse a duration such as '1:02:03', '2m3s' or '42' into seconds.

    Returns None when s is None or does not match the expected format.
    """
    if s is None:
        return None

    found = re.match(
        r'(?:(?:(?P<hours>[0-9]+)[:h])?(?P<mins>[0-9]+)[:m])?(?P<secs>[0-9]+)s?$', s)
    if not found:
        return None

    hours, mins, secs = found.group('hours', 'mins', 'secs')
    total = int(secs)
    if mins:
        total += int(mins) * 60
        # The pattern only allows hours together with minutes
        if hours:
            total += int(hours) * 60 * 60
    return total
1168
1169
def prepend_extension(filename, ext):
    """Insert ext in front of filename's extension:
    ('abc.ext', 'temp') becomes 'abc.temp.ext'."""
    parts = os.path.splitext(filename)
    return u'{0}.{1}{2}'.format(parts[0], ext, parts[1])
1173
1174
def check_executable(exe, args=[]):
    """ Checks if the given binary can be started (e.g. is installed somewhere
    in PATH); returns its name if so and False otherwise.
    args can be a list of arguments for a short output (like -version) """
    command = [exe] + args
    try:
        process = subprocess.Popen(
            command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        process.communicate()
    except OSError:
        return False
    return exe
1183
1184
class PagedList(object):
    """Lazily evaluated list whose items are fetched page by page.

    pagefunc(pagenum) must return an iterable with the items of the
    zero-based page pagenum; every page except the last is expected to
    contain exactly pagesize items.
    """

    def __init__(self, pagefunc, pagesize):
        self._pagefunc = pagefunc
        self._pagesize = pagesize

    def __len__(self):
        # This is only useful for tests
        return len(self.getslice())

    def getslice(self, start=0, end=None):
        """Return the items in [start, end) as a list, fetching only the
        pages that are needed.  end=None means up to the last item."""
        # An empty or inverted range contains nothing; without this guard
        # the loop below would fetch and return items anyway.
        if end is not None and end <= start:
            return []

        res = []
        for pagenum in itertools.count(start // self._pagesize):
            firstid = pagenum * self._pagesize
            nextfirstid = firstid + self._pagesize

            page_results = list(self._pagefunc(pagenum))

            # Offset of the first wanted item within this page
            startv = (
                start % self._pagesize
                if firstid <= start < nextfirstid
                else 0)

            # Offset just past the last wanted item, if it is on this page
            endv = (
                ((end - 1) % self._pagesize) + 1
                if (end is not None and firstid <= end <= nextfirstid)
                else None)

            if startv != 0 or endv is not None:
                page_results = page_results[startv:endv]
            res.extend(page_results)

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            if len(page_results) + startv < self._pagesize:
                break

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == nextfirstid:
                break
        return res
1230
1231
def uppercase_escape(s):
    """Replace every literal \\UXXXXXXXX escape (eight hex digits) in s by
    the character with that code point."""
    def _to_char(match):
        return compat_chr(int(match.group(1), base=16))

    return re.sub(r'\\U([0-9a-fA-F]{8})', _to_char, s)
1236
# Probe whether struct accepts a text format string
try:
    struct.pack(u'!I', 0)
except TypeError:
    # In Python 2.6 (and some 2.7 versions), struct requires a bytes argument
    def struct_pack(spec, *args):
        """struct.pack that also accepts a text format string."""
        fmt = spec.encode('ascii') if isinstance(spec, compat_str) else spec
        return struct.pack(fmt, *args)

    def struct_unpack(spec, *args):
        """struct.unpack that also accepts a text format string."""
        fmt = spec.encode('ascii') if isinstance(spec, compat_str) else spec
        return struct.unpack(fmt, *args)
else:
    # Text format strings work natively; use the stdlib functions directly
    struct_pack = struct.pack
    struct_unpack = struct.unpack
1253
1254
def read_batch_urls(batch_fd):
    """Read the URLs from a batch file object and close it.

    Each line is decoded as UTF-8 if it is not text already, freed from a
    leading byte order mark and surrounding whitespace, and dropped when
    it is empty or a comment (starts with '#', ';' or ']').

    Returns the list of remaining URLs.
    """
    def fixup(url):
        if not isinstance(url, compat_str):
            url = url.decode('utf-8', 'replace')
        # The UTF-8 BOM shows up either as U+FEFF (decoded as UTF-8) or as
        # the three characters \xef\xbb\xbf (decoded with a single-byte
        # codec); strip both forms.
        for bom in (u'\xef\xbb\xbf', u'\ufeff'):
            if url.startswith(bom):
                url = url[len(bom):]
        url = url.strip()
        if url.startswith(('#', ';', ']')):
            return False
        return url

    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]
1269
1270
def urlencode_postdata(*args, **kargs):
    """URL-encode the arguments like urlencode does and return the result
    as ASCII bytes, suitable as POST data."""
    encoded = compat_urllib_parse.urlencode(*args, **kargs)
    return encoded.encode('ascii')
1273
1274
def parse_xml(s):
    """Parse the XML text s and return its root element; doctype
    declarations are ignored."""
    etree = xml.etree.ElementTree

    class _DoctypeIgnoringBuilder(etree.TreeBuilder):
        def doctype(self, name, pubid, system):
            pass  # Ignore doctypes

    parser = etree.XMLParser(target=_DoctypeIgnoringBuilder())
    # The custom parser is only passed on Python 2.7 and newer
    if sys.version_info >= (2, 7):
        extra = {'parser': parser}
    else:
        extra = {}
    return etree.XML(s.encode('utf-8'), **extra)
1283
1284
# getpass.getpass is used as-is except on Python 2 under Windows, where a
# text prompt is encoded with the preferred encoding first.
if sys.version_info >= (3, 0) or sys.platform != 'win32':
    compat_getpass = getpass.getpass
else:
    def compat_getpass(prompt, *args, **kwargs):
        encoded_prompt = (
            prompt.encode(preferredencoding())
            if isinstance(prompt, compat_str)
            else prompt)
        return getpass.getpass(encoded_prompt, *args, **kwargs)