youtube_dl/utils.py

   1 #!/usr/bin/env python
   2 # -*- coding: utf-8 -*-
   3
   4 import datetime
   5 import email.utils
   6 import errno
   7 import gzip
   8 import io
   9 import json
  10 import locale
  11 import math
  12 import os
  13 import pipes
  14 import platform
  15 import re
  16 import ssl
  17 import socket
  18 import subprocess
  19 import sys
  20 import traceback
  21 import zlib
  22
  23 try:
  24     import urllib.request as compat_urllib_request
  25 except ImportError: # Python 2
  26     import urllib2 as compat_urllib_request
  27
  28 try:
  29     import urllib.error as compat_urllib_error
  30 except ImportError: # Python 2
  31     import urllib2 as compat_urllib_error
  32
  33 try:
  34     import urllib.parse as compat_urllib_parse
  35 except ImportError: # Python 2
  36     import urllib as compat_urllib_parse
  37
  38 try:
  39     from urllib.parse import urlparse as compat_urllib_parse_urlparse
  40 except ImportError: # Python 2
  41     from urlparse import urlparse as compat_urllib_parse_urlparse
  42
  43 try:
  44     import urllib.parse as compat_urlparse
  45 except ImportError: # Python 2
  46     import urlparse as compat_urlparse
  47
  48 try:
  49     import http.cookiejar as compat_cookiejar
  50 except ImportError: # Python 2
  51     import cookielib as compat_cookiejar
  52
  53 try:
  54     import html.entities as compat_html_entities
  55 except ImportError: # Python 2
  56     import htmlentitydefs as compat_html_entities
  57
  58 try:
  59     import html.parser as compat_html_parser
  60 except ImportError: # Python 2
  61     import HTMLParser as compat_html_parser
  62
  63 try:
  64     import http.client as compat_http_client
  65 except ImportError: # Python 2
  66     import httplib as compat_http_client
  67
  68 try:
  69     from urllib.error import HTTPError as compat_HTTPError
  70 except ImportError:  # Python 2
  71     from urllib2 import HTTPError as compat_HTTPError
  72
  73 try:
  74     from urllib.request import urlretrieve as compat_urlretrieve
  75 except ImportError:  # Python 2
  76     from urllib import urlretrieve as compat_urlretrieve
  77
  78
# compat_subprocess_get_DEVNULL() returns something usable as a "discard"
# stream for subprocess calls: subprocess.DEVNULL when it can be imported,
# otherwise a file object freshly opened on os.path.devnull on every call.
try:
    from subprocess import DEVNULL
    compat_subprocess_get_DEVNULL = lambda: DEVNULL
except ImportError:
    compat_subprocess_get_DEVNULL = lambda: open(os.path.devnull, 'w')
  84
# compat_parse_qs: query-string parser. The stdlib urllib.parse.parse_qs is
# used when importable; otherwise the fallback below is defined.
try:
    from urllib.parse import parse_qs as compat_parse_qs
except ImportError: # Python 2
    # HACK: The following is the correct parse_qs implementation from cpython 3's stdlib.
    # Python 2's version is apparently totally broken
    def _unquote(string, encoding='utf-8', errors='replace'):
        # Replace %xx escapes in ``string`` by the characters they encode.
        # Runs of consecutive escapes are gathered as raw bytes and decoded
        # together with ``encoding``/``errors``.
        if string == '':
            return string
        res = string.split('%')
        if len(res) == 1:
            # No '%' at all: nothing to unquote.
            return string
        if encoding is None:
            encoding = 'utf-8'
        if errors is None:
            errors = 'replace'
        # pct_sequence: contiguous sequence of percent-encoded bytes, decoded
        pct_sequence = b''
        string = res[0]
        for item in res[1:]:
            try:
                if not item:
                    raise ValueError
                # The first two characters after '%' are the hex byte value.
                pct_sequence += item[:2].decode('hex')
                rest = item[2:]
                if not rest:
                    # This segment was just a single percent-encoded character.
                    # May be part of a sequence of code units, so delay decoding.
                    # (Stored in pct_sequence).
                    continue
            except ValueError:
                # The ValueError path keeps the text verbatim, '%' included.
                rest = '%' + item
            # Encountered non-percent-encoded characters. Flush the current
            # pct_sequence.
            string += pct_sequence.decode(encoding, errors) + rest
            pct_sequence = b''
        if pct_sequence:
            # Flush the final pct_sequence
            string += pct_sequence.decode(encoding, errors)
        return string

    def _parse_qsl(qs, keep_blank_values=False, strict_parsing=False,
                encoding='utf-8', errors='replace'):
        # Split ``qs`` on '&' and ';' and return a list of (name, value)
        # pairs, each unquoted and coerced to unicode.
        qs, _coerce_result = qs, unicode
        pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')]
        r = []
        for name_value in pairs:
            if not name_value and not strict_parsing:
                continue
            nv = name_value.split('=', 1)
            if len(nv) != 2:
                if strict_parsing:
                    raise ValueError("bad query field: %r" % (name_value,))
                # Handle case of a control-name with no equal sign
                if keep_blank_values:
                    nv.append('')
                else:
                    continue
            # Pairs with an empty value are dropped unless keep_blank_values.
            if len(nv[1]) or keep_blank_values:
                name = nv[0].replace('+', ' ')
                name = _unquote(name, encoding=encoding, errors=errors)
                name = _coerce_result(name)
                value = nv[1].replace('+', ' ')
                value = _unquote(value, encoding=encoding, errors=errors)
                value = _coerce_result(value)
                r.append((name, value))
        return r

    def compat_parse_qs(qs, keep_blank_values=False, strict_parsing=False,
                encoding='utf-8', errors='replace'):
        # Return a dict mapping each name to the list of its values, in the
        # order the values appear in ``qs``.
        parsed_result = {}
        pairs = _parse_qsl(qs, keep_blank_values, strict_parsing,
                        encoding=encoding, errors=errors)
        for name, value in pairs:
            if name in parsed_result:
                parsed_result[name].append(value)
            else:
                parsed_result[name] = [value]
        return parsed_result
 163
 164 try:
 165     compat_str = unicode # Python 2
 166 except NameError:
 167     compat_str = str
 168
 169 try:
 170     compat_chr = unichr # Python 2
 171 except NameError:
 172     compat_chr = chr
 173
 174 def compat_ord(c):
 175     if type(c) is int: return c
 176     else: return ord(c)
 177
# This is not clearly defined otherwise
# (the type object of a compiled regular expression pattern)
compiled_regex_type = type(re.compile(''))

# Default request headers. YoutubeDLHandler.http_request adds every entry to
# each request it processes.
std_headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0 (Chrome)',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
}
 188
 189 def preferredencoding():
 190     """Get preferred encoding.
 191
 192     Returns the best encoding scheme for the system, based on
 193     locale.getpreferredencoding() and some further tweaks.
 194     """
 195     try:
 196         pref = locale.getpreferredencoding()
 197         u'TEST'.encode(pref)
 198     except:
 199         pref = 'UTF-8'
 200
 201     return pref
 202
 203 if sys.version_info < (3,0):
 204     def compat_print(s):
 205         print(s.encode(preferredencoding(), 'xmlcharrefreplace'))
 206 else:
 207     def compat_print(s):
 208         assert type(s) == type(u'')
 209         print(s)
 210
 211 # In Python 2.x, json.dump expects a bytestream.
 212 # In Python 3.x, it writes to a character stream
 213 if sys.version_info < (3,0):
 214     def write_json_file(obj, fn):
 215         with open(fn, 'wb') as f:
 216             json.dump(obj, f)
 217 else:
 218     def write_json_file(obj, fn):
 219         with open(fn, 'w', encoding='utf-8') as f:
 220             json.dump(obj, f)
 221
 222 if sys.version_info >= (2,7):
 223     def find_xpath_attr(node, xpath, key, val):
 224         """ Find the xpath xpath[@key=val] """
 225         assert re.match(r'^[a-zA-Z]+$', key)
 226         assert re.match(r'^[a-zA-Z0-9@\s]*$', val)
 227         expr = xpath + u"[@%s='%s']" % (key, val)
 228         return node.find(expr)
 229 else:
 230     def find_xpath_attr(node, xpath, key, val):
 231         for f in node.findall(xpath):
 232             if f.attrib.get(key) == val:
 233                 return f
 234         return None
 235
 236 # On python2.6 the xml.etree.ElementTree.Element methods don't support
 237 # the namespace parameter
 238 def xpath_with_ns(path, ns_map):
 239     components = [c.split(':') for c in path.split('/')]
 240     replaced = []
 241     for c in components:
 242         if len(c) == 1:
 243             replaced.append(c[0])
 244         else:
 245             ns, tag = c
 246             replaced.append('{%s}%s' % (ns_map[ns], tag))
 247     return '/'.join(replaced)
 248
 249 def htmlentity_transform(matchobj):
 250     """Transforms an HTML entity to a character.
 251
 252     This function receives a match object and is intended to be used with
 253     the re.sub() function.
 254     """
 255     entity = matchobj.group(1)
 256
 257     # Known non-numeric HTML entity
 258     if entity in compat_html_entities.name2codepoint:
 259         return compat_chr(compat_html_entities.name2codepoint[entity])
 260
 261     mobj = re.match(u'(?u)#(x?\\d+)', entity)
 262     if mobj is not None:
 263         numstr = mobj.group(1)
 264         if numstr.startswith(u'x'):
 265             base = 16
 266             numstr = u'0%s' % numstr
 267         else:
 268             base = 10
 269         return compat_chr(int(numstr, base))
 270
 271     # Unknown entity in name, return its literal representation
 272     return (u'&%s;' % entity)
 273
# Replace the html parser module's start-tag-end locator regex with the
# pattern below.
compat_html_parser.locatestarttagend = re.compile(r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re.VERBOSE) # backport bugfix
 275 class BaseHTMLParser(compat_html_parser.HTMLParser):
 276     def __init(self):
 277         compat_html_parser.HTMLParser.__init__(self)
 278         self.html = None
 279
 280     def loads(self, html):
 281         self.html = html
 282         self.feed(html)
 283         self.close()
 284
class AttrParser(BaseHTMLParser):
    """Modified HTMLParser that isolates a tag with the specified attribute"""
    def __init__(self, attribute, value):
        self.attribute = attribute
        self.value = value
        # result grows to [tag, start position, end position]; positions
        # are the values returned by getpos().
        self.result = None
        # True from the matching start tag until its end tag is seen.
        self.started = False
        # Per tag name: number of start tags minus end tags seen while
        # started is set.
        self.depth = {}
        # Set on the matching start tag; cleared when the start position
        # has been recorded by find_startpos().
        self.watch_startpos = False
        self.error_count = 0
        BaseHTMLParser.__init__(self)

    def error(self, message):
        # Give up (raise) once more than 10 errors were counted or when the
        # error happens inside the element being captured.
        if self.error_count > 10 or self.started:
            raise compat_html_parser.HTMLParseError(message, self.getpos())
        # Otherwise restart parsing on the lines after the current position.
        self.rawdata = '\n'.join(self.html.split('\n')[self.getpos()[0]:]) # skip one line
        self.error_count += 1
        self.goahead(1)

    def handle_starttag(self, tag, attrs):
        attrs = dict(attrs)
        if self.started:
            # A nested start tag also marks where the content begins.
            self.find_startpos(None)
        if self.attribute in attrs and attrs[self.attribute] == self.value:
            self.result = [tag]
            self.started = True
            self.watch_startpos = True
        if self.started:
            if not tag in self.depth: self.depth[tag] = 0
            self.depth[tag] += 1

    def handle_endtag(self, tag):
        if self.started:
            if tag in self.depth: self.depth[tag] -= 1
            # The captured element ends when the count for its own tag
            # name drops back to zero.
            if self.depth[self.result[0]] == 0:
                self.started = False
                self.result.append(self.getpos())

    def find_startpos(self, x):
        """Needed to put the start position of the result (self.result[1])
        after the opening tag with the requested id"""
        if self.watch_startpos:
            self.watch_startpos = False
            self.result.append(self.getpos())
    # Every other parser callback is routed to find_startpos as well.
    handle_entityref = handle_charref = handle_data = handle_comment = \
    handle_decl = handle_pi = unknown_decl = find_startpos

    def get_result(self):
        # Returns the text between the recorded start and end positions,
        # stripped, or None unless tag, start and end were all recorded.
        if self.result is None:
            return None
        if len(self.result) != 3:
            return None
        lines = self.html.split('\n')
        # Position line numbers are used 1-based here, hence the -1.
        lines = lines[self.result[1][0]-1:self.result[2][0]]
        lines[0] = lines[0][self.result[1][1]:]
        if len(lines) == 1:
            # Start and end on the same line: the end offset has to be
            # reduced by the characters already cut off the front.
            lines[-1] = lines[-1][:self.result[2][1]-self.result[1][1]]
        lines[-1] = lines[-1][:self.result[2][1]]
        return '\n'.join(lines).strip()
# Hack for https://github.com/rg3/youtube-dl/issues/662
# On interpreters older than 2.7.3, parse_endtag is overridden so that the
# literal text </scr'+'ipt> is stepped over (its length is added to the
# position) instead of being handed to the stock end-tag parser.
if sys.version_info < (2, 7, 3):
    AttrParser.parse_endtag = (lambda self, i:
        i + len("</scr'+'ipt>")
        if self.rawdata[i:].startswith("</scr'+'ipt>")
        else compat_html_parser.HTMLParser.parse_endtag(self, i))
 350
 351 def get_element_by_id(id, html):
 352     """Return the content of the tag with the specified ID in the passed HTML document"""
 353     return get_element_by_attribute("id", id, html)
 354
 355 def get_element_by_attribute(attribute, value, html):
 356     """Return the content of the tag with the specified attribute in the passed HTML document"""
 357     parser = AttrParser(attribute, value)
 358     try:
 359         parser.loads(html)
 360     except compat_html_parser.HTMLParseError:
 361         pass
 362     return parser.get_result()
 363
 364 class MetaParser(BaseHTMLParser):
 365     """
 366     Modified HTMLParser that isolates a meta tag with the specified name
 367     attribute.
 368     """
 369     def __init__(self, name):
 370         BaseHTMLParser.__init__(self)
 371         self.name = name
 372         self.content = None
 373         self.result = None
 374
 375     def handle_starttag(self, tag, attrs):
 376         if tag != 'meta':
 377             return
 378         attrs = dict(attrs)
 379         if attrs.get('name') == self.name:
 380             self.result = attrs.get('content')
 381
 382     def get_result(self):
 383         return self.result
 384
 385 def get_meta_content(name, html):
 386     """
 387     Return the content attribute from the meta tag with the given name attribute.
 388     """
 389     parser = MetaParser(name)
 390     try:
 391         parser.loads(html)
 392     except compat_html_parser.HTMLParseError:
 393         pass
 394     return parser.get_result()
 395
 396
 397 def clean_html(html):
 398     """Clean an HTML snippet into a readable string"""
 399     # Newline vs <br />
 400     html = html.replace('\n', ' ')
 401     html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
 402     html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
 403     # Strip html tags
 404     html = re.sub('<.*?>', '', html)
 405     # Replace html entities
 406     html = unescapeHTML(html)
 407     return html.strip()
 408
 409
 410 def sanitize_open(filename, open_mode):
 411     """Try to open the given filename, and slightly tweak it if this fails.
 412
 413     Attempts to open the given filename. If this fails, it tries to change
 414     the filename slightly, step by step, until it's either able to open it
 415     or it fails and raises a final exception, like the standard open()
 416     function.
 417
 418     It returns the tuple (stream, definitive_file_name).
 419     """
 420     try:
 421         if filename == u'-':
 422             if sys.platform == 'win32':
 423                 import msvcrt
 424                 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
 425             return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
 426         stream = open(encodeFilename(filename), open_mode)
 427         return (stream, filename)
 428     except (IOError, OSError) as err:
 429         if err.errno in (errno.EACCES,):
 430             raise
 431
 432         # In case of error, try to remove win32 forbidden chars
 433         alt_filename = os.path.join(
 434                         re.sub(u'[/<>:"\\|\\\\?\\*]', u'#', path_part)
 435                         for path_part in os.path.split(filename)
 436                        )
 437         if alt_filename == filename:
 438             raise
 439         else:
 440             # An exception here should be caught in the caller
 441             stream = open(encodeFilename(filename), open_mode)
 442             return (stream, alt_filename)
 443
 444
 445 def timeconvert(timestr):
 446     """Convert RFC 2822 defined time string into system timestamp"""
 447     timestamp = None
 448     timetuple = email.utils.parsedate_tz(timestr)
 449     if timetuple is not None:
 450         timestamp = email.utils.mktime_tz(timetuple)
 451     return timestamp
 452
 453 def sanitize_filename(s, restricted=False, is_id=False):
 454     """Sanitizes a string so it could be used as part of a filename.
 455     If restricted is set, use a stricter subset of allowed characters.
 456     Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
 457     """
 458     def replace_insane(char):
 459         if char == '?' or ord(char) < 32 or ord(char) == 127:
 460             return ''
 461         elif char == '"':
 462             return '' if restricted else '\''
 463         elif char == ':':
 464             return '_-' if restricted else ' -'
 465         elif char in '\\/|*<>':
 466             return '_'
 467         if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
 468             return '_'
 469         if restricted and ord(char) > 127:
 470             return '_'
 471         return char
 472
 473     result = u''.join(map(replace_insane, s))
 474     if not is_id:
 475         while '__' in result:
 476             result = result.replace('__', '_')
 477         result = result.strip('_')
 478         # Common case of "Foreign band name - English song title"
 479         if restricted and result.startswith('-_'):
 480             result = result[2:]
 481         if not result:
 482             result = '_'
 483     return result
 484
 485 def orderedSet(iterable):
 486     """ Remove all duplicates from the input iterable """
 487     res = []
 488     for el in iterable:
 489         if el not in res:
 490             res.append(el)
 491     return res
 492
 493 def unescapeHTML(s):
 494     """
 495     @param s a string
 496     """
 497     assert type(s) == type(u'')
 498
 499     result = re.sub(u'(?u)&(.+?);', htmlentity_transform, s)
 500     return result
 501
 502 def encodeFilename(s):
 503     """
 504     @param s The name of the file
 505     """
 506
 507     assert type(s) == type(u'')
 508
 509     # Python 3 has a Unicode API
 510     if sys.version_info >= (3, 0):
 511         return s
 512
 513     if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
 514         # Pass u'' directly to use Unicode APIs on Windows 2000 and up
 515         # (Detecting Windows NT 4 is tricky because 'major >= 4' would
 516         # match Windows 9x series as well. Besides, NT 4 is obsolete.)
 517         return s
 518     else:
 519         encoding = sys.getfilesystemencoding()
 520         if encoding is None:
 521             encoding = 'utf-8'
 522         return s.encode(encoding, 'ignore')
 523
 524 def decodeOption(optval):
 525     if optval is None:
 526         return optval
 527     if isinstance(optval, bytes):
 528         optval = optval.decode(preferredencoding())
 529
 530     assert isinstance(optval, compat_str)
 531     return optval
 532
 533 def formatSeconds(secs):
 534     if secs > 3600:
 535         return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
 536     elif secs > 60:
 537         return '%d:%02d' % (secs // 60, secs % 60)
 538     else:
 539         return '%d' % secs
 540
def make_HTTPS_handler(opts_no_check_certificate):
    """Build the HTTPS handler object for the running interpreter.

    Before Python 3.2 a handler with a custom connection class is returned;
    opts_no_check_certificate is not consulted on that path. Otherwise an
    HTTPSHandler with an SSLContext is returned whose verify mode is
    CERT_NONE when opts_no_check_certificate is true and CERT_REQUIRED
    otherwise.
    """
    if sys.version_info < (3, 2):
        import httplib

        class HTTPSConnectionV3(httplib.HTTPSConnection):
            def __init__(self, *args, **kwargs):
                httplib.HTTPSConnection.__init__(self, *args, **kwargs)

            def connect(self):
                sock = socket.create_connection((self.host, self.port), self.timeout)
                if getattr(self, '_tunnel_host', False):
                    self.sock = sock
                    self._tunnel()
                # Try SSLv3 first and retry with SSLv23 if wrapping fails.
                try:
                    self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_SSLv3)
                except ssl.SSLError:
                    self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_SSLv23)

        class HTTPSHandlerV3(compat_urllib_request.HTTPSHandler):
            def https_open(self, req):
                return self.do_open(HTTPSConnectionV3, req)
        return HTTPSHandlerV3()
    else:
        # NOTE(review): relies on ssl.PROTOCOL_SSLv3 being provided by the
        # ssl module in use -- confirm for the targeted interpreters.
        context = ssl.SSLContext(ssl.PROTOCOL_SSLv3)
        context.verify_mode = (ssl.CERT_NONE
                               if opts_no_check_certificate
                               else ssl.CERT_REQUIRED)
        context.set_default_verify_paths()
        try:
            context.load_default_certs()
        except AttributeError:
            pass  # Python < 3.4
        return compat_urllib_request.HTTPSHandler(context=context)
 574
 575 class ExtractorError(Exception):
 576     """Error during info extraction."""
 577     def __init__(self, msg, tb=None, expected=False, cause=None):
 578         """ tb, if given, is the original traceback (so that it can be printed out).
 579         If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
 580         """
 581
 582         if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
 583             expected = True
 584         if not expected:
 585             msg = msg + u'; please report this issue on https://yt-dl.org/bug . Be sure to call youtube-dl with the --verbose flag and include its complete output. Make sure you are using the latest version; type  youtube-dl -U  to update.'
 586         super(ExtractorError, self).__init__(msg)
 587
 588         self.traceback = tb
 589         self.exc_info = sys.exc_info()  # preserve original exception
 590         self.cause = cause
 591
 592     def format_traceback(self):
 593         if self.traceback is None:
 594             return None
 595         return u''.join(traceback.format_tb(self.traceback))
 596
 597
 598 class RegexNotFoundError(ExtractorError):
 599     """Error when a regex didn't match"""
 600     pass
 601
 602
 603 class DownloadError(Exception):
 604     """Download Error exception.
 605
 606     This exception may be thrown by FileDownloader objects if they are not
 607     configured to continue on errors. They will contain the appropriate
 608     error message.
 609     """
 610     def __init__(self, msg, exc_info=None):
 611         """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
 612         super(DownloadError, self).__init__(msg)
 613         self.exc_info = exc_info
 614
 615
 616 class SameFileError(Exception):
 617     """Same File exception.
 618
 619     This exception will be thrown by FileDownloader objects if they detect
 620     multiple files would have to be downloaded to the same file on disk.
 621     """
 622     pass
 623
 624
 625 class PostProcessingError(Exception):
 626     """Post Processing exception.
 627
 628     This exception may be raised by PostProcessor's .run() method to
 629     indicate an error in the postprocessing task.
 630     """
 631     def __init__(self, msg):
 632         self.msg = msg
 633
 634 class MaxDownloadsReached(Exception):
 635     """ --max-downloads limit has been reached. """
 636     pass
 637
 638
 639 class UnavailableVideoError(Exception):
 640     """Unavailable Format exception.
 641
 642     This exception will be thrown when a video is requested
 643     in a format that is not available for that video.
 644     """
 645     pass
 646
 647
 648 class ContentTooShortError(Exception):
 649     """Content Too Short exception.
 650
 651     This exception may be raised by FileDownloader objects when a file they
 652     download is too small for what the server announced first, indicating
 653     the connection was probably interrupted.
 654     """
 655     # Both in bytes
 656     downloaded = None
 657     expected = None
 658
 659     def __init__(self, downloaded, expected):
 660         self.downloaded = downloaded
 661         self.expected = expected
 662
class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-No-Compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    @staticmethod
    def deflate(data):
        # Try the data as a raw deflate stream first (negative window bits);
        # if zlib rejects it, decompress with zlib's default settings.
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def addinfourl_wrapper(stream, headers, url, code):
        # Build an addinfourl response. When the class has no getcode
        # attribute, the constructor is called without the status code and
        # .code is set on the instance afterwards.
        if hasattr(compat_urllib_request.addinfourl, 'getcode'):
            return compat_urllib_request.addinfourl(stream, headers, url, code)
        ret = compat_urllib_request.addinfourl(stream, headers, url)
        ret.code = code
        return ret

    def http_request(self, req):
        # Apply the standard headers, replacing same-named entries.
        for h,v in std_headers.items():
            if h in req.headers:
                del req.headers[h]
            req.add_header(h, v)
        # Internal marker header: drop Accept-encoding and the marker itself.
        if 'Youtubedl-no-compression' in req.headers:
            if 'Accept-encoding' in req.headers:
                del req.headers['Accept-encoding']
            del req.headers['Youtubedl-no-compression']
        # Internal marker header: its value becomes the User-agent.
        if 'Youtubedl-user-agent' in req.headers:
            if 'User-agent' in req.headers:
                del req.headers['User-agent']
            req.headers['User-agent'] = req.headers['Youtubedl-user-agent']
            del req.headers['Youtubedl-user-agent']
        return req

    def http_response(self, req, resp):
        # Returns resp itself unless its Content-encoding is gzip or
        # deflate; in that case a new response object wrapping the
        # decompressed body (with the original headers, url, code and msg)
        # is returned.
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
            try:
                uncompressed = io.BytesIO(gz.read())
            except IOError as original_ioerror:
                # There may be junk at the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                # Retry with 1..1023 trailing bytes removed; the first
                # attempt that decompresses wins.
                for i in range(1, 1024):
                    try:
                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
                        uncompressed = io.BytesIO(gz.read())
                    except IOError:
                        continue
                    break
                else:
                    # No truncation helped: report the first failure.
                    raise original_ioerror
            resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = io.BytesIO(self.deflate(resp.read()))
            resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        return resp

    # HTTPS traffic gets the same treatment.
    https_request = http_request
    https_response = http_response
 743
 744 def unified_strdate(date_str):
 745     """Return a string with the date in the format YYYYMMDD"""
 746     upload_date = None
 747     #Replace commas
 748     date_str = date_str.replace(',',' ')
 749     # %z (UTC offset) is only supported in python>=3.2
 750     date_str = re.sub(r' (\+|-)[\d]*$', '', date_str)
 751     format_expressions = [
 752         '%d %B %Y',
 753         '%B %d %Y',
 754         '%b %d %Y',
 755         '%Y-%m-%d',
 756         '%d/%m/%Y',
 757         '%Y/%m/%d %H:%M:%S',
 758         '%d.%m.%Y %H:%M',
 759         '%Y-%m-%dT%H:%M:%SZ',
 760         '%Y-%m-%dT%H:%M:%S.%fZ',
 761         '%Y-%m-%dT%H:%M:%S.%f0Z',
 762         '%Y-%m-%dT%H:%M:%S',
 763     ]
 764     for expression in format_expressions:
 765         try:
 766             upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
 767         except:
 768             pass
 769     return upload_date
 770
 771 def determine_ext(url, default_ext=u'unknown_video'):
 772     guess = url.partition(u'?')[0].rpartition(u'.')[2]
 773     if re.match(r'^[A-Za-z0-9]+$', guess):
 774         return guess
 775     else:
 776         return default_ext
 777
 778 def subtitles_filename(filename, sub_lang, sub_format):
 779     return filename.rsplit('.', 1)[0] + u'.' + sub_lang + u'.' + sub_format
 780
 781 def date_from_str(date_str):
 782     """
 783     Return a datetime object from a string in the format YYYYMMDD or
 784     (now|today)[+-][0-9](day|week|month|year)(s)?"""
 785     today = datetime.date.today()
 786     if date_str == 'now'or date_str == 'today':
 787         return today
 788     match = re.match('(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
 789     if match is not None:
 790         sign = match.group('sign')
 791         time = int(match.group('time'))
 792         if sign == '-':
 793             time = -time
 794         unit = match.group('unit')
 795         #A bad aproximation?
 796         if unit == 'month':
 797             unit = 'day'
 798             time *= 30
 799         elif unit == 'year':
 800             unit = 'day'
 801             time *= 365
 802         unit += 's'
 803         delta = datetime.timedelta(**{unit: time})
 804         return today + delta
 805     return datetime.datetime.strptime(date_str, "%Y%m%d").date()
 806
 807 class DateRange(object):
 808     """Represents a time interval between two dates"""
 809     def __init__(self, start=None, end=None):
 810         """start and end must be strings in the format accepted by date"""
 811         if start is not None:
 812             self.start = date_from_str(start)
 813         else:
 814             self.start = datetime.datetime.min.date()
 815         if end is not None:
 816             self.end = date_from_str(end)
 817         else:
 818             self.end = datetime.datetime.max.date()
 819         if self.start > self.end:
 820             raise ValueError('Date range: "%s" , the start date must be before the end date' % self)
 821     @classmethod
 822     def day(cls, day):
 823         """Returns a range that only contains the given day"""
 824         return cls(day,day)
 825     def __contains__(self, date):
 826         """Check if the date is in the range"""
 827         if not isinstance(date, datetime.date):
 828             date = date_from_str(date)
 829         return self.start <= date <= self.end
 830     def __str__(self):
 831         return '%s - %s' % ( self.start.isoformat(), self.end.isoformat())
 832
 833
 834 def platform_name():
 835     """ Returns the platform name as a compat_str """
 836     res = platform.platform()
 837     if isinstance(res, bytes):
 838         res = res.decode(preferredencoding())
 839
 840     assert isinstance(res, compat_str)
 841     return res
 842
 843
 844 def write_string(s, out=None):
 845     if out is None:
 846         out = sys.stderr
 847     assert type(s) == type(u'')
 848
 849     if ('b' in getattr(out, 'mode', '') or
 850             sys.version_info[0] < 3):  # Python 2 lies about mode of sys.stderr
 851         s = s.encode(preferredencoding(), 'ignore')
 852     out.write(s)
 853     out.flush()
 854
 855
 856 def bytes_to_intlist(bs):
 857     if not bs:
 858         return []
 859     if isinstance(bs[0], int):  # Python 3
 860         return list(bs)
 861     else:
 862         return [ord(c) for c in bs]
 863
 864
 865 def intlist_to_bytes(xs):
 866     if not xs:
 867         return b''
 868     if isinstance(chr(0), bytes):  # Python 2
 869         return ''.join([chr(x) for x in xs])
 870     else:
 871         return bytes(xs)
 872
 873
 874 def get_cachedir(params={}):
 875     cache_root = os.environ.get('XDG_CACHE_HOME',
 876                                 os.path.expanduser('~/.cache'))
 877     return params.get('cachedir', os.path.join(cache_root, 'youtube-dl'))
 878
 879
# Cross-platform file locking
#
# Defines _lock_file(f, exclusive) and _unlock_file(f) for the running
# platform: LockFileEx/UnlockFileEx from kernel32 (through ctypes) on Windows,
# fcntl.lockf everywhere else.
if sys.platform == 'win32':
    import ctypes.wintypes
    import msvcrt

    class OVERLAPPED(ctypes.Structure):
        """ctypes layout of the structure passed as the last argument of
        LockFileEx/UnlockFileEx."""
        _fields_ = [
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),
        ]

    kernel32 = ctypes.windll.kernel32
    # Declare the prototypes explicitly so ctypes converts arguments and
    # return values with the right types
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToUnlockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToUnlockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    # Length of the byte range to (un)lock, split into its low and high
    # DWORDs; the same values are used for locking and unlocking.
    # NOTE(review): the names suggest this is meant to cover the whole file
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive):
        """Lock the open file f with LockFileEx, exclusively if exclusive is
        true and shared otherwise.  Raises OSError if the call fails."""
        # The locked range starts at offset 0
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        # Kept on the file object so that _unlock_file can hand the same
        # structure to UnlockFileEx
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)
        handle = msvcrt.get_osfhandle(f.fileno())
        # dwFlags: 0x2 asks for an exclusive lock, 0x0 for a shared one
        if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
                          whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Locking file failed: %r' % ctypes.FormatError())

    def _unlock_file(f):
        """Release the lock taken by _lock_file on f.  Raises OSError if
        UnlockFileEx fails."""
        # Only valid on a file that went through _lock_file before
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        if not UnlockFileEx(handle, 0,
                            whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())

else:
    import fcntl

    def _lock_file(f, exclusive):
        """Lock the open file f with fcntl.lockf: LOCK_EX if exclusive is
        true, LOCK_SH otherwise (LOCK_NB is not set)."""
        fcntl.lockf(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)

    def _unlock_file(f):
        """Release the lock held on f (fcntl.lockf with LOCK_UN)."""
        fcntl.lockf(f, fcntl.LOCK_UN)
 943
 944
 945 class locked_file(object):
 946     def __init__(self, filename, mode, encoding=None):
 947         assert mode in ['r', 'a', 'w']
 948         self.f = io.open(filename, mode, encoding=encoding)
 949         self.mode = mode
 950
 951     def __enter__(self):
 952         exclusive = self.mode != 'r'
 953         try:
 954             _lock_file(self.f, exclusive)
 955         except IOError:
 956             self.f.close()
 957             raise
 958         return self
 959
 960     def __exit__(self, etype, value, traceback):
 961         try:
 962             _unlock_file(self.f)
 963         finally:
 964             self.f.close()
 965
 966     def __iter__(self):
 967         return iter(self.f)
 968
 969     def write(self, *args):
 970         return self.f.write(*args)
 971
 972     def read(self, *args):
 973         return self.f.read(*args)
 974
 975
 976 def shell_quote(args):
 977     quoted_args = []
 978     encoding = sys.getfilesystemencoding()
 979     if encoding is None:
 980         encoding = 'utf-8'
 981     for a in args:
 982         if isinstance(a, bytes):
 983             # We may get a filename encoded with 'encodeFilename'
 984             a = a.decode(encoding)
 985         quoted_args.append(pipes.quote(a))
 986     return u' '.join(quoted_args)
 987
 988
 989 def takewhile_inclusive(pred, seq):
 990     """ Like itertools.takewhile, but include the latest evaluated element
 991         (the first element so that Not pred(e)) """
 992     for e in seq:
 993         yield e
 994         if not pred(e):
 995             return
 996
 997
 998 def smuggle_url(url, data):
 999     """ Pass additional data in a URL for internal use. """
1000
1001     sdata = compat_urllib_parse.urlencode(
1002         {u'__youtubedl_smuggle': json.dumps(data)})
1003     return url + u'#' + sdata
1004
1005
def unsmuggle_url(smug_url):
    """ Split a URL built by smuggle_url into (url, data).

    Returns (smug_url, None) when the URL carries no smuggled data.
    """
    if '#__youtubedl_smuggle' in smug_url:
        # The smuggled payload is everything after the last '#'
        url, _, sdata = smug_url.rpartition(u'#')
        encoded = compat_parse_qs(sdata)[u'__youtubedl_smuggle'][0]
        return url, json.loads(encoded)
    return smug_url, None
1013
1014
def format_bytes(bytes):
    """ Format a byte count as a human-readable string with a binary suffix.

    bytes may be a number, a numeric string, or None (which gives u'N/A').
    The value is scaled by powers of 1024 and shown with two decimals,
    e.g. 1536 -> u'1.50KiB'.  Values below 1024 stay in u'B' and values
    beyond the largest suffix are expressed in u'YiB'.
    """
    if bytes is None:
        return u'N/A'
    if isinstance(bytes, (str, type(u''))):
        bytes = float(bytes)

    suffixes = [u'B', u'KiB', u'MiB', u'GiB', u'TiB', u'PiB', u'EiB', u'ZiB', u'YiB']
    converted = float(bytes)
    exponent = 0
    # Repeated division by 1024 is exact for floats, so exact powers land in
    # the right bucket, and the index can never leave the suffix list
    while converted >= 1024.0 and exponent < len(suffixes) - 1:
        converted /= 1024.0
        exponent += 1
    return u'%.2f%s' % (converted, suffixes[exponent])
1027
1028
def str_to_int(int_str):
    """ Parse an integer written with ',' or '.' between its digits,
    e.g. '1,234,567' """
    digits_only = re.sub(r'[,\.]', u'', int_str)
    return int(digits_only)
1032
1033
def get_term_width():
    """ Return the terminal width in columns, or None if it is unknown.

    A numeric COLUMNS environment variable takes precedence; otherwise the
    output of `stty size` is used.  Detection is best-effort: failures of
    either source result in None instead of an exception.
    """
    columns = os.environ.get('COLUMNS', None)
    if columns:
        try:
            return int(columns)
        except ValueError:
            pass  # COLUMNS is not a number, ask stty instead

    try:
        sp = subprocess.Popen(
            ['stty', 'size'],
            stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        out, _ = sp.communicate()
        return int(out.split()[1])
    except (OSError, ValueError, IndexError):
        # stty missing or not runnable, or its output is empty/unparsable
        # (e.g. when not attached to a terminal)
        return None
1048
1049
def month_by_name(name):
    """ Return the number of a month by (locale-independently) English name

    name must be the full, capitalized month name (e.g. u'March').
    Returns an int from 1 to 12, or None if name is not a month name.
    """

    ENGLISH_NAMES = [
        u'January', u'February', u'March', u'April', u'May', u'June',
        u'July', u'August', u'September', u'October', u'November', u'December']
    try:
        return ENGLISH_NAMES.index(name) + 1
    except ValueError:
        return None
1060
1061
def fix_xml_all_ampersand(xml_str):
    """Escape every '&' in the XML string as '&amp;' (including those that
    already start an entity)"""
    pieces = xml_str.split(u'&')
    return u'&amp;'.join(pieces)