2 # -*- coding: utf-8 -*-
21 import urllib.request as compat_urllib_request
22 except ImportError: # Python 2
23 import urllib2 as compat_urllib_request
26 import urllib.error as compat_urllib_error
27 except ImportError: # Python 2
28 import urllib2 as compat_urllib_error
31 import urllib.parse as compat_urllib_parse
32 except ImportError: # Python 2
33 import urllib as compat_urllib_parse
36 from urllib.parse import urlparse as compat_urllib_parse_urlparse
37 except ImportError: # Python 2
38 from urlparse import urlparse as compat_urllib_parse_urlparse
41 import urllib.parse as compat_urlparse
42 except ImportError: # Python 2
43 import urlparse as compat_urlparse
46 import http.cookiejar as compat_cookiejar
47 except ImportError: # Python 2
48 import cookielib as compat_cookiejar
51 import html.entities as compat_html_entities
52 except ImportError: # Python 2
53 import htmlentitydefs as compat_html_entities
56 import html.parser as compat_html_parser
57 except ImportError: # Python 2
58 import HTMLParser as compat_html_parser
61 import http.client as compat_http_client
62 except ImportError: # Python 2
63 import httplib as compat_http_client
66 from urllib.error import HTTPError as compat_HTTPError
67 except ImportError: # Python 2
68 from urllib2 import HTTPError as compat_HTTPError
71 from urllib.request import urlretrieve as compat_urlretrieve
72 except ImportError: # Python 2
73 from urllib import urlretrieve as compat_urlretrieve
77 from subprocess import DEVNULL
78 compat_subprocess_get_DEVNULL = lambda: DEVNULL
80 compat_subprocess_get_DEVNULL = lambda: open(os.path.devnull, 'w')
83 from urllib.parse import parse_qs as compat_parse_qs
84 except ImportError: # Python 2
85 # HACK: The following is the correct parse_qs implementation from cpython 3's stdlib.
86 # Python 2's version is apparently totally broken
    # Backport of CPython 3's urllib.parse.unquote for Python 2, whose own
    # parse_qs/unquote is broken.  Percent-decodes `string` using `encoding`.
    # NOTE(review): several lines of this body are not visible in this view.
    def _unquote(string, encoding='utf-8', errors='replace'):
        # Split on '%'; every element after the first starts with two hex digits.
        res = string.split('%')
        # pct_sequence: contiguous sequence of percent-encoded bytes, decoded
                # Python-2-only: str.decode('hex') turns two hex chars into one byte.
                pct_sequence += item[:2].decode('hex')
                    # This segment was just a single percent-encoded character.
                    # May be part of a sequence of code units, so delay decoding.
                    # (Stored in pct_sequence).
            # Encountered non-percent-encoded characters. Flush the current
            string += pct_sequence.decode(encoding, errors) + rest
            # Flush the final pct_sequence
            string += pct_sequence.decode(encoding, errors)
    # Backport of CPython 3's urllib.parse.parse_qsl for Python 2.
    # Returns a list of (name, value) unicode pairs parsed from `qs`.
    def _parse_qsl(qs, keep_blank_values=False, strict_parsing=False,
                encoding='utf-8', errors='replace'):
        # Coerce results to unicode (this branch only runs on Python 2).
        qs, _coerce_result = qs, unicode
        # Both '&' and ';' are accepted as pair separators.
        pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')]
        for name_value in pairs:
            # Skip empty fragments unless the caller demands strictness.
            if not name_value and not strict_parsing:
                nv = name_value.split('=', 1)
                    raise ValueError("bad query field: %r" % (name_value,))
                # Handle case of a control-name with no equal sign
                if keep_blank_values:
            # Only keep pairs with a value, unless blanks were requested.
            if len(nv[1]) or keep_blank_values:
                name = nv[0].replace('+', ' ')
                name = _unquote(name, encoding=encoding, errors=errors)
                name = _coerce_result(name)
                value = nv[1].replace('+', ' ')
                value = _unquote(value, encoding=encoding, errors=errors)
                value = _coerce_result(value)
                r.append((name, value))
    # Backport of urllib.parse.parse_qs: maps each name to the LIST of all
    # values that appeared for it in the query string.
    def compat_parse_qs(qs, keep_blank_values=False, strict_parsing=False,
                encoding='utf-8', errors='replace'):
        pairs = _parse_qsl(qs, keep_blank_values, strict_parsing,
                encoding=encoding, errors=errors)
        for name, value in pairs:
            # Repeated names accumulate into the same list.
            if name in parsed_result:
                parsed_result[name].append(value)
                parsed_result[name] = [value]
    compat_str = unicode  # Python 2 text type; the py3 fallback assigns str
    compat_chr = unichr  # Python 2 unicode-codepoint constructor
    # compat_ord: accept either an int (py3 bytes item) or a 1-char string.
    if type(c) is int: return c
# This is not clearly defined otherwise
compiled_regex_type = type(re.compile(''))
    # Default headers sent with every HTTP request (see YoutubeDLHandler).
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0 (Chrome)',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks.
        # NOTE(review): surrounding try/except (falling back to UTF-8) is
        # elided in this view.
        pref = locale.getpreferredencoding()
# compat_print: on Python 2, encode before printing; on 3, print unicode as-is.
if sys.version_info < (3,0):
        print(s.encode(preferredencoding(), 'xmlcharrefreplace'))
        assert type(s) == type(u'')
# In Python 2.x, json.dump expects a bytestream.
# In Python 3.x, it writes to a character stream
if sys.version_info < (3,0):
    def write_json_file(obj, fn):
        # Binary mode: py2 json.dump emits bytes.
        with open(fn, 'wb') as f:
    def write_json_file(obj, fn):
        # Text mode with explicit UTF-8 on Python 3.
        with open(fn, 'w', encoding='utf-8') as f:
if sys.version_info >= (2,7):
    def find_xpath_attr(node, xpath, key, val):
        """ Find the xpath xpath[@key=val] """
        # assert-based validation keeps the interpolated expression safe;
        # note these asserts vanish under `python -O`.
        assert re.match(r'^[a-zA-Z]+$', key)
        assert re.match(r'^[a-zA-Z0-9@\s]*$', val)
        expr = xpath + u"[@%s='%s']" % (key, val)
        return node.find(expr)
    # Pre-2.7 ElementTree lacks [@attr='val'] predicates: scan manually.
    def find_xpath_attr(node, xpath, key, val):
        for f in node.findall(xpath):
            if f.attrib.get(key) == val:
# On python2.6 the xml.etree.ElementTree.Element methods don't support
# the namespace parameter
def xpath_with_ns(path, ns_map):
    """Expand 'prefix:tag' components of an XPath using ns_map to the
    '{uri}tag' form ElementTree understands."""
    components = [c.split(':') for c in path.split('/')]
            # No prefix: keep the component as-is.
            replaced.append(c[0])
            # Prefixed: substitute the full namespace URI from ns_map.
            replaced.append('{%s}%s' % (ns_map[ns], tag))
    return '/'.join(replaced)
def htmlentity_transform(matchobj):
    """Transforms an HTML entity to a character.

    This function receives a match object and is intended to be used with
    the re.sub() function.
    entity = matchobj.group(1)

    # Known non-numeric HTML entity
    if entity in compat_html_entities.name2codepoint:
        return compat_chr(compat_html_entities.name2codepoint[entity])

    # NOTE(review): \d does not match hex digits a-f, so entities such as
    # &#xe9; fail this pattern and fall through to the literal return —
    # looks like a latent bug; confirm intended behavior.
    mobj = re.match(u'(?u)#(x?\\d+)', entity)
        numstr = mobj.group(1)
        if numstr.startswith(u'x'):
            # Prefix with '0' so int() sees '0x…' for base-16 parsing.
            numstr = u'0%s' % numstr
        return compat_chr(int(numstr, base))

    # Unknown entity in name, return its literal representation
    return (u'&%s;' % entity)
# Monkey-patch the stdlib start-tag regex with an upstream bugfix backport.
compat_html_parser.locatestarttagend = re.compile(r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re.VERBOSE) # backport bugfix
class BaseHTMLParser(compat_html_parser.HTMLParser):
        compat_html_parser.HTMLParser.__init__(self)

    def loads(self, html):
class AttrParser(BaseHTMLParser):
    """Modified HTMLParser that isolates a tag with the specified attribute"""
    def __init__(self, attribute, value):
        # The attribute/value pair identifying the tag to isolate.
        self.attribute = attribute
        self.watch_startpos = False
        BaseHTMLParser.__init__(self)

    def error(self, message):
        # Give up after repeated parse errors, or once the wanted tag started.
        if self.error_count > 10 or self.started:
            raise compat_html_parser.HTMLParseError(message, self.getpos())
        self.rawdata = '\n'.join(self.html.split('\n')[self.getpos()[0]:]) # skip one line
        self.error_count += 1

    def handle_starttag(self, tag, attrs):
            self.find_startpos(None)
        if self.attribute in attrs and attrs[self.attribute] == self.value:
            # Found the target tag: remember to record its start position.
            self.watch_startpos = True
            # Track nesting depth per tag name so the matching close is found.
            if not tag in self.depth: self.depth[tag] = 0

    def handle_endtag(self, tag):
            if tag in self.depth: self.depth[tag] -= 1
            # Depth back to zero: this end tag closes the isolated element.
            if self.depth[self.result[0]] == 0:
                self.result.append(self.getpos())

    def find_startpos(self, x):
        """Needed to put the start position of the result (self.result[1])
        after the opening tag with the requested id"""
        if self.watch_startpos:
            self.watch_startpos = False
            self.result.append(self.getpos())
    handle_entityref = handle_charref = handle_data = handle_comment = \
    handle_decl = handle_pi = unknown_decl = find_startpos

    def get_result(self):
        # result holds [tag, startpos, endpos]; anything else means no match.
        if self.result is None:
        if len(self.result) != 3:
        lines = self.html.split('\n')
        lines = lines[self.result[1][0]-1:self.result[2][0]]
        lines[0] = lines[0][self.result[1][1]:]
            lines[-1] = lines[-1][:self.result[2][1]-self.result[1][1]]
        lines[-1] = lines[-1][:self.result[2][1]]
        return '\n'.join(lines).strip()
# Hack for https://github.com/rg3/youtube-dl/issues/662
if sys.version_info < (2, 7, 3):
    AttrParser.parse_endtag = (lambda self, i:
        i + len("</scr'+'ipt>")
        if self.rawdata[i:].startswith("</scr'+'ipt>")
        else compat_html_parser.HTMLParser.parse_endtag(self, i))
def get_element_by_id(id, html):
    """Extract the content of the element whose id attribute equals *id*.

    Thin convenience wrapper around get_element_by_attribute().
    """
    attribute = "id"
    return get_element_by_attribute(attribute, id, html)
def get_element_by_attribute(attribute, value, html):
    """Return the content of the tag with the specified attribute in the passed HTML document"""
    parser = AttrParser(attribute, value)
    # Parse errors are tolerated: whatever was isolated so far is returned.
    except compat_html_parser.HTMLParseError:
    return parser.get_result()
class MetaParser(BaseHTMLParser):
    """
    Modified HTMLParser that isolates a meta tag with the specified name
    def __init__(self, name):
        BaseHTMLParser.__init__(self)

    def handle_starttag(self, tag, attrs):
        # Record the content attribute of the first matching <meta> tag.
        if attrs.get('name') == self.name:
            self.result = attrs.get('content')

    def get_result(self):

def get_meta_content(name, html):
    """
    Return the content attribute from the meta tag with the given name attribute.
    parser = MetaParser(name)
    # Tolerate malformed HTML; return whatever was captured.
    except compat_html_parser.HTMLParseError:
    return parser.get_result()
def clean_html(html):
    """Clean an HTML snippet into a readable string"""
    # Collapse newlines, then turn <br> and paragraph breaks into '\n'.
    html = html.replace('\n', ' ')
    html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
    html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
    # Strip all remaining tags (non-greedy, so nested text survives).
    html = re.sub('<.*?>', '', html)
    # Replace html entities
    html = unescapeHTML(html)
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()

    It returns the tuple (stream, definitive_file_name).
            # '-' means stdout; switch it to binary mode on Windows.
            if sys.platform == 'win32':
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
            return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
        stream = open(encodeFilename(filename), open_mode)
        return (stream, filename)
    except (IOError, OSError) as err:
        # Permission errors cannot be fixed by renaming: re-raise.
        if err.errno in (errno.EACCES,):

        # In case of error, try to remove win32 forbidden chars
        # NOTE(review): a generator is passed as the single argument of
        # os.path.join here — confirm this actually joins path parts.
        alt_filename = os.path.join(
                        re.sub(u'[/<>:"\\|\\\\?\\*]', u'#', path_part)
                        for path_part in os.path.split(filename)
        if alt_filename == filename:
            # An exception here should be caught in the caller
            # NOTE(review): this opens `filename`, not `alt_filename`, yet
            # returns alt_filename — looks like a bug; verify intent.
            stream = open(encodeFilename(filename), open_mode)
            return (stream, alt_filename)
def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp"""
    # parsedate_tz returns None for unparseable input; propagate that.
    timetuple = email.utils.parsedate_tz(timestr)
    if timetuple is not None:
        timestamp = email.utils.mktime_tz(timetuple)
def sanitize_filename(s, restricted=False, is_id=False):
    """Sanitizes a string so it could be used as part of a filename.
    If restricted is set, use a stricter subset of allowed characters.
    Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
    def replace_insane(char):
        # Control characters and '?' are always dropped.
        if char == '?' or ord(char) < 32 or ord(char) == 127:
            return '' if restricted else '\''
            return '_-' if restricted else ' -'
        elif char in '\\/|*<>':
        # Restricted mode also rejects shell-special and whitespace chars …
        if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
        # … and anything outside ASCII.
        if restricted and ord(char) > 127:

    result = u''.join(map(replace_insane, s))
        # Collapse runs of underscores introduced by the replacements.
        while '__' in result:
            result = result.replace('__', '_')
        result = result.strip('_')
        # Common case of "Foreign band name - English song title"
        if restricted and result.startswith('-_'):
def orderedSet(iterable):
    """ Remove all duplicates from the input iterable """
    # unescapeHTML body (def line not visible here): expects unicode input …
    assert type(s) == type(u'')
    # … and substitutes every &entity; via htmlentity_transform.
    result = re.sub(u'(?u)&(.+?);', htmlentity_transform, s)
def encodeFilename(s):
    """
    @param s The name of the file
    # Only unicode input is accepted.
    assert type(s) == type(u'')

    # Python 3 has a Unicode API
    if sys.version_info >= (3, 0):

    if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
        # Pass u'' directly to use Unicode APIs on Windows 2000 and up
        # (Detecting Windows NT 4 is tricky because 'major >= 4' would
        # match Windows 9x series as well. Besides, NT 4 is obsolete.)
        # Fall back to the filesystem encoding (UTF-8 if undetectable).
        encoding = sys.getfilesystemencoding()
        return s.encode(encoding, 'ignore')
def decodeOption(optval):
    """Decode a command-line option value to unicode, if it arrived as bytes."""
    if isinstance(optval, bytes):
        optval = optval.decode(preferredencoding())

    assert isinstance(optval, compat_str)
def formatSeconds(secs):
    """Format a duration in seconds as H:MM:SS / M:SS (branching elided here)."""
        return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
        return '%d:%02d' % (secs // 60, secs % 60)
def make_HTTPS_handler(opts):
    """Build an HTTPSHandler honoring opts.no_check_certificate (py>=3.2 only)."""
    if sys.version_info < (3,2):
        # Python's 2.x handler is very simplistic
        return compat_urllib_request.HTTPSHandler()
        # SSLv23 negotiates the best available protocol with the peer.
        context = ssl.SSLContext(ssl.PROTOCOL_SSLv23)
        context.set_default_verify_paths()
        # Disable verification only when explicitly requested.
        context.verify_mode = (ssl.CERT_NONE
                               if opts.no_check_certificate
                               else ssl.CERT_REQUIRED)
        return compat_urllib_request.HTTPSHandler(context=context)
class ExtractorError(Exception):
    """Error during info extraction."""
    def __init__(self, msg, tb=None, expected=False, cause=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
        # Network-related failures are always "expected" (not a youtube-dl bug).
        if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
            # Unexpected errors get a bug-report plea appended.
            msg = msg + u'; please report this issue on https://yt-dl.org/bug . Be sure to call youtube-dl with the --verbose flag and include its complete output. Make sure you are using the latest version; type youtube-dl -U to update.'
        super(ExtractorError, self).__init__(msg)

        self.exc_info = sys.exc_info() # preserve original exception

    def format_traceback(self):
        # Return the stored traceback as a printable string, or None.
        if self.traceback is None:
        return u''.join(traceback.format_tb(self.traceback))
class RegexNotFoundError(ExtractorError):
    """Error when a regex didn't match"""

class DownloadError(Exception):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        super(DownloadError, self).__init__(msg)
        self.exc_info = exc_info

class SameFileError(Exception):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.

class PostProcessingError(Exception):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    def __init__(self, msg):

class MaxDownloadsReached(Exception):
    """ --max-downloads limit has been reached. """

class UnavailableVideoError(Exception):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.

class ContentTooShortError(Exception):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    def __init__(self, downloaded, expected):
        # Byte counts: how much arrived vs. what Content-Length promised.
        self.downloaded = downloaded
        self.expected = expected
class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-No-Compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
            # Raw deflate stream (no zlib header) …
            return zlib.decompress(data, -zlib.MAX_WBITS)
            # … fall back to a standard zlib-wrapped stream.
            return zlib.decompress(data)

    @staticmethod
    def addinfourl_wrapper(stream, headers, url, code):
        # Newer addinfourl takes the status code directly; older needs patching.
        if hasattr(compat_urllib_request.addinfourl, 'getcode'):
            return compat_urllib_request.addinfourl(stream, headers, url, code)
        ret = compat_urllib_request.addinfourl(stream, headers, url)

    def http_request(self, req):
        # Inject the default headers, then honor the Youtubedl-* overrides.
        for h,v in std_headers.items():
        if 'Youtubedl-no-compression' in req.headers:
            if 'Accept-encoding' in req.headers:
                del req.headers['Accept-encoding']
            del req.headers['Youtubedl-no-compression']
        if 'Youtubedl-user-agent' in req.headers:
            if 'User-agent' in req.headers:
                del req.headers['User-agent']
            req.headers['User-agent'] = req.headers['Youtubedl-user-agent']
            del req.headers['Youtubedl-user-agent']

    def http_response(self, req, resp):
        # Transparently decompress gzip responses.
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
                uncompressed = io.BytesIO(gz.read())
            except IOError as original_ioerror:
                # There may be junk at the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                # Retry, trimming up to 1023 trailing bytes of junk.
                for i in range(1, 1024):
                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
                        uncompressed = io.BytesIO(gz.read())
                    raise original_ioerror
            resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        # Transparently decompress deflate responses.
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = io.BytesIO(self.deflate(resp.read()))
            resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg

    https_request = http_request
    https_response = http_response
def unified_strdate(date_str):
    """Return a string with the date in the format YYYYMMDD"""
    # Commas and trailing UTC offsets are stripped before parsing.
    date_str = date_str.replace(',',' ')
    # %z (UTC offset) is only supported in python>=3.2
    date_str = re.sub(r' (\+|-)[\d]*$', '', date_str)
    format_expressions = [
        '%Y-%m-%dT%H:%M:%SZ',
        '%Y-%m-%dT%H:%M:%S.%fZ',
        '%Y-%m-%dT%H:%M:%S.%f0Z',
    # Try each known format in turn; last successful parse wins.
    for expression in format_expressions:
            upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
def determine_ext(url, default_ext=u'unknown_video'):
    """Guess the file extension from a URL (text after the last '.' before '?')."""
    guess = url.partition(u'?')[0].rpartition(u'.')[2]
    # Only accept purely alphanumeric guesses; otherwise default_ext is used.
    if re.match(r'^[A-Za-z0-9]+$', guess):
def subtitles_filename(filename, sub_lang, sub_format):
    """Derive the subtitle file name: <media basename>.<lang>.<format>."""
    base = filename.rsplit('.', 1)[0]
    return u'.'.join([base, sub_lang, sub_format])
def date_from_str(date_str):
    """
    Return a datetime object from a string in the format YYYYMMDD or
    (now|today)[+-][0-9](day|week|month|year)(s)?"""
    today = datetime.date.today()
    if date_str == 'now'or date_str == 'today':
    # Relative dates: now/today plus or minus N days/weeks/months/years.
    match = re.match('(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
    if match is not None:
        sign = match.group('sign')
        time = int(match.group('time'))
        unit = match.group('unit')
        # months/years are approximated as 30/365 days before this point.
        delta = datetime.timedelta(**{unit: time})
    # Absolute dates must be YYYYMMDD.
    return datetime.datetime.strptime(date_str, "%Y%m%d").date()
class DateRange(object):
    """Represents a time interval between two dates"""
    def __init__(self, start=None, end=None):
        """start and end must be strings in the format accepted by date"""
        if start is not None:
            self.start = date_from_str(start)
            # Open-ended start: earliest representable date.
            self.start = datetime.datetime.min.date()
            self.end = date_from_str(end)
            # Open-ended end: latest representable date.
            self.end = datetime.datetime.max.date()
        if self.start > self.end:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)
        """Returns a range that only contains the given day"""
    def __contains__(self, date):
        """Check if the date is in the range"""
        if not isinstance(date, datetime.date):
            date = date_from_str(date)
        return self.start <= date <= self.end
        return '%s - %s' % ( self.start.isoformat(), self.end.isoformat())
    """ Returns the platform name as a compat_str """
    res = platform.platform()
    # Some platforms return bytes; normalize to text.
    if isinstance(res, bytes):
        res = res.decode(preferredencoding())

    assert isinstance(res, compat_str)

def write_string(s, out=None):
    """Write the unicode string s to out (default elided here), encoding
    it first when the stream wants bytes."""
    assert type(s) == type(u'')

    if ('b' in getattr(out, 'mode', '') or
            sys.version_info[0] < 3): # Python 2 lies about mode of sys.stderr
        s = s.encode(preferredencoding(), 'ignore')
def bytes_to_intlist(bs):
    """Convert a bytes/str value into a list of integer byte values."""
    # py3 bytes index to ints already; py2 str needs ord().
    if isinstance(bs[0], int): # Python 3
        return [ord(c) for c in bs]

def intlist_to_bytes(xs):
    """Convert a list of integer byte values back into bytes/str."""
    # On py2, chr() yields the byte string pieces to join.
    if isinstance(chr(0), bytes): # Python 2
        return ''.join([chr(x) for x in xs])
def get_cachedir(params=None):
    """Return the directory where youtube-dl caches data.

    Resolution order:
      1. the 'cachedir' entry of the *params* option dict, when present;
      2. $XDG_CACHE_HOME/youtube-dl;
      3. ~/.cache/youtube-dl.
    """
    # A mutable default argument ({}) is a classic Python pitfall; use the
    # None sentinel instead.  Behavior for all existing callers is unchanged.
    if params is None:
        params = {}
    cache_root = os.environ.get('XDG_CACHE_HOME',
                                os.path.expanduser('~/.cache'))
    return params.get('cachedir', os.path.join(cache_root, 'youtube-dl'))
# Cross-platform file locking
if sys.platform == 'win32':
    import ctypes.wintypes

    # OVERLAPPED struct required by LockFileEx/UnlockFileEx.
    class OVERLAPPED(ctypes.Structure):
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),

    kernel32 = ctypes.windll.kernel32
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    # Lock the whole file: low/high dwords of the byte-range length.
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive):
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        # Keep the OVERLAPPED pointer alive on the file object for unlock.
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)
        handle = msvcrt.get_osfhandle(f.fileno())
        # 0x2 = LOCKFILE_EXCLUSIVE_LOCK
        if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
                          whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Locking file failed: %r' % ctypes.FormatError())

        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        if not UnlockFileEx(handle, 0,
                            whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())

    # POSIX: fcntl advisory locks.
    def _lock_file(f, exclusive):
        fcntl.lockf(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)

        fcntl.lockf(f, fcntl.LOCK_UN)
class locked_file(object):
    """Context manager wrapping io.open() with cross-platform file locking."""
    def __init__(self, filename, mode, encoding=None):
        assert mode in ['r', 'a', 'w']
        self.f = io.open(filename, mode, encoding=encoding)

        # Readers share the lock; writers/appenders need exclusivity.
        exclusive = self.mode != 'r'
            _lock_file(self.f, exclusive)

    def __exit__(self, etype, value, traceback):

    def write(self, *args):
        return self.f.write(*args)

    def read(self, *args):
        return self.f.read(*args)
def shell_quote(args):
    """Return args joined into a single shell-safe unicode command line."""
    encoding = sys.getfilesystemencoding()
        # Decode byte arguments before quoting.
        if isinstance(a, bytes):
            # We may get a filename encoded with 'encodeFilename'
            a = a.decode(encoding)
        quoted_args.append(pipes.quote(a))
    return u' '.join(quoted_args)
def takewhile_inclusive(pred, seq):
    """ Like itertools.takewhile, but include the latest evaluated element
        (the first element so that Not pred(e)) """
def smuggle_url(url, data):
    """Embed extra data in a URL fragment for youtube-dl-internal use.

    The data is JSON-serialized and URL-encoded under the
    '__youtubedl_smuggle' key; unsmuggle_url() performs the reverse.
    """
    payload = {u'__youtubedl_smuggle': json.dumps(data)}
    fragment = compat_urllib_parse.urlencode(payload)
    return u'#'.join([url, fragment])
def unsmuggle_url(smug_url):
    """Split a URL produced by smuggle_url() back into (url, data)."""
    # No smuggled fragment: hand back the URL unchanged with data=None.
    if not '#__youtubedl_smuggle' in smug_url:
        return smug_url, None
    url, _, sdata = smug_url.rpartition(u'#')
    jsond = compat_parse_qs(sdata)[u'__youtubedl_smuggle'][0]
    data = json.loads(jsond)