2 # -*- coding: utf-8 -*-
28 import urllib.request as compat_urllib_request
29 except ImportError: # Python 2
30 import urllib2 as compat_urllib_request
33 import urllib.error as compat_urllib_error
34 except ImportError: # Python 2
35 import urllib2 as compat_urllib_error
38 import urllib.parse as compat_urllib_parse
39 except ImportError: # Python 2
40 import urllib as compat_urllib_parse
43 from urllib.parse import urlparse as compat_urllib_parse_urlparse
44 except ImportError: # Python 2
45 from urlparse import urlparse as compat_urllib_parse_urlparse
48 import urllib.parse as compat_urlparse
49 except ImportError: # Python 2
50 import urlparse as compat_urlparse
53 import http.cookiejar as compat_cookiejar
54 except ImportError: # Python 2
55 import cookielib as compat_cookiejar
58 import html.entities as compat_html_entities
59 except ImportError: # Python 2
60 import htmlentitydefs as compat_html_entities
63 import html.parser as compat_html_parser
64 except ImportError: # Python 2
65 import HTMLParser as compat_html_parser
68 import http.client as compat_http_client
69 except ImportError: # Python 2
70 import httplib as compat_http_client
73 from urllib.error import HTTPError as compat_HTTPError
74 except ImportError: # Python 2
75 from urllib2 import HTTPError as compat_HTTPError
78 from urllib.request import urlretrieve as compat_urlretrieve
79 except ImportError: # Python 2
80 from urllib import urlretrieve as compat_urlretrieve
84 from subprocess import DEVNULL
85 compat_subprocess_get_DEVNULL = lambda: DEVNULL
87 compat_subprocess_get_DEVNULL = lambda: open(os.path.devnull, 'w')
90 from urllib.parse import parse_qs as compat_parse_qs
91 except ImportError: # Python 2
# NOTE(review): fragmentary excerpt of a Python-2/3 compat layer — many
# interior lines are missing and each line still carries its original line
# number. Kept byte-identical; do not assume this region runs as-is.
92 # HACK: The following is the correct parse_qs implementation from cpython 3's stdlib.
93 # Python 2's version is apparently totally broken
94 def _unquote(string, encoding='utf-8', errors='replace'):
97 res = string.split('%')
104 # pct_sequence: contiguous sequence of percent-encoded bytes, decoded
111 pct_sequence += item[:2].decode('hex')
114 # This segment was just a single percent-encoded character.
115 # May be part of a sequence of code units, so delay decoding.
116 # (Stored in pct_sequence).
120 # Encountered non-percent-encoded characters. Flush the current
122 string += pct_sequence.decode(encoding, errors) + rest
125 # Flush the final pct_sequence
126 string += pct_sequence.decode(encoding, errors)
# _parse_qsl: backported query-string splitter; the `unicode` reference below
# shows this branch is Python-2 only.
129 def _parse_qsl(qs, keep_blank_values=False, strict_parsing=False,
130 encoding='utf-8', errors='replace'):
131 qs, _coerce_result = qs, unicode
132 pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')]
134 for name_value in pairs:
135 if not name_value and not strict_parsing:
137 nv = name_value.split('=', 1)
140 raise ValueError("bad query field: %r" % (name_value,))
141 # Handle case of a control-name with no equal sign
142 if keep_blank_values:
146 if len(nv[1]) or keep_blank_values:
147 name = nv[0].replace('+', ' ')
148 name = _unquote(name, encoding=encoding, errors=errors)
149 name = _coerce_result(name)
150 value = nv[1].replace('+', ' ')
151 value = _unquote(value, encoding=encoding, errors=errors)
152 value = _coerce_result(value)
153 r.append((name, value))
# compat_parse_qs: builds a name -> [values] dict from _parse_qsl pairs.
156 def compat_parse_qs(qs, keep_blank_values=False, strict_parsing=False,
157 encoding='utf-8', errors='replace'):
159 pairs = _parse_qsl(qs, keep_blank_values, strict_parsing,
160 encoding=encoding, errors=errors)
161 for name, value in pairs:
162 if name in parsed_result:
163 parsed_result[name].append(value)
165 parsed_result[name] = [value]
# compat_str / compat_chr: Python-2 fallbacks (the Python-3 branches are
# among the missing lines).
169 compat_str = unicode # Python 2
174 compat_chr = unichr # Python 2
179 from xml.etree.ElementTree import ParseError as compat_xml_parse_error
180 except ImportError: # Python 2.6
181 from xml.parsers.expat import ExpatError as compat_xml_parse_error
# compat_ord-style helper fragment: integers pass through unchanged.
184 if type(c) is int: return c
187 # This is not clearly defined otherwise
188 compiled_regex_type = type(re.compile(''))
# std_headers fragment: default HTTP headers sent with every request.
191 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0 (Chrome)',
192 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
193 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
194 'Accept-Encoding': 'gzip, deflate',
195 'Accept-Language': 'en-us,en;q=0.5',
198 def preferredencoding():
199 """Get preferred encoding.
201 Returns the best encoding scheme for the system, based on
202 locale.getpreferredencoding() and some further tweaks.
205 pref = locale.getpreferredencoding()
# Python-2 print wrapper fragment: encodes before writing to stdout.
212 if sys.version_info < (3,0):
214 print(s.encode(preferredencoding(), 'xmlcharrefreplace'))
217 assert type(s) == type(u'')
220 # In Python 2.x, json.dump expects a bytestream.
221 # In Python 3.x, it writes to a character stream
222 if sys.version_info < (3,0):
223 def write_json_file(obj, fn):
224 with open(fn, 'wb') as f:
227 def write_json_file(obj, fn):
228 with open(fn, 'w', encoding='utf-8') as f:
231 if sys.version_info >= (2,7):
def find_xpath_attr(node, xpath, key, val):
    """Locate the first element matching xpath[@key=val], or None."""
    # Restrict attribute names/values to simple character classes so the
    # interpolated predicate stays a valid ElementTree XPath expression.
    assert re.match(r'^[a-zA-Z]+$', key)
    assert re.match(r'^[a-zA-Z0-9@\s:._]*$', val)
    return node.find(u"%s[@%s='%s']" % (xpath, key, val))
# NOTE(review): fragmentary excerpt (interior lines missing, original line
# numbers embedded); kept byte-identical.
# Pre-2.7 find_xpath_attr fallback: linear scan over findall results.
239 def find_xpath_attr(node, xpath, key, val):
240 for f in node.findall(xpath):
241 if f.attrib.get(key) == val:
245 # On python2.6 the xml.etree.ElementTree.Element methods don't support
246 # the namespace parameter
247 def xpath_with_ns(path, ns_map):
248 components = [c.split(':') for c in path.split('/')]
252 replaced.append(c[0])
255 replaced.append('{%s}%s' % (ns_map[ns], tag))
256 return '/'.join(replaced)
258 def htmlentity_transform(matchobj):
259 """Transforms an HTML entity to a character.
261 This function receives a match object and is intended to be used with
262 the re.sub() function.
264 entity = matchobj.group(1)
266 # Known non-numeric HTML entity
267 if entity in compat_html_entities.name2codepoint:
268 return compat_chr(compat_html_entities.name2codepoint[entity])
270 mobj = re.match(u'(?u)#(x?\\d+)', entity)
272 numstr = mobj.group(1)
273 if numstr.startswith(u'x'):
275 numstr = u'0%s' % numstr
278 return compat_chr(int(numstr, base))
280 # Unknown entity in name, return its literal representation
281 return (u'&%s;' % entity)
283 compat_html_parser.locatestarttagend = re.compile(r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re.VERBOSE) # backport bugfix
284 class BaseHTMLParser(compat_html_parser.HTMLParser):
286 compat_html_parser.HTMLParser.__init__(self)
289 def loads(self, html):
# NOTE(review): fragmentary excerpt of the AttrParser class (interior lines
# missing, original line numbers embedded); kept byte-identical.
294 class AttrParser(BaseHTMLParser):
295 """Modified HTMLParser that isolates a tag with the specified attribute"""
296 def __init__(self, attribute, value):
297 self.attribute = attribute
302 self.watch_startpos = False
304 BaseHTMLParser.__init__(self)
# error(): best-effort recovery — skip one line of input, give up after
# more than 10 errors or once a matching tag was already started.
306 def error(self, message):
307 if self.error_count > 10 or self.started:
308 raise compat_html_parser.HTMLParseError(message, self.getpos())
309 self.rawdata = '\n'.join(self.html.split('\n')[self.getpos()[0]:]) # skip one line
310 self.error_count += 1
313 def handle_starttag(self, tag, attrs):
316 self.find_startpos(None)
317 if self.attribute in attrs and attrs[self.attribute] == self.value:
320 self.watch_startpos = True
322 if not tag in self.depth: self.depth[tag] = 0
325 def handle_endtag(self, tag):
327 if tag in self.depth: self.depth[tag] -= 1
328 if self.depth[self.result[0]] == 0:
330 self.result.append(self.getpos())
332 def find_startpos(self, x):
333 """Needed to put the start position of the result (self.result[1])
334 after the opening tag with the requested id"""
335 if self.watch_startpos:
336 self.watch_startpos = False
337 self.result.append(self.getpos())
338 handle_entityref = handle_charref = handle_data = handle_comment = \
339 handle_decl = handle_pi = unknown_decl = find_startpos
341 def get_result(self):
342 if self.result is None:
344 if len(self.result) != 3:
346 lines = self.html.split('\n')
347 lines = lines[self.result[1][0]-1:self.result[2][0]]
348 lines[0] = lines[0][self.result[1][1]:]
350 lines[-1] = lines[-1][:self.result[2][1]-self.result[1][1]]
351 lines[-1] = lines[-1][:self.result[2][1]]
352 return '\n'.join(lines).strip()
353 # Hack for https://github.com/rg3/youtube-dl/issues/662
354 if sys.version_info < (2, 7, 3):
355 AttrParser.parse_endtag = (lambda self, i:
356 i + len("</scr'+'ipt>")
357 if self.rawdata[i:].startswith("</scr'+'ipt>")
358 else compat_html_parser.HTMLParser.parse_endtag(self, i))
def get_element_by_id(id, html):
    """Return the inner content of the element whose id attribute equals *id*."""
    # Thin convenience wrapper over the generic attribute-based lookup.
    return get_element_by_attribute("id", id, html)
# NOTE(review): fragmentary excerpt (interior lines missing, original line
# numbers embedded); kept byte-identical.
364 def get_element_by_attribute(attribute, value, html):
365 """Return the content of the tag with the specified attribute in the passed HTML document"""
366 parser = AttrParser(attribute, value)
# Parse errors are tolerated here; the partial result is still returned.
369 except compat_html_parser.HTMLParseError:
371 return parser.get_result()
373 class MetaParser(BaseHTMLParser):
375 Modified HTMLParser that isolates a meta tag with the specified name
378 def __init__(self, name):
379 BaseHTMLParser.__init__(self)
384 def handle_starttag(self, tag, attrs):
388 if attrs.get('name') == self.name:
389 self.result = attrs.get('content')
391 def get_result(self):
394 def get_meta_content(name, html):
396 Return the content attribute from the meta tag with the given name attribute.
398 parser = MetaParser(name)
401 except compat_html_parser.HTMLParseError:
403 return parser.get_result()
406 def clean_html(html):
407 """Clean an HTML snippet into a readable string"""
409 html = html.replace('\n', ' ')
410 html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
411 html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
413 html = re.sub('<.*?>', '', html)
414 # Replace html entities
415 html = unescapeHTML(html)
419 def sanitize_open(filename, open_mode):
420 """Try to open the given filename, and slightly tweak it if this fails.
422 Attempts to open the given filename. If this fails, it tries to change
423 the filename slightly, step by step, until it's either able to open it
424 or it fails and raises a final exception, like the standard open()
427 It returns the tuple (stream, definitive_file_name).
431 if sys.platform == 'win32':
433 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
434 return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
435 stream = open(encodeFilename(filename), open_mode)
436 return (stream, filename)
437 except (IOError, OSError) as err:
438 if err.errno in (errno.EACCES,):
441 # In case of error, try to remove win32 forbidden chars
442 alt_filename = os.path.join(
443 re.sub(u'[/<>:"\\|\\\\?\\*]', u'#', path_part)
444 for path_part in os.path.split(filename)
446 if alt_filename == filename:
449 # An exception here should be caught in the caller
450 stream = open(encodeFilename(filename), open_mode)
451 return (stream, alt_filename)
def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp.

    Returns None when *timestr* cannot be parsed.
    """
    timestamp = None
    timetuple = email.utils.parsedate_tz(timestr)
    if timetuple is not None:
        timestamp = email.utils.mktime_tz(timetuple)
    return timestamp
# NOTE(review): fragmentary excerpt (interior lines missing, original line
# numbers embedded); kept byte-identical.
462 def sanitize_filename(s, restricted=False, is_id=False):
463 """Sanitizes a string so it could be used as part of a filename.
464 If restricted is set, use a stricter subset of allowed characters.
465 Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
467 def replace_insane(char):
468 if char == '?' or ord(char) < 32 or ord(char) == 127:
471 return '' if restricted else '\''
473 return '_-' if restricted else ' -'
474 elif char in '\\/|*<>':
476 if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
478 if restricted and ord(char) > 127:
482 result = u''.join(map(replace_insane, s))
# Collapse runs of underscores introduced by the replacements above.
484 while '__' in result:
485 result = result.replace('__', '_')
486 result = result.strip('_')
487 # Common case of "Foreign band name - English song title"
488 if restricted and result.startswith('-_'):
494 def orderedSet(iterable):
495 """ Remove all duplicates from the input iterable """
# unescapeHTML fragment: def line is among the missing lines.
506 assert type(s) == type(u'')
508 result = re.sub(u'(?u)&(.+?);', htmlentity_transform, s)
512 def encodeFilename(s, for_subprocess=False):
514 @param s The name of the file
517 assert type(s) == compat_str
519 # Python 3 has a Unicode API
520 if sys.version_info >= (3, 0):
523 if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
524 # Pass u'' directly to use Unicode APIs on Windows 2000 and up
525 # (Detecting Windows NT 4 is tricky because 'major >= 4' would
526 # match Windows 9x series as well. Besides, NT 4 is obsolete.)
527 if not for_subprocess:
530 # For subprocess calls, encode with locale encoding
531 # Refer to http://stackoverflow.com/a/9951851/35070
532 encoding = preferredencoding()
534 encoding = sys.getfilesystemencoding()
537 return s.encode(encoding, 'ignore')
540 def decodeOption(optval):
543 if isinstance(optval, bytes):
544 optval = optval.decode(preferredencoding())
546 assert isinstance(optval, compat_str)
def formatSeconds(secs):
    """Format a duration in whole seconds as 'H:MM:SS', 'M:SS' or plain 'S'."""
    if secs > 3600:
        return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
    elif secs > 60:
        return '%d:%02d' % (secs // 60, secs % 60)
    else:
        return '%d' % secs
# NOTE(review): fragmentary excerpt (interior lines missing, original line
# numbers embedded); kept byte-identical.  The SSLv3 protocol constants
# below are long-deprecated — flagged for a future security pass.
558 def make_HTTPS_handler(opts_no_check_certificate, **kwargs):
559 if sys.version_info < (3, 2):
562 class HTTPSConnectionV3(httplib.HTTPSConnection):
563 def __init__(self, *args, **kwargs):
564 httplib.HTTPSConnection.__init__(self, *args, **kwargs)
567 sock = socket.create_connection((self.host, self.port), self.timeout)
568 if getattr(self, '_tunnel_host', False):
572 self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_SSLv3)
574 self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_SSLv23)
576 class HTTPSHandlerV3(compat_urllib_request.HTTPSHandler):
577 def https_open(self, req):
578 return self.do_open(HTTPSConnectionV3, req)
579 return HTTPSHandlerV3(**kwargs)
581 context = ssl.SSLContext(ssl.PROTOCOL_SSLv3)
582 context.verify_mode = (ssl.CERT_NONE
583 if opts_no_check_certificate
584 else ssl.CERT_REQUIRED)
585 context.set_default_verify_paths()
# load_default_certs only exists on newer Pythons; older ones fall through.
587 context.load_default_certs()
588 except AttributeError:
590 return compat_urllib_request.HTTPSHandler(context=context, **kwargs)
592 class ExtractorError(Exception):
593 """Error during info extraction."""
594 def __init__(self, msg, tb=None, expected=False, cause=None):
595 """ tb, if given, is the original traceback (so that it can be printed out).
596 If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
# Network/timeout errors are treated as "expected" (user-environment issues).
599 if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
602 msg = msg + u'; please report this issue on https://yt-dl.org/bug . Be sure to call youtube-dl with the --verbose flag and include its complete output. Make sure you are using the latest version; type youtube-dl -U to update.'
603 super(ExtractorError, self).__init__(msg)
606 self.exc_info = sys.exc_info() # preserve original exception
609 def format_traceback(self):
610 if self.traceback is None:
612 return u''.join(traceback.format_tb(self.traceback))
615 class RegexNotFoundError(ExtractorError):
616 """Error when a regex didn't match"""
620 class DownloadError(Exception):
621 """Download Error exception.
623 This exception may be thrown by FileDownloader objects if they are not
624 configured to continue on errors. They will contain the appropriate
627 def __init__(self, msg, exc_info=None):
628 """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
629 super(DownloadError, self).__init__(msg)
630 self.exc_info = exc_info
633 class SameFileError(Exception):
634 """Same File exception.
636 This exception will be thrown by FileDownloader objects if they detect
637 multiple files would have to be downloaded to the same file on disk.
642 class PostProcessingError(Exception):
643 """Post Processing exception.
645 This exception may be raised by PostProcessor's .run() method to
646 indicate an error in the postprocessing task.
648 def __init__(self, msg):
651 class MaxDownloadsReached(Exception):
652 """ --max-downloads limit has been reached. """
656 class UnavailableVideoError(Exception):
657 """Unavailable Format exception.
659 This exception will be thrown when a video is requested
660 in a format that is not available for that video.
665 class ContentTooShortError(Exception):
666 """Content Too Short exception.
668 This exception may be raised by FileDownloader objects when a file they
669 download is too small for what the server announced first, indicating
670 the connection was probably interrupted.
676 def __init__(self, downloaded, expected):
677 self.downloaded = downloaded
678 self.expected = expected
# NOTE(review): fragmentary excerpt (interior lines missing, original line
# numbers embedded); kept byte-identical.
680 class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
681 """Handler for HTTP requests and responses.
683 This class, when installed with an OpenerDirector, automatically adds
684 the standard headers to every HTTP request and handles gzipped and
685 deflated responses from web servers. If compression is to be avoided in
686 a particular request, the original request in the program code only has
687 to include the HTTP header "Youtubedl-No-Compression", which will be
688 removed before making the real request.
690 Part of this code was copied from:
692 http://techknack.net/python-urllib2-handlers/
694 Andrew Rowls, the author of that code, agreed to release it to the
# deflate(): tries raw-deflate first, then zlib-wrapped deflate.
701 return zlib.decompress(data, -zlib.MAX_WBITS)
703 return zlib.decompress(data)
706 def addinfourl_wrapper(stream, headers, url, code):
707 if hasattr(compat_urllib_request.addinfourl, 'getcode'):
708 return compat_urllib_request.addinfourl(stream, headers, url, code)
709 ret = compat_urllib_request.addinfourl(stream, headers, url)
713 def http_request(self, req):
714 for h,v in std_headers.items():
# Internal pseudo-headers are stripped before the request goes out.
718 if 'Youtubedl-no-compression' in req.headers:
719 if 'Accept-encoding' in req.headers:
720 del req.headers['Accept-encoding']
721 del req.headers['Youtubedl-no-compression']
722 if 'Youtubedl-user-agent' in req.headers:
723 if 'User-agent' in req.headers:
724 del req.headers['User-agent']
725 req.headers['User-agent'] = req.headers['Youtubedl-user-agent']
726 del req.headers['Youtubedl-user-agent']
729 def http_response(self, req, resp):
732 if resp.headers.get('Content-encoding', '') == 'gzip':
733 content = resp.read()
734 gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
736 uncompressed = io.BytesIO(gz.read())
737 except IOError as original_ioerror:
738 # There may be junk add the end of the file
739 # See http://stackoverflow.com/q/4928560/35070 for details
740 for i in range(1, 1024):
742 gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
743 uncompressed = io.BytesIO(gz.read())
748 raise original_ioerror
749 resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
750 resp.msg = old_resp.msg
752 if resp.headers.get('Content-encoding', '') == 'deflate':
753 gz = io.BytesIO(self.deflate(resp.read()))
754 resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
755 resp.msg = old_resp.msg
758 https_request = http_request
759 https_response = http_response
762 def unified_strdate(date_str):
763 """Return a string with the date in the format YYYYMMDD"""
766 date_str = date_str.replace(',', ' ')
767 # %z (UTC offset) is only supported in python>=3.2
768 date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str)
769 format_expressions = [
779 '%Y-%m-%dT%H:%M:%SZ',
780 '%Y-%m-%dT%H:%M:%S.%fZ',
781 '%Y-%m-%dT%H:%M:%S.%f0Z',
783 '%Y-%m-%dT%H:%M:%S.%f',
786 for expression in format_expressions:
788 upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
# Fall back to RFC 2822 parsing when none of the formats matched.
791 if upload_date is None:
792 timetuple = email.utils.parsedate_tz(date_str)
794 upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
def determine_ext(url, default_ext=u'unknown_video'):
    """Guess a file extension from *url*: the text after the last '.' of the
    path (query string stripped).  Falls back to *default_ext* when the
    candidate is not purely alphanumeric."""
    guess = url.partition(u'?')[0].rpartition(u'.')[2]
    if re.match(r'^[A-Za-z0-9]+$', guess):
        return guess
    else:
        return default_ext
def subtitles_filename(filename, sub_lang, sub_format):
    """Build a subtitle file name: '<base>.<lang>.<format>' (original extension dropped)."""
    base = filename.rsplit('.', 1)[0]
    return u'.'.join((base, sub_lang, sub_format))
# NOTE(review): fragmentary excerpt (interior lines missing, original line
# numbers embedded); kept byte-identical.
807 def date_from_str(date_str):
809 Return a datetime object from a string in the format YYYYMMDD or
810 (now|today)[+-][0-9](day|week|month|year)(s)?"""
811 today = datetime.date.today()
812 if date_str == 'now'or date_str == 'today':
814 match = re.match('(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
815 if match is not None:
816 sign = match.group('sign')
817 time = int(match.group('time'))
820 unit = match.group('unit')
# month/year are presumably converted to day counts in the missing lines
# before being fed to timedelta — confirm against upstream.
829 delta = datetime.timedelta(**{unit: time})
831 return datetime.datetime.strptime(date_str, "%Y%m%d").date()
def hyphenate_date(date_str):
    """Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format.

    Strings that do not match 'YYYYMMDD' are returned unchanged.
    """
    match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
    if match is not None:
        return '-'.join(match.groups())
    else:
        return date_str
# NOTE(review): fragmentary excerpt (interior lines missing, original line
# numbers embedded); kept byte-identical.
842 class DateRange(object):
843 """Represents a time interval between two dates"""
844 def __init__(self, start=None, end=None):
845 """start and end must be strings in the format accepted by date"""
846 if start is not None:
847 self.start = date_from_str(start)
# Missing start/end default to the widest possible range.
849 self.start = datetime.datetime.min.date()
851 self.end = date_from_str(end)
853 self.end = datetime.datetime.max.date()
854 if self.start > self.end:
855 raise ValueError('Date range: "%s" , the start date must be before the end date' % self)
858 """Returns a range that only contains the given day"""
860 def __contains__(self, date):
861 """Check if the date is in the range"""
862 if not isinstance(date, datetime.date):
863 date = date_from_str(date)
864 return self.start <= date <= self.end
866 return '%s - %s' % ( self.start.isoformat(), self.end.isoformat())
# platform_name fragment: the def line is among the missing lines.
870 """ Returns the platform name as a compat_str """
871 res = platform.platform()
872 if isinstance(res, bytes):
873 res = res.decode(preferredencoding())
875 assert isinstance(res, compat_str)
879 def write_string(s, out=None):
882 assert type(s) == compat_str
# Python 2 reports a misleading mode for sys.stderr, hence the version check.
884 if ('b' in getattr(out, 'mode', '') or
885 sys.version_info[0] < 3): # Python 2 lies about mode of sys.stderr
886 s = s.encode(preferredencoding(), 'ignore')
889 except UnicodeEncodeError:
890 # In Windows shells, this can fail even when the codec is just charmap!?
891 # See https://wiki.python.org/moin/PrintFails#Issue
892 if sys.platform == 'win32' and hasattr(out, 'encoding'):
893 s = s.encode(out.encoding, 'ignore').decode(out.encoding)
def bytes_to_intlist(bs):
    """Return *bs* (bytes on Python 3, str on Python 2) as a list of byte values."""
    if not bs:
        return []
    if isinstance(bs[0], int):  # Python 3: indexing bytes yields ints
        return list(bs)
    else:  # Python 2: indexing a str yields 1-char strings
        return [ord(c) for c in bs]
def intlist_to_bytes(xs):
    """Inverse of bytes_to_intlist: pack a list of byte values into a bytes object."""
    if not xs:
        return b''
    if isinstance(chr(0), bytes):  # Python 2: chr() already yields bytes
        return ''.join([chr(x) for x in xs])
    else:  # Python 3
        return bytes(xs)
def get_cachedir(params=None):
    """Return the youtube-dl cache directory.

    Honors an explicit params['cachedir'] override; otherwise returns
    $XDG_CACHE_HOME/youtube-dl, with $XDG_CACHE_HOME defaulting to ~/.cache.
    """
    if params is None:  # avoid the shared mutable-default-argument pitfall
        params = {}
    cache_root = os.environ.get('XDG_CACHE_HOME',
                                os.path.expanduser('~/.cache'))
    return params.get('cachedir', os.path.join(cache_root, 'youtube-dl'))
# NOTE(review): fragmentary excerpt of the cross-platform file-locking layer
# (interior lines missing, original line numbers embedded); kept byte-identical.
925 # Cross-platform file locking
926 if sys.platform == 'win32':
927 import ctypes.wintypes
# OVERLAPPED struct mirrors the Win32 layout expected by LockFileEx.
930 class OVERLAPPED(ctypes.Structure):
932 ('Internal', ctypes.wintypes.LPVOID),
933 ('InternalHigh', ctypes.wintypes.LPVOID),
934 ('Offset', ctypes.wintypes.DWORD),
935 ('OffsetHigh', ctypes.wintypes.DWORD),
936 ('hEvent', ctypes.wintypes.HANDLE),
939 kernel32 = ctypes.windll.kernel32
940 LockFileEx = kernel32.LockFileEx
941 LockFileEx.argtypes = [
942 ctypes.wintypes.HANDLE, # hFile
943 ctypes.wintypes.DWORD, # dwFlags
944 ctypes.wintypes.DWORD, # dwReserved
945 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
946 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
947 ctypes.POINTER(OVERLAPPED) # Overlapped
949 LockFileEx.restype = ctypes.wintypes.BOOL
950 UnlockFileEx = kernel32.UnlockFileEx
951 UnlockFileEx.argtypes = [
952 ctypes.wintypes.HANDLE, # hFile
953 ctypes.wintypes.DWORD, # dwReserved
954 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
955 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
956 ctypes.POINTER(OVERLAPPED) # Overlapped
958 UnlockFileEx.restype = ctypes.wintypes.BOOL
959 whole_low = 0xffffffff
960 whole_high = 0x7fffffff
962 def _lock_file(f, exclusive):
963 overlapped = OVERLAPPED()
964 overlapped.Offset = 0
965 overlapped.OffsetHigh = 0
966 overlapped.hEvent = 0
967 f._lock_file_overlapped_p = ctypes.pointer(overlapped)
968 handle = msvcrt.get_osfhandle(f.fileno())
969 if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
970 whole_low, whole_high, f._lock_file_overlapped_p):
971 raise OSError('Locking file failed: %r' % ctypes.FormatError())
# _unlock_file fragment: the def line is among the missing lines.
974 assert f._lock_file_overlapped_p
975 handle = msvcrt.get_osfhandle(f.fileno())
976 if not UnlockFileEx(handle, 0,
977 whole_low, whole_high, f._lock_file_overlapped_p):
978 raise OSError('Unlocking file failed: %r' % ctypes.FormatError())
# POSIX branch: fcntl-based locking.
983 def _lock_file(f, exclusive):
984 fcntl.lockf(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)
987 fcntl.lockf(f, fcntl.LOCK_UN)
990 class locked_file(object):
991 def __init__(self, filename, mode, encoding=None):
992 assert mode in ['r', 'a', 'w']
993 self.f = io.open(filename, mode, encoding=encoding)
997 exclusive = self.mode != 'r'
999 _lock_file(self.f, exclusive)
1005 def __exit__(self, etype, value, traceback):
1007 _unlock_file(self.f)
1014 def write(self, *args):
1015 return self.f.write(*args)
1017 def read(self, *args):
1018 return self.f.read(*args)
1021 def shell_quote(args):
1023 encoding = sys.getfilesystemencoding()
1024 if encoding is None:
1027 if isinstance(a, bytes):
1028 # We may get a filename encoded with 'encodeFilename'
1029 a = a.decode(encoding)
1030 quoted_args.append(pipes.quote(a))
1031 return u' '.join(quoted_args)
def takewhile_inclusive(pred, seq):
    """ Like itertools.takewhile, but include the latest evaluated element
        (the first element so that Not pred(e)) """
    for e in seq:
        yield e
        # Unlike itertools.takewhile, the failing element was already yielded.
        if not pred(e):
            return
def smuggle_url(url, data):
    """Embed *data* (JSON-encoded) in the fragment of *url* for internal round-tripping."""
    payload = {u'__youtubedl_smuggle': json.dumps(data)}
    return u'%s#%s' % (url, compat_urllib_parse.urlencode(payload))
def unsmuggle_url(smug_url, default=None):
    """Inverse of smuggle_url: extract (url, data) from a smuggled URL.

    Returns (smug_url, default) when no smuggled payload is present.
    """
    if not '#__youtubedl_smuggle' in smug_url:
        return smug_url, default
    url, _, sdata = smug_url.rpartition(u'#')
    jsond = compat_parse_qs(sdata)[u'__youtubedl_smuggle'][0]
    data = json.loads(jsond)
    return url, data
# NOTE(review): fragmentary excerpt (interior lines missing, original line
# numbers embedded); kept byte-identical.  Presumably the missing lines
# handle bytes in (None, 0.0) before math.log is called — confirm upstream.
1060 def format_bytes(bytes):
1063 if type(bytes) is str:
1064 bytes = float(bytes)
1068 exponent = int(math.log(bytes, 1024.0))
1069 suffix = [u'B', u'KiB', u'MiB', u'GiB', u'TiB', u'PiB', u'EiB', u'ZiB', u'YiB'][exponent]
1070 converted = float(bytes) / float(1024 ** exponent)
1071 return u'%.2f%s' % (converted, suffix)
def str_to_int(int_str):
    """Parse an integer from a string, ignoring ',' and '.' group separators."""
    int_str = re.sub(r'[,\.]', u'', int_str)
    return int(int_str)
# NOTE(review): fragmentary excerpt (interior lines missing, original line
# numbers embedded); kept byte-identical.  The subprocess call is presumably
# 'stty size' — confirm against upstream; error fallbacks are missing here.
1079 def get_term_width():
1080 columns = os.environ.get('COLUMNS', None)
1085 sp = subprocess.Popen(
1087 stdout=subprocess.PIPE, stderr=subprocess.PIPE)
1088 out, err = sp.communicate()
1089 return int(out.split()[1])
def month_by_name(name):
    """ Return the number of a month by (locale-independently) English name,
    or None when *name* is not an English month name. """
    ENGLISH_NAMES = [
        u'January', u'February', u'March', u'April', u'May', u'June',
        u'July', u'August', u'September', u'October', u'November', u'December']
    try:
        return ENGLISH_NAMES.index(name) + 1
    except ValueError:
        return None
# NOTE(review): fragmentary excerpt (interior lines missing, original line
# numbers embedded); kept byte-identical.
1107 def fix_xml_ampersands(xml_str):
1108 """Replace all the '&' by '&amp;' in XML"""
# Negative lookahead keeps already-valid entity references untouched.
1110 r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
1115 def setproctitle(title):
1116 assert isinstance(title, compat_str)
1118 libc = ctypes.cdll.LoadLibrary("libc.so.6")
1122 buf = ctypes.create_string_buffer(len(title) + 1)
1123 buf.value = title.encode('utf-8')
# 15 is PR_SET_NAME on Linux (sets the visible process name).
1125 libc.prctl(15, ctypes.byref(buf), 0, 0, 0)
1126 except AttributeError:
1127 return # Strange libc, just skip this
def remove_start(s, start):
    """Return *s* with the prefix *start* removed, or *s* unchanged."""
    if s.startswith(start):
        return s[len(start):]
    return s
def url_basename(url):
    """Return the last path segment of *url* (empty string for an empty path)."""
    parsed_path = compat_urlparse.urlparse(url).path
    segments = parsed_path.strip(u'/').split(u'/')
    return segments[-1]
class HEADRequest(compat_urllib_request.Request):
    """Request subclass that issues HTTP HEAD, for probing a URL without
    downloading its body."""
    def get_method(self):
        return "HEAD"
def int_or_none(v, scale=1):
    """Coerce *v* to an int divided by *scale*; pass None through unchanged."""
    if v is None:
        return None
    return int(v) // scale
# NOTE(review): fragmentary excerpt (interior lines missing, original line
# numbers embedded); kept byte-identical.
1150 def parse_duration(s):
# Matches '1:02:03', '02:03', '3', and h/m/s-suffixed forms per the pattern.
1155 r'(?:(?:(?P<hours>[0-9]+)[:h])?(?P<mins>[0-9]+)[:m])?(?P<secs>[0-9]+)s?$', s)
1158 res = int(m.group('secs'))
1160 res += int(m.group('mins')) * 60
1161 if m.group('hours'):
1162 res += int(m.group('hours')) * 60 * 60
def prepend_extension(filename, ext):
    """Insert *ext* before the file's real extension: 'a.mp4' -> 'a.<ext>.mp4'."""
    base, real_ext = os.path.splitext(filename)
    return u'%s.%s%s' % (base, ext, real_ext)
# NOTE(review): fragmentary excerpt (interior lines missing, original line
# numbers embedded); kept byte-identical.
1171 def check_executable(exe, args=[]):
1172 """ Checks if the given binary is installed somewhere in PATH, and returns its name.
1173 args can be a list of arguments for a short output (like -version) """
1175 subprocess.Popen([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
1181 class PagedList(object):
1182 def __init__(self, pagefunc, pagesize):
1183 self._pagefunc = pagefunc
1184 self._pagesize = pagesize
1187 # This is only useful for tests
1188 return len(self.getslice())
# getslice: lazily fetch pages and trim the first/last page to the
# requested [start, end) window.
1190 def getslice(self, start=0, end=None):
1192 for pagenum in itertools.count(start // self._pagesize):
1193 firstid = pagenum * self._pagesize
1194 nextfirstid = pagenum * self._pagesize + self._pagesize
1195 if start >= nextfirstid:
1198 page_results = list(self._pagefunc(pagenum))
1201 start % self._pagesize
1202 if firstid <= start < nextfirstid
1206 ((end - 1) % self._pagesize) + 1
1207 if (end is not None and firstid <= end <= nextfirstid)
1210 if startv != 0 or endv is not None:
1211 page_results = page_results[startv:endv]
1212 res.extend(page_results)
1214 # A little optimization - if current page is not "full", ie. does
1215 # not contain page_size videos then we can assume that this page
1216 # is the last one - there are no more ids on further pages -
1217 # i.e. no need to query again.
1218 if len(page_results) + startv < self._pagesize:
1221 # If we got the whole page, but the next page is not interesting,
1222 # break out early as well
1223 if end == nextfirstid:
def uppercase_escape(s):
    """Replace literal '\\UXXXXXXXX' escape sequences (8 hex digits) in *s*
    with the corresponding characters."""
    return re.sub(
        r'\\U([0-9a-fA-F]{8})',
        lambda m: compat_chr(int(m.group(1), base=16)), s)
# NOTE(review): fragmentary excerpt (interior lines missing, original line
# numbers embedded); kept byte-identical.
# Feature probe: on Python 2.6-era struct, format specs must be bytes, so
# wrappers that encode the spec are installed; otherwise plain struct is used.
1234 struct.pack(u'!I', 0)
1236 # In Python 2.6 (and some 2.7 versions), struct requires a bytes argument
1237 def struct_pack(spec, *args):
1238 if isinstance(spec, compat_str):
1239 spec = spec.encode('ascii')
1240 return struct.pack(spec, *args)
1242 def struct_unpack(spec, *args):
1243 if isinstance(spec, compat_str):
1244 spec = spec.encode('ascii')
1245 return struct.unpack(spec, *args)
1247 struct_pack = struct.pack
1248 struct_unpack = struct.unpack
1251 def read_batch_urls(batch_fd):
# fixup fragment: the inner def line is among the missing lines.
1253 if not isinstance(url, compat_str):
1254 url = url.decode('utf-8', 'replace')
1255 BOM_UTF8 = u'\xef\xbb\xbf'
1256 if url.startswith(BOM_UTF8):
1257 url = url[len(BOM_UTF8):]
# Lines starting with '#', ';' or ']' are treated as comments and dropped.
1259 if url.startswith(('#', ';', ']')):
1263 with contextlib.closing(batch_fd) as fd:
1264 return [url for url in map(fixup, fd) if url]