youtube_dl/utils.py

   1 #!/usr/bin/env python
   2 # -*- coding: utf-8 -*-
   3
   4 import contextlib
   5 import ctypes
   6 import datetime
   7 import email.utils
   8 import errno
   9 import gzip
  10 import itertools
  11 import io
  12 import json
  13 import locale
  14 import math
  15 import os
  16 import pipes
  17 import platform
  18 import re
  19 import ssl
  20 import socket
  21 import struct
  22 import subprocess
  23 import sys
  24 import traceback
  25 import zlib
  26
  27 try:
  28     import urllib.request as compat_urllib_request
  29 except ImportError: # Python 2
  30     import urllib2 as compat_urllib_request
  31
  32 try:
  33     import urllib.error as compat_urllib_error
  34 except ImportError: # Python 2
  35     import urllib2 as compat_urllib_error
  36
  37 try:
  38     import urllib.parse as compat_urllib_parse
  39 except ImportError: # Python 2
  40     import urllib as compat_urllib_parse
  41
  42 try:
  43     from urllib.parse import urlparse as compat_urllib_parse_urlparse
  44 except ImportError: # Python 2
  45     from urlparse import urlparse as compat_urllib_parse_urlparse
  46
  47 try:
  48     import urllib.parse as compat_urlparse
  49 except ImportError: # Python 2
  50     import urlparse as compat_urlparse
  51
  52 try:
  53     import http.cookiejar as compat_cookiejar
  54 except ImportError: # Python 2
  55     import cookielib as compat_cookiejar
  56
  57 try:
  58     import html.entities as compat_html_entities
  59 except ImportError: # Python 2
  60     import htmlentitydefs as compat_html_entities
  61
  62 try:
  63     import html.parser as compat_html_parser
  64 except ImportError: # Python 2
  65     import HTMLParser as compat_html_parser
  66
  67 try:
  68     import http.client as compat_http_client
  69 except ImportError: # Python 2
  70     import httplib as compat_http_client
  71
  72 try:
  73     from urllib.error import HTTPError as compat_HTTPError
  74 except ImportError:  # Python 2
  75     from urllib2 import HTTPError as compat_HTTPError
  76
  77 try:
  78     from urllib.request import urlretrieve as compat_urlretrieve
  79 except ImportError:  # Python 2
  80     from urllib import urlretrieve as compat_urlretrieve
  81
  82
# compat_subprocess_get_DEVNULL() returns an object usable as a "discard
# output" target for subprocess calls.
try:
    from subprocess import DEVNULL
    compat_subprocess_get_DEVNULL = lambda: DEVNULL
except ImportError:
    # subprocess.DEVNULL is not available: open the null device instead.
    # NOTE: every call opens a new file object; nothing here closes it.
    compat_subprocess_get_DEVNULL = lambda: open(os.path.devnull, 'w')
  88
  89 try:
  90     from urllib.parse import parse_qs as compat_parse_qs
  91 except ImportError: # Python 2
  92     # HACK: The following is the correct parse_qs implementation from cpython 3's stdlib.
  93     # Python 2's version is apparently totally broken
    def _unquote(string, encoding='utf-8', errors='replace'):
        """Replace %xx escapes in string by the characters they encode.

        Consecutive escapes are accumulated as bytes and decoded together
        with the given encoding and error handler; passing None for either
        selects 'utf-8' / 'replace'. Strings without any '%' are returned
        unchanged.
        """
        if string == '':
            return string
        res = string.split('%')
        if len(res) == 1:
            return string
        if encoding is None:
            encoding = 'utf-8'
        if errors is None:
            errors = 'replace'
        # pct_sequence: contiguous sequence of percent-encoded bytes, decoded
        pct_sequence = b''
        string = res[0]
        for item in res[1:]:
            try:
                if not item:
                    raise ValueError
                # str.decode('hex') only exists on Python 2, which is the only
                # interpreter that reaches this fallback implementation.
                pct_sequence += item[:2].decode('hex')
                rest = item[2:]
                if not rest:
                    # This segment was just a single percent-encoded character.
                    # May be part of a sequence of code units, so delay decoding.
                    # (Stored in pct_sequence).
                    continue
            except ValueError:
                # Not a valid escape: keep the text literally, '%' included.
                rest = '%' + item
            # Encountered non-percent-encoded characters. Flush the current
            # pct_sequence.
            string += pct_sequence.decode(encoding, errors) + rest
            pct_sequence = b''
        if pct_sequence:
            # Flush the final pct_sequence
            string += pct_sequence.decode(encoding, errors)
        return string
 128
    def _parse_qsl(qs, keep_blank_values=False, strict_parsing=False,
                encoding='utf-8', errors='replace'):
        """Parse a query string into a list of (name, value) pairs.

        Fields are separated by '&' or ';'. '+' is turned into a space and
        %xx escapes are decoded with _unquote(). Fields with an empty value
        (or without '=') are dropped unless keep_blank_values is true.
        With strict_parsing, a field without '=' raises ValueError.
        """
        # Results are coerced to unicode (Python 2 text type).
        qs, _coerce_result = qs, unicode
        pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')]
        r = []
        for name_value in pairs:
            if not name_value and not strict_parsing:
                continue
            nv = name_value.split('=', 1)
            if len(nv) != 2:
                if strict_parsing:
                    raise ValueError("bad query field: %r" % (name_value,))
                # Handle case of a control-name with no equal sign
                if keep_blank_values:
                    nv.append('')
                else:
                    continue
            if len(nv[1]) or keep_blank_values:
                name = nv[0].replace('+', ' ')
                name = _unquote(name, encoding=encoding, errors=errors)
                name = _coerce_result(name)
                value = nv[1].replace('+', ' ')
                value = _unquote(value, encoding=encoding, errors=errors)
                value = _coerce_result(value)
                r.append((name, value))
        return r
 155
 156     def compat_parse_qs(qs, keep_blank_values=False, strict_parsing=False,
 157                 encoding='utf-8', errors='replace'):
 158         parsed_result = {}
 159         pairs = _parse_qsl(qs, keep_blank_values, strict_parsing,
 160                         encoding=encoding, errors=errors)
 161         for name, value in pairs:
 162             if name in parsed_result:
 163                 parsed_result[name].append(value)
 164             else:
 165                 parsed_result[name] = [value]
 166         return parsed_result
 167
# Text type: ``unicode`` where that name exists (Python 2), ``str`` otherwise.
try:
    compat_str = unicode # Python 2
except NameError:
    compat_str = str

# Code point to one-character text: ``unichr`` where it exists (Python 2),
# ``chr`` otherwise.
try:
    compat_chr = unichr # Python 2
except NameError:
    compat_chr = chr
 177
 178 try:
 179     from xml.etree.ElementTree import ParseError as compat_xml_parse_error
 180 except ImportError:  # Python 2.6
 181     from xml.parsers.expat import ExpatError as compat_xml_parse_error
 182
 183 def compat_ord(c):
 184     if type(c) is int: return c
 185     else: return ord(c)
 186
# The type of compiled regular expression objects is not exposed under a
# stable public name, so derive it from an actual compiled pattern.
compiled_regex_type = type(re.compile(''))

# Default HTTP request headers (browser-like User-Agent, accepted content
# types, encodings and languages).
std_headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0 (Chrome)',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
}
 197
 198 def preferredencoding():
 199     """Get preferred encoding.
 200
 201     Returns the best encoding scheme for the system, based on
 202     locale.getpreferredencoding() and some further tweaks.
 203     """
 204     try:
 205         pref = locale.getpreferredencoding()
 206         u'TEST'.encode(pref)
 207     except:
 208         pref = 'UTF-8'
 209
 210     return pref
 211
 212 if sys.version_info < (3,0):
 213     def compat_print(s):
 214         print(s.encode(preferredencoding(), 'xmlcharrefreplace'))
 215 else:
 216     def compat_print(s):
 217         assert type(s) == type(u'')
 218         print(s)
 219
 220 # In Python 2.x, json.dump expects a bytestream.
 221 # In Python 3.x, it writes to a character stream
 222 if sys.version_info < (3,0):
 223     def write_json_file(obj, fn):
 224         with open(fn, 'wb') as f:
 225             json.dump(obj, f)
 226 else:
 227     def write_json_file(obj, fn):
 228         with open(fn, 'w', encoding='utf-8') as f:
 229             json.dump(obj, f)
 230
 231 if sys.version_info >= (2,7):
 232     def find_xpath_attr(node, xpath, key, val):
 233         """ Find the xpath xpath[@key=val] """
 234         assert re.match(r'^[a-zA-Z]+$', key)
 235         assert re.match(r'^[a-zA-Z0-9@\s:._]*$', val)
 236         expr = xpath + u"[@%s='%s']" % (key, val)
 237         return node.find(expr)
 238 else:
 239     def find_xpath_attr(node, xpath, key, val):
 240         for f in node.findall(xpath):
 241             if f.attrib.get(key) == val:
 242                 return f
 243         return None
 244
 245 # On python2.6 the xml.etree.ElementTree.Element methods don't support
 246 # the namespace parameter
 247 def xpath_with_ns(path, ns_map):
 248     components = [c.split(':') for c in path.split('/')]
 249     replaced = []
 250     for c in components:
 251         if len(c) == 1:
 252             replaced.append(c[0])
 253         else:
 254             ns, tag = c
 255             replaced.append('{%s}%s' % (ns_map[ns], tag))
 256     return '/'.join(replaced)
 257
 258 def htmlentity_transform(matchobj):
 259     """Transforms an HTML entity to a character.
 260
 261     This function receives a match object and is intended to be used with
 262     the re.sub() function.
 263     """
 264     entity = matchobj.group(1)
 265
 266     # Known non-numeric HTML entity
 267     if entity in compat_html_entities.name2codepoint:
 268         return compat_chr(compat_html_entities.name2codepoint[entity])
 269
 270     mobj = re.match(u'(?u)#(x?\\d+)', entity)
 271     if mobj is not None:
 272         numstr = mobj.group(1)
 273         if numstr.startswith(u'x'):
 274             base = 16
 275             numstr = u'0%s' % numstr
 276         else:
 277             base = 10
 278         return compat_chr(int(numstr, base))
 279
 280     # Unknown entity in name, return its literal representation
 281     return (u'&%s;' % entity)
 282
# Replace the start-tag pattern stored on the standard library's HTML parser
# module. This is a module-level override, so it is visible to every user of
# that module, not only to the parser classes defined below.
compat_html_parser.locatestarttagend = re.compile(r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re.VERBOSE) # backport bugfix
 284 class BaseHTMLParser(compat_html_parser.HTMLParser):
 285     def __init(self):
 286         compat_html_parser.HTMLParser.__init__(self)
 287         self.html = None
 288
 289     def loads(self, html):
 290         self.html = html
 291         self.feed(html)
 292         self.close()
 293
class AttrParser(BaseHTMLParser):
    """Modified HTMLParser that isolates a tag with the specified attribute"""
    def __init__(self, attribute, value):
        # Attribute name / value identifying the element to isolate.
        self.attribute = attribute
        self.value = value
        # Grows to [tag, start_position, end_position] while parsing.
        self.result = None
        # True while the parser is inside the matched element.
        self.started = False
        # Open-tag counters per tag name, maintained once matching started.
        self.depth = {}
        # Set when the matching start tag is seen; cleared as soon as the
        # position following it has been recorded by find_startpos().
        self.watch_startpos = False
        # Number of parse errors recovered from so far (see error()).
        self.error_count = 0
        BaseHTMLParser.__init__(self)

    def error(self, message):
        """Try to recover from a parse error by skipping ahead.

        Raises HTMLParseError once more than 10 errors were seen, or when
        the error happens inside the matched element.
        """
        if self.error_count > 10 or self.started:
            raise compat_html_parser.HTMLParseError(message, self.getpos())
        self.rawdata = '\n'.join(self.html.split('\n')[self.getpos()[0]:]) # skip one line
        self.error_count += 1
        # NOTE(review): goahead() is an HTMLParser internal; presumably this
        # resumes parsing on the trimmed buffer - confirm.
        self.goahead(1)

    def handle_starttag(self, tag, attrs):
        attrs = dict(attrs)
        if self.started:
            # A nested tag directly follows the matched start tag: record
            # the content start position now if it is still pending.
            self.find_startpos(None)
        if self.attribute in attrs and attrs[self.attribute] == self.value:
            self.result = [tag]
            self.started = True
            self.watch_startpos = True
        if self.started:
            if not tag in self.depth: self.depth[tag] = 0
            self.depth[tag] += 1

    def handle_endtag(self, tag):
        if self.started:
            if tag in self.depth: self.depth[tag] -= 1
            # The matched element is closed when the counter for its own
            # tag name drops back to zero.
            if self.depth[self.result[0]] == 0:
                self.started = False
                self.result.append(self.getpos())

    def find_startpos(self, x):
        """Needed to put the start position of the result (self.result[1])
        after the opening tag with the requested id"""
        if self.watch_startpos:
            self.watch_startpos = False
            self.result.append(self.getpos())
    # Every other content callback only serves to capture the start position.
    handle_entityref = handle_charref = handle_data = handle_comment = \
    handle_decl = handle_pi = unknown_decl = find_startpos

    def get_result(self):
        """Return the text between the recorded start and end positions,
        stripped, or None unless both positions were found."""
        if self.result is None:
            return None
        if len(self.result) != 3:
            return None
        # Positions are used as (1-based line number, column offset) pairs.
        lines = self.html.split('\n')
        lines = lines[self.result[1][0]-1:self.result[2][0]]
        lines[0] = lines[0][self.result[1][1]:]
        if len(lines) == 1:
            # Start and end share a line: the end offset has to be shifted
            # by what was just cut from the front.
            lines[-1] = lines[-1][:self.result[2][1]-self.result[1][1]]
        lines[-1] = lines[-1][:self.result[2][1]]
        return '\n'.join(lines).strip()
# Hack for https://github.com/rg3/youtube-dl/issues/662
# On interpreters older than 2.7.3, override parse_endtag so that the literal
# text </scr'+'ipt> is stepped over instead of being handed to the stock
# end-tag parser; every other end tag is delegated unchanged.
if sys.version_info < (2, 7, 3):
    AttrParser.parse_endtag = (lambda self, i:
        i + len("</scr'+'ipt>")
        if self.rawdata[i:].startswith("</scr'+'ipt>")
        else compat_html_parser.HTMLParser.parse_endtag(self, i))
 359
 360 def get_element_by_id(id, html):
 361     """Return the content of the tag with the specified ID in the passed HTML document"""
 362     return get_element_by_attribute("id", id, html)
 363
 364 def get_element_by_attribute(attribute, value, html):
 365     """Return the content of the tag with the specified attribute in the passed HTML document"""
 366     parser = AttrParser(attribute, value)
 367     try:
 368         parser.loads(html)
 369     except compat_html_parser.HTMLParseError:
 370         pass
 371     return parser.get_result()
 372
 373 class MetaParser(BaseHTMLParser):
 374     """
 375     Modified HTMLParser that isolates a meta tag with the specified name
 376     attribute.
 377     """
 378     def __init__(self, name):
 379         BaseHTMLParser.__init__(self)
 380         self.name = name
 381         self.content = None
 382         self.result = None
 383
 384     def handle_starttag(self, tag, attrs):
 385         if tag != 'meta':
 386             return
 387         attrs = dict(attrs)
 388         if attrs.get('name') == self.name:
 389             self.result = attrs.get('content')
 390
 391     def get_result(self):
 392         return self.result
 393
 394 def get_meta_content(name, html):
 395     """
 396     Return the content attribute from the meta tag with the given name attribute.
 397     """
 398     parser = MetaParser(name)
 399     try:
 400         parser.loads(html)
 401     except compat_html_parser.HTMLParseError:
 402         pass
 403     return parser.get_result()
 404
 405
 406 def clean_html(html):
 407     """Clean an HTML snippet into a readable string"""
 408     # Newline vs <br />
 409     html = html.replace('\n', ' ')
 410     html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
 411     html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
 412     # Strip html tags
 413     html = re.sub('<.*?>', '', html)
 414     # Replace html entities
 415     html = unescapeHTML(html)
 416     return html.strip()
 417
 418
 419 def sanitize_open(filename, open_mode):
 420     """Try to open the given filename, and slightly tweak it if this fails.
 421
 422     Attempts to open the given filename. If this fails, it tries to change
 423     the filename slightly, step by step, until it's either able to open it
 424     or it fails and raises a final exception, like the standard open()
 425     function.
 426
 427     It returns the tuple (stream, definitive_file_name).
 428     """
 429     try:
 430         if filename == u'-':
 431             if sys.platform == 'win32':
 432                 import msvcrt
 433                 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
 434             return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
 435         stream = open(encodeFilename(filename), open_mode)
 436         return (stream, filename)
 437     except (IOError, OSError) as err:
 438         if err.errno in (errno.EACCES,):
 439             raise
 440
 441         # In case of error, try to remove win32 forbidden chars
 442         alt_filename = os.path.join(
 443                         re.sub(u'[/<>:"\\|\\\\?\\*]', u'#', path_part)
 444                         for path_part in os.path.split(filename)
 445                        )
 446         if alt_filename == filename:
 447             raise
 448         else:
 449             # An exception here should be caught in the caller
 450             stream = open(encodeFilename(filename), open_mode)
 451             return (stream, alt_filename)
 452
 453
 454 def timeconvert(timestr):
 455     """Convert RFC 2822 defined time string into system timestamp"""
 456     timestamp = None
 457     timetuple = email.utils.parsedate_tz(timestr)
 458     if timetuple is not None:
 459         timestamp = email.utils.mktime_tz(timetuple)
 460     return timestamp
 461
 462 def sanitize_filename(s, restricted=False, is_id=False):
 463     """Sanitizes a string so it could be used as part of a filename.
 464     If restricted is set, use a stricter subset of allowed characters.
 465     Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
 466     """
 467     def replace_insane(char):
 468         if char == '?' or ord(char) < 32 or ord(char) == 127:
 469             return ''
 470         elif char == '"':
 471             return '' if restricted else '\''
 472         elif char == ':':
 473             return '_-' if restricted else ' -'
 474         elif char in '\\/|*<>':
 475             return '_'
 476         if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
 477             return '_'
 478         if restricted and ord(char) > 127:
 479             return '_'
 480         return char
 481
 482     result = u''.join(map(replace_insane, s))
 483     if not is_id:
 484         while '__' in result:
 485             result = result.replace('__', '_')
 486         result = result.strip('_')
 487         # Common case of "Foreign band name - English song title"
 488         if restricted and result.startswith('-_'):
 489             result = result[2:]
 490         if not result:
 491             result = '_'
 492     return result
 493
 494 def orderedSet(iterable):
 495     """ Remove all duplicates from the input iterable """
 496     res = []
 497     for el in iterable:
 498         if el not in res:
 499             res.append(el)
 500     return res
 501
 502 def unescapeHTML(s):
 503     """
 504     @param s a string
 505     """
 506     assert type(s) == type(u'')
 507
 508     result = re.sub(u'(?u)&(.+?);', htmlentity_transform, s)
 509     return result
 510
 511
 512 def encodeFilename(s, for_subprocess=False):
 513     """
 514     @param s The name of the file
 515     """
 516
 517     assert type(s) == compat_str
 518
 519     # Python 3 has a Unicode API
 520     if sys.version_info >= (3, 0):
 521         return s
 522
 523     if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
 524         # Pass u'' directly to use Unicode APIs on Windows 2000 and up
 525         # (Detecting Windows NT 4 is tricky because 'major >= 4' would
 526         # match Windows 9x series as well. Besides, NT 4 is obsolete.)
 527         if not for_subprocess:
 528             return s
 529         else:
 530             # For subprocess calls, encode with locale encoding
 531             # Refer to http://stackoverflow.com/a/9951851/35070
 532             encoding = preferredencoding()
 533     else:
 534         encoding = sys.getfilesystemencoding()
 535     if encoding is None:
 536         encoding = 'utf-8'
 537     return s.encode(encoding, 'ignore')
 538
 539
 540 def decodeOption(optval):
 541     if optval is None:
 542         return optval
 543     if isinstance(optval, bytes):
 544         optval = optval.decode(preferredencoding())
 545
 546     assert isinstance(optval, compat_str)
 547     return optval
 548
 549 def formatSeconds(secs):
 550     if secs > 3600:
 551         return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
 552     elif secs > 60:
 553         return '%d:%02d' % (secs // 60, secs % 60)
 554     else:
 555         return '%d' % secs
 556
 557
 558 def make_HTTPS_handler(opts_no_check_certificate, **kwargs):
 559     if sys.version_info < (3, 2):
 560         import httplib
 561
 562         class HTTPSConnectionV3(httplib.HTTPSConnection):
 563             def __init__(self, *args, **kwargs):
 564                 httplib.HTTPSConnection.__init__(self, *args, **kwargs)
 565
 566             def connect(self):
 567                 sock = socket.create_connection((self.host, self.port), self.timeout)
 568                 if getattr(self, '_tunnel_host', False):
 569                     self.sock = sock
 570                     self._tunnel()
 571                 try:
 572                     self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_SSLv3)
 573                 except ssl.SSLError:
 574                     self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_SSLv23)
 575
 576         class HTTPSHandlerV3(compat_urllib_request.HTTPSHandler):
 577             def https_open(self, req):
 578                 return self.do_open(HTTPSConnectionV3, req)
 579         return HTTPSHandlerV3(**kwargs)
 580     else:
 581         context = ssl.SSLContext(ssl.PROTOCOL_SSLv3)
 582         context.verify_mode = (ssl.CERT_NONE
 583                                if opts_no_check_certificate
 584                                else ssl.CERT_REQUIRED)
 585         context.set_default_verify_paths()
 586         try:
 587             context.load_default_certs()
 588         except AttributeError:
 589             pass  # Python < 3.4
 590         return compat_urllib_request.HTTPSHandler(context=context, **kwargs)
 591
 592 class ExtractorError(Exception):
 593     """Error during info extraction."""
 594     def __init__(self, msg, tb=None, expected=False, cause=None):
 595         """ tb, if given, is the original traceback (so that it can be printed out).
 596         If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
 597         """
 598
 599         if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
 600             expected = True
 601         if not expected:
 602             msg = msg + u'; please report this issue on https://yt-dl.org/bug . Be sure to call youtube-dl with the --verbose flag and include its complete output. Make sure you are using the latest version; type  youtube-dl -U  to update.'
 603         super(ExtractorError, self).__init__(msg)
 604
 605         self.traceback = tb
 606         self.exc_info = sys.exc_info()  # preserve original exception
 607         self.cause = cause
 608
 609     def format_traceback(self):
 610         if self.traceback is None:
 611             return None
 612         return u''.join(traceback.format_tb(self.traceback))
 613
 614
 615 class RegexNotFoundError(ExtractorError):
 616     """Error when a regex didn't match"""
 617     pass
 618
 619
 620 class DownloadError(Exception):
 621     """Download Error exception.
 622
 623     This exception may be thrown by FileDownloader objects if they are not
 624     configured to continue on errors. They will contain the appropriate
 625     error message.
 626     """
 627     def __init__(self, msg, exc_info=None):
 628         """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
 629         super(DownloadError, self).__init__(msg)
 630         self.exc_info = exc_info
 631
 632
 633 class SameFileError(Exception):
 634     """Same File exception.
 635
 636     This exception will be thrown by FileDownloader objects if they detect
 637     multiple files would have to be downloaded to the same file on disk.
 638     """
 639     pass
 640
 641
 642 class PostProcessingError(Exception):
 643     """Post Processing exception.
 644
 645     This exception may be raised by PostProcessor's .run() method to
 646     indicate an error in the postprocessing task.
 647     """
 648     def __init__(self, msg):
 649         self.msg = msg
 650
 651 class MaxDownloadsReached(Exception):
 652     """ --max-downloads limit has been reached. """
 653     pass
 654
 655
 656 class UnavailableVideoError(Exception):
 657     """Unavailable Format exception.
 658
 659     This exception will be thrown when a video is requested
 660     in a format that is not available for that video.
 661     """
 662     pass
 663
 664
 665 class ContentTooShortError(Exception):
 666     """Content Too Short exception.
 667
 668     This exception may be raised by FileDownloader objects when a file they
 669     download is too small for what the server announced first, indicating
 670     the connection was probably interrupted.
 671     """
 672     # Both in bytes
 673     downloaded = None
 674     expected = None
 675
 676     def __init__(self, downloaded, expected):
 677         self.downloaded = downloaded
 678         self.expected = expected
 679
 680 class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
 681     """Handler for HTTP requests and responses.
 682
 683     This class, when installed with an OpenerDirector, automatically adds
 684     the standard headers to every HTTP request and handles gzipped and
 685     deflated responses from web servers. If compression is to be avoided in
 686     a particular request, the original request in the program code only has
 687     to include the HTTP header "Youtubedl-No-Compression", which will be
 688     removed before making the real request.
 689
 690     Part of this code was copied from:
 691
 692     http://techknack.net/python-urllib2-handlers/
 693
 694     Andrew Rowls, the author of that code, agreed to release it to the
 695     public domain.
 696     """
 697
 698     @staticmethod
 699     def deflate(data):
 700         try:
 701             return zlib.decompress(data, -zlib.MAX_WBITS)
 702         except zlib.error:
 703             return zlib.decompress(data)
 704
 705     @staticmethod
 706     def addinfourl_wrapper(stream, headers, url, code):
 707         if hasattr(compat_urllib_request.addinfourl, 'getcode'):
 708             return compat_urllib_request.addinfourl(stream, headers, url, code)
 709         ret = compat_urllib_request.addinfourl(stream, headers, url)
 710         ret.code = code
 711         return ret
 712
 713     def http_request(self, req):
 714         for h,v in std_headers.items():
 715             if h in req.headers:
 716                 del req.headers[h]
 717             req.add_header(h, v)
 718         if 'Youtubedl-no-compression' in req.headers:
 719             if 'Accept-encoding' in req.headers:
 720                 del req.headers['Accept-encoding']
 721             del req.headers['Youtubedl-no-compression']
 722         if 'Youtubedl-user-agent' in req.headers:
 723             if 'User-agent' in req.headers:
 724                 del req.headers['User-agent']
 725             req.headers['User-agent'] = req.headers['Youtubedl-user-agent']
 726             del req.headers['Youtubedl-user-agent']
 727         return req
 728
 729     def http_response(self, req, resp):
 730         old_resp = resp
 731         # gzip
 732         if resp.headers.get('Content-encoding', '') == 'gzip':
 733             content = resp.read()
 734             gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
 735             try:
 736                 uncompressed = io.BytesIO(gz.read())
 737             except IOError as original_ioerror:
 738                 # There may be junk add the end of the file
 739                 # See http://stackoverflow.com/q/4928560/35070 for details
 740                 for i in range(1, 1024):
 741                     try:
 742                         gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
 743                         uncompressed = io.BytesIO(gz.read())
 744                     except IOError:
 745                         continue
 746                     break
 747                 else:
 748                     raise original_ioerror
 749             resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
 750             resp.msg = old_resp.msg
 751         # deflate
 752         if resp.headers.get('Content-encoding', '') == 'deflate':
 753             gz = io.BytesIO(self.deflate(resp.read()))
 754             resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
 755             resp.msg = old_resp.msg
 756         return resp
 757
 758     https_request = http_request
 759     https_response = http_response
 760
 761
 762 def unified_strdate(date_str):
 763     """Return a string with the date in the format YYYYMMDD"""
 764     upload_date = None
 765     #Replace commas
 766     date_str = date_str.replace(',', ' ')
 767     # %z (UTC offset) is only supported in python>=3.2
 768     date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str)
 769     format_expressions = [
 770         '%d %B %Y',
 771         '%d %b %Y',
 772         '%B %d %Y',
 773         '%b %d %Y',
 774         '%Y-%m-%d',
 775         '%d.%m.%Y',
 776         '%d/%m/%Y',
 777         '%Y/%m/%d %H:%M:%S',
 778         '%Y-%m-%d %H:%M:%S',
 779         '%d.%m.%Y %H:%M',
 780         '%Y-%m-%dT%H:%M:%SZ',
 781         '%Y-%m-%dT%H:%M:%S.%fZ',
 782         '%Y-%m-%dT%H:%M:%S.%f0Z',
 783         '%Y-%m-%dT%H:%M:%S',
 784         '%Y-%m-%dT%H:%M:%S.%f',
 785         '%Y-%m-%dT%H:%M',
 786     ]
 787     for expression in format_expressions:
 788         try:
 789             upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
 790         except ValueError:
 791             pass
 792     if upload_date is None:
 793         timetuple = email.utils.parsedate_tz(date_str)
 794         if timetuple:
 795             upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
 796     return upload_date
 797
 798 def determine_ext(url, default_ext=u'unknown_video'):
 799     guess = url.partition(u'?')[0].rpartition(u'.')[2]
 800     if re.match(r'^[A-Za-z0-9]+$', guess):
 801         return guess
 802     else:
 803         return default_ext
 804
 805 def subtitles_filename(filename, sub_lang, sub_format):
 806     return filename.rsplit('.', 1)[0] + u'.' + sub_lang + u'.' + sub_format
 807
 808 def date_from_str(date_str):
 809     """
 810     Return a datetime object from a string in the format YYYYMMDD or
 811     (now|today)[+-][0-9](day|week|month|year)(s)?"""
 812     today = datetime.date.today()
 813     if date_str == 'now'or date_str == 'today':
 814         return today
 815     match = re.match('(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
 816     if match is not None:
 817         sign = match.group('sign')
 818         time = int(match.group('time'))
 819         if sign == '-':
 820             time = -time
 821         unit = match.group('unit')
 822         #A bad aproximation?
 823         if unit == 'month':
 824             unit = 'day'
 825             time *= 30
 826         elif unit == 'year':
 827             unit = 'day'
 828             time *= 365
 829         unit += 's'
 830         delta = datetime.timedelta(**{unit: time})
 831         return today + delta
 832     return datetime.datetime.strptime(date_str, "%Y%m%d").date()
 833
 834 def hyphenate_date(date_str):
 835     """
 836     Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
 837     match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
 838     if match is not None:
 839         return '-'.join(match.groups())
 840     else:
 841         return date_str
 842
 843 class DateRange(object):
 844     """Represents a time interval between two dates"""
 845     def __init__(self, start=None, end=None):
 846         """start and end must be strings in the format accepted by date"""
 847         if start is not None:
 848             self.start = date_from_str(start)
 849         else:
 850             self.start = datetime.datetime.min.date()
 851         if end is not None:
 852             self.end = date_from_str(end)
 853         else:
 854             self.end = datetime.datetime.max.date()
 855         if self.start > self.end:
 856             raise ValueError('Date range: "%s" , the start date must be before the end date' % self)
 857     @classmethod
 858     def day(cls, day):
 859         """Returns a range that only contains the given day"""
 860         return cls(day,day)
 861     def __contains__(self, date):
 862         """Check if the date is in the range"""
 863         if not isinstance(date, datetime.date):
 864             date = date_from_str(date)
 865         return self.start <= date <= self.end
 866     def __str__(self):
 867         return '%s - %s' % ( self.start.isoformat(), self.end.isoformat())
 868
 869
 870 def platform_name():
 871     """ Returns the platform name as a compat_str """
 872     res = platform.platform()
 873     if isinstance(res, bytes):
 874         res = res.decode(preferredencoding())
 875
 876     assert isinstance(res, compat_str)
 877     return res
 878
 879
 880 def write_string(s, out=None):
 881     if out is None:
 882         out = sys.stderr
 883     assert type(s) == compat_str
 884
 885     if ('b' in getattr(out, 'mode', '') or
 886             sys.version_info[0] < 3):  # Python 2 lies about mode of sys.stderr
 887         s = s.encode(preferredencoding(), 'ignore')
 888     try:
 889         out.write(s)
 890     except UnicodeEncodeError:
 891         # In Windows shells, this can fail even when the codec is just charmap!?
 892         # See https://wiki.python.org/moin/PrintFails#Issue
 893         if sys.platform == 'win32' and hasattr(out, 'encoding'):
 894             s = s.encode(out.encoding, 'ignore').decode(out.encoding)
 895             out.write(s)
 896         else:
 897             raise
 898
 899     out.flush()
 900
 901
 902 def bytes_to_intlist(bs):
 903     if not bs:
 904         return []
 905     if isinstance(bs[0], int):  # Python 3
 906         return list(bs)
 907     else:
 908         return [ord(c) for c in bs]
 909
 910
 911 def intlist_to_bytes(xs):
 912     if not xs:
 913         return b''
 914     if isinstance(chr(0), bytes):  # Python 2
 915         return ''.join([chr(x) for x in xs])
 916     else:
 917         return bytes(xs)
 918
 919
 920 def get_cachedir(params={}):
 921     cache_root = os.environ.get('XDG_CACHE_HOME',
 922                                 os.path.expanduser('~/.cache'))
 923     return params.get('cachedir', os.path.join(cache_root, 'youtube-dl'))
 924
 925
# Cross-platform file locking.
# Both branches define the same two helpers, _lock_file(f, exclusive) and
# _unlock_file(f), which the rest of the module calls.
if sys.platform == 'win32':
    import ctypes.wintypes
    import msvcrt

    # ctypes layout of the OVERLAPPED argument that LockFileEx and
    # UnlockFileEx take as their last parameter (see argtypes below).
    class OVERLAPPED(ctypes.Structure):
        _fields_ = [
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),
        ]

    kernel32 = ctypes.windll.kernel32
    # Declare the prototypes explicitly so ctypes converts the arguments
    # and the BOOL result correctly.
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    # Low and high DWORDs of the byte-range length passed to both calls
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive):
        """Lock the open file f; raise OSError if LockFileEx fails."""
        overlapped = OVERLAPPED()
        # The locked range starts at offset 0
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        # Stored on the file object so that _unlock_file can pass the
        # same pointer again
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)
        handle = msvcrt.get_osfhandle(f.fileno())
        # dwFlags 0x2 is LOCKFILE_EXCLUSIVE_LOCK in the Win32 API
        if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
                          whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Locking file failed: %r' % ctypes.FormatError())

    def _unlock_file(f):
        """Release a lock taken by _lock_file; raise OSError on failure."""
        # Only files previously passed to _lock_file carry this attribute
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        if not UnlockFileEx(handle, 0,
                            whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())

else:
    import fcntl

    def _lock_file(f, exclusive):
        """Lock f with fcntl.lockf: LOCK_EX if exclusive, else LOCK_SH."""
        fcntl.lockf(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)

    def _unlock_file(f):
        """Release the fcntl.lockf lock held on f."""
        fcntl.lockf(f, fcntl.LOCK_UN)
 989
 990
 991 class locked_file(object):
 992     def __init__(self, filename, mode, encoding=None):
 993         assert mode in ['r', 'a', 'w']
 994         self.f = io.open(filename, mode, encoding=encoding)
 995         self.mode = mode
 996
 997     def __enter__(self):
 998         exclusive = self.mode != 'r'
 999         try:
1000             _lock_file(self.f, exclusive)
1001         except IOError:
1002             self.f.close()
1003             raise
1004         return self
1005
1006     def __exit__(self, etype, value, traceback):
1007         try:
1008             _unlock_file(self.f)
1009         finally:
1010             self.f.close()
1011
1012     def __iter__(self):
1013         return iter(self.f)
1014
1015     def write(self, *args):
1016         return self.f.write(*args)
1017
1018     def read(self, *args):
1019         return self.f.read(*args)
1020
1021
def shell_quote(args):
    """Return args as a single shell-escaped command line string.

    Byte-string arguments (e.g. produced by 'encodeFilename') are decoded
    with the file system encoding, falling back to UTF-8, before quoting.
    The quoted arguments are joined with single spaces.
    """
    # pipes.quote is undocumented and the pipes module no longer exists in
    # Python 3.13; shlex.quote is the supported equivalent where available.
    try:
        from shlex import quote
    except ImportError:  # Python 2
        from pipes import quote

    encoding = sys.getfilesystemencoding()
    if encoding is None:
        encoding = 'utf-8'
    quoted_args = []
    for a in args:
        if isinstance(a, bytes):
            # We may get a filename encoded with 'encodeFilename'
            a = a.decode(encoding)
        quoted_args.append(quote(a))
    return u' '.join(quoted_args)
1033
1034
def takewhile_inclusive(pred, seq):
    """ Like itertools.takewhile, but also yield the last evaluated element
        (the first element e for which pred(e) is false) """
    for item in seq:
        yield item
        if pred(item):
            continue
        break
1042
1043
def smuggle_url(url, data):
    """ Attach JSON-serializable data to a URL for internal use.

    The data is JSON-encoded, url-encoded under the key
    '__youtubedl_smuggle' and appended to url after a '#'. """

    payload = {u'__youtubedl_smuggle': json.dumps(data)}
    fragment = compat_urllib_parse.urlencode(payload)
    return url + u'#' + fragment
1050
1051
def unsmuggle_url(smug_url, default=None):
    """ Split a URL produced by smuggle_url into (url, data).

    When smug_url carries no smuggled data, (smug_url, default) is
    returned instead. """
    if '#__youtubedl_smuggle' not in smug_url:
        return smug_url, default
    plain_url, _sep, fragment = smug_url.rpartition(u'#')
    encoded = compat_parse_qs(fragment)[u'__youtubedl_smuggle'][0]
    return plain_url, json.loads(encoded)
1059
1060
def format_bytes(bytes):
    """Format a byte count as a human-readable string with a binary suffix.

    bytes may be a number, a str holding a number, or None.  None yields
    u'N/A'; everything else yields two decimals followed by the suffix,
    e.g. u'1.50KiB'.  Values below one byte use 'B' and values beyond the
    largest known suffix are expressed in 'YiB'.
    """
    if bytes is None:
        return u'N/A'
    if isinstance(bytes, str):
        bytes = float(bytes)
    suffixes = [u'B', u'KiB', u'MiB', u'GiB', u'TiB', u'PiB', u'EiB', u'ZiB', u'YiB']
    if bytes == 0.0:
        exponent = 0
    else:
        exponent = int(math.log(bytes, 1024.0))
        # Clamp to the suffix table: a negative exponent (tiny fractions)
        # would index from the end, a huge one would run past it.
        exponent = max(0, min(exponent, len(suffixes) - 1))
    converted = float(bytes) / float(1024 ** exponent)
    return u'%.2f%s' % (converted, suffixes[exponent])
1073
1074
def str_to_int(int_str):
    """Parse an integer written with ',' or '.' separators, e.g. '1,234'."""
    digits_only = re.sub(r'[,\.]', u'', int_str)
    return int(digits_only)
1078
1079
def get_term_width():
    """Return the terminal width in columns, or None if it is unknown.

    A numeric COLUMNS environment variable takes precedence; otherwise
    the second field printed by 'stty size' is used.
    """
    columns = os.environ.get('COLUMNS', None)
    if columns:
        try:
            return int(columns)
        except ValueError:
            pass  # Malformed COLUMNS: fall back to asking stty

    try:
        sp = subprocess.Popen(
            ['stty', 'size'],
            stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        out, _ = sp.communicate()
        return int(out.split()[1])
    except (IOError, OSError, ValueError, IndexError):
        # Best effort: stty may be missing, fail without a terminal, or
        # print something unexpected.  Unlike a bare except, this lets
        # KeyboardInterrupt and SystemExit propagate.
        pass
    return None
1094
1095
def month_by_name(name):
    """ Map a full English month name to its number (1-12), independent of
    the locale; return None for anything else """

    ENGLISH_NAMES = [
        u'January', u'February', u'March', u'April', u'May', u'June',
        u'July', u'August', u'September', u'October', u'November', u'December']
    for number, month in enumerate(ENGLISH_NAMES, 1):
        if month == name:
            return number
    return None
1106
1107
def fix_xml_ampersands(xml_str):
    """Escape every '&' that does not already start an entity as '&amp;'"""
    bare_ampersand = re.compile(
        r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)')
    return bare_ampersand.sub(u'&amp;', xml_str)
1114
1115
def setproctitle(title):
    """Try to set the process name to title through libc's prctl.

    title must be a compat_str.  This is a silent no-op when libc.so.6
    cannot be loaded or does not expose prctl.
    """
    assert isinstance(title, compat_str)
    try:
        libc = ctypes.cdll.LoadLibrary("libc.so.6")
    except OSError:
        return
    title_bytes = title.encode('utf-8')
    # Size the buffer by the encoded length: non-ASCII characters occupy
    # several bytes, so the character count would be too small.
    buf = ctypes.create_string_buffer(len(title_bytes) + 1)
    buf.value = title_bytes
    try:
        # 15 is PR_SET_NAME in Linux's <sys/prctl.h>
        libc.prctl(15, ctypes.byref(buf), 0, 0, 0)
    except AttributeError:
        return  # Strange libc, just skip this
1129
1130
def remove_start(s, start):
    """Return s without the prefix start; s is unchanged if it lacks it."""
    return s[len(start):] if s.startswith(start) else s
1135
1136
def url_basename(url):
    """Return the last component of the path part of url."""
    trimmed = compat_urlparse.urlparse(url).path.strip(u'/')
    return trimmed.rpartition(u'/')[2]
1140
1141
class HEADRequest(compat_urllib_request.Request):
    """Request whose HTTP method is always reported as HEAD."""

    def get_method(self):
        return 'HEAD'
1145
1146
def int_or_none(v, scale=1):
    """Return int(v) floor-divided by scale, or None when v is None."""
    if v is None:
        return None
    return int(v) // scale
1149
1150
def parse_duration(s):
    """Parse a duration such as '1:02:03', '2m3s' or '42' into seconds.

    Returns None when s is None or does not match the expected format.
    """
    if s is None:
        return None

    m = re.match(
        r'(?:(?:(?P<hours>[0-9]+)[:h])?(?P<mins>[0-9]+)[:m])?(?P<secs>[0-9]+)s?$', s)
    if m is None:
        return None
    hours, mins, secs = m.group('hours', 'mins', 'secs')
    total = int(secs)
    if mins:
        total += 60 * int(mins)
        # The pattern only allows hours together with minutes
        if hours:
            total += 60 * 60 * int(hours)
    return total
1165
1166
def prepend_extension(filename, ext):
    """Insert ext in front of the extension of filename.

    E.g. ('abc.ext', 'temp') gives 'abc.temp.ext'.
    """
    stem, suffix = os.path.splitext(filename)
    return u'{0}.{1}{2}'.format(stem, ext, suffix)
1170
1171
def check_executable(exe, args=[]):
    """ Return exe if launching it succeeds (e.g. it is found in PATH),
    False if starting it raises OSError.
    args may hold arguments that keep the output short (like -version) """
    command = [exe] + args
    try:
        proc = subprocess.Popen(
            command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        proc.communicate()
    except OSError:
        return False
    return exe
1180
1181
class PagedList(object):
    """List-like view over results that are fetched one page at a time."""

    def __init__(self, pagefunc, pagesize):
        # pagefunc(pagenum) must return an iterable with the entries of
        # that page; pagesize is the number of entries on a full page.
        self._pagefunc = pagefunc
        self._pagesize = pagesize

    def __len__(self):
        # This is only useful for tests
        # (it fetches pages through getslice() just to count the entries)
        return len(self.getslice())

    def getslice(self, start=0, end=None):
        """Return the entries from index start up to (excluding) index end.

        With end=None, pages are fetched until one comes back shorter
        than pagesize.  The result is a plain list.
        """
        res = []
        # Begin with the page that contains index start
        for pagenum in itertools.count(start // self._pagesize):
            # Global indices of the first entry of this page and of the next
            firstid = pagenum * self._pagesize
            nextfirstid = pagenum * self._pagesize + self._pagesize
            # Skip pages that lie entirely before start
            if start >= nextfirstid:
                continue

            page_results = list(self._pagefunc(pagenum))

            # Offset of start within this page (0 unless start is on it)
            startv = (
                start % self._pagesize
                if firstid <= start < nextfirstid
                else 0)

            # Exclusive offset of end within this page, or None when the
            # slice does not end on this page
            endv = (
                ((end - 1) % self._pagesize) + 1
                if (end is not None and firstid <= end <= nextfirstid)
                else None)

            if startv != 0 or endv is not None:
                page_results = page_results[startv:endv]
            res.extend(page_results)

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            if len(page_results) + startv < self._pagesize:
                break

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == nextfirstid:
                break
        return res
1227
1228
def uppercase_escape(s):
    """Replace each literal '\\UXXXXXXXX' escape (8 hex digits) in s by the
    character with that code point."""
    def decode_escape(match):
        return compat_chr(int(match.group(1), 16))

    return re.sub(r'\\U([0-9a-fA-F]{8})', decode_escape, s)
1233
# Define struct_pack / struct_unpack so that callers can always pass a
# text (compat_str) format string.  Probe once whether struct accepts one.
try:
    struct.pack(u'!I', 0)
except TypeError:
    # In Python 2.6 (and some 2.7 versions), struct requires a bytes argument
    def struct_pack(spec, *args):
        """struct.pack, with a compat_str spec encoded to ASCII bytes first."""
        if isinstance(spec, compat_str):
            spec = spec.encode('ascii')
        return struct.pack(spec, *args)

    def struct_unpack(spec, *args):
        """struct.unpack, with a compat_str spec encoded to ASCII bytes first."""
        if isinstance(spec, compat_str):
            spec = spec.encode('ascii')
        return struct.unpack(spec, *args)
else:
    # Text format strings work natively: use the struct functions as they are
    struct_pack = struct.pack
    struct_unpack = struct.unpack
1250
1251
def read_batch_urls(batch_fd):
    """Read the URLs listed in a batch file object and close it.

    Each line is decoded as UTF-8 if it is not text yet, freed of a
    leading byte order mark and of surrounding whitespace.  Empty lines
    and lines starting with '#', ';' or ']' are dropped.  Returns the
    remaining URLs as a list.
    """
    # A UTF-8 BOM shows up as u'\ufeff' when the data was decoded as UTF-8,
    # or as the three characters u'\xef\xbb\xbf' when its raw bytes were
    # decoded with a single-byte codec such as latin-1.
    BOMS = (u'\ufeff', u'\xef\xbb\xbf')

    def fixup(url):
        if not isinstance(url, compat_str):
            url = url.decode('utf-8', 'replace')
        for bom in BOMS:
            if url.startswith(bom):
                url = url[len(bom):]
                break
        url = url.strip()
        if url.startswith(('#', ';', ']')):
            return False
        return url

    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]