2 # -*- coding: utf-8 -*-
27 import xml.etree.ElementTree
31 import urllib.request as compat_urllib_request
32 except ImportError: # Python 2
33 import urllib2 as compat_urllib_request
36 import urllib.error as compat_urllib_error
37 except ImportError: # Python 2
38 import urllib2 as compat_urllib_error
41 import urllib.parse as compat_urllib_parse
42 except ImportError: # Python 2
43 import urllib as compat_urllib_parse
46 from urllib.parse import urlparse as compat_urllib_parse_urlparse
47 except ImportError: # Python 2
48 from urlparse import urlparse as compat_urllib_parse_urlparse
51 import urllib.parse as compat_urlparse
52 except ImportError: # Python 2
53 import urlparse as compat_urlparse
56 import http.cookiejar as compat_cookiejar
57 except ImportError: # Python 2
58 import cookielib as compat_cookiejar
61 import html.entities as compat_html_entities
62 except ImportError: # Python 2
63 import htmlentitydefs as compat_html_entities
66 import html.parser as compat_html_parser
67 except ImportError: # Python 2
68 import HTMLParser as compat_html_parser
71 import http.client as compat_http_client
72 except ImportError: # Python 2
73 import httplib as compat_http_client
76 from urllib.error import HTTPError as compat_HTTPError
77 except ImportError: # Python 2
78 from urllib2 import HTTPError as compat_HTTPError
81 from urllib.request import urlretrieve as compat_urlretrieve
82 except ImportError: # Python 2
83 from urllib import urlretrieve as compat_urlretrieve
87 from subprocess import DEVNULL
88 compat_subprocess_get_DEVNULL = lambda: DEVNULL
90 compat_subprocess_get_DEVNULL = lambda: open(os.path.devnull, 'w')
93 from urllib.parse import parse_qs as compat_parse_qs
94 except ImportError: # Python 2
95 # HACK: The following is the correct parse_qs implementation from cpython 3's stdlib.
96 # Python 2's version is apparently totally broken
97 def _unquote(string, encoding='utf-8', errors='replace'):
100 res = string.split('%')
107 # pct_sequence: contiguous sequence of percent-encoded bytes, decoded
114 pct_sequence += item[:2].decode('hex')
117 # This segment was just a single percent-encoded character.
118 # May be part of a sequence of code units, so delay decoding.
119 # (Stored in pct_sequence).
123 # Encountered non-percent-encoded characters. Flush the current
125 string += pct_sequence.decode(encoding, errors) + rest
128 # Flush the final pct_sequence
129 string += pct_sequence.decode(encoding, errors)
132 def _parse_qsl(qs, keep_blank_values=False, strict_parsing=False,
133 encoding='utf-8', errors='replace'):
134 qs, _coerce_result = qs, unicode
135 pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')]
137 for name_value in pairs:
138 if not name_value and not strict_parsing:
140 nv = name_value.split('=', 1)
143 raise ValueError("bad query field: %r" % (name_value,))
144 # Handle case of a control-name with no equal sign
145 if keep_blank_values:
149 if len(nv[1]) or keep_blank_values:
150 name = nv[0].replace('+', ' ')
151 name = _unquote(name, encoding=encoding, errors=errors)
152 name = _coerce_result(name)
153 value = nv[1].replace('+', ' ')
154 value = _unquote(value, encoding=encoding, errors=errors)
155 value = _coerce_result(value)
156 r.append((name, value))
159 def compat_parse_qs(qs, keep_blank_values=False, strict_parsing=False,
160 encoding='utf-8', errors='replace'):
162 pairs = _parse_qsl(qs, keep_blank_values, strict_parsing,
163 encoding=encoding, errors=errors)
164 for name, value in pairs:
165 if name in parsed_result:
166 parsed_result[name].append(value)
168 parsed_result[name] = [value]
172 compat_str = unicode # Python 2
177 compat_chr = unichr # Python 2
182 from xml.etree.ElementTree import ParseError as compat_xml_parse_error
183 except ImportError: # Python 2.6
184 from xml.parsers.expat import ExpatError as compat_xml_parse_error
187 if type(c) is int: return c
190 # This is not clearly defined otherwise
191 compiled_regex_type = type(re.compile(''))
194 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0 (Chrome)',
195 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
196 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
197 'Accept-Encoding': 'gzip, deflate',
198 'Accept-Language': 'en-us,en;q=0.5',
201 def preferredencoding():
202 """Get preferred encoding.
204 Returns the best encoding scheme for the system, based on
205 locale.getpreferredencoding() and some further tweaks.
208 pref = locale.getpreferredencoding()
215 if sys.version_info < (3,0):
217 print(s.encode(preferredencoding(), 'xmlcharrefreplace'))
220 assert type(s) == type(u'')
223 # In Python 2.x, json.dump expects a bytestream.
224 # In Python 3.x, it writes to a character stream
225 if sys.version_info < (3,0):
226 def write_json_file(obj, fn):
227 with open(fn, 'wb') as f:
230 def write_json_file(obj, fn):
231 with open(fn, 'w', encoding='utf-8') as f:
234 if sys.version_info >= (2,7):
def find_xpath_attr(node, xpath, key, val):
    """Shorthand for node.find(xpath + "[@key='val']").

    key and val are interpolated into the XPath expression, so they are
    restricted to characters that cannot change its structure.
    """
    assert re.match(r'^[a-zA-Z]+$', key)
    assert re.match(r'^[a-zA-Z0-9@\s:._]*$', val)
    return node.find(xpath + u"[@%s='%s']" % (key, val))
242 def find_xpath_attr(node, xpath, key, val):
243 for f in node.findall(xpath):
244 if f.attrib.get(key) == val:
248 # On python2.6 the xml.etree.ElementTree.Element methods don't support
249 # the namespace parameter
250 def xpath_with_ns(path, ns_map):
251 components = [c.split(':') for c in path.split('/')]
255 replaced.append(c[0])
258 replaced.append('{%s}%s' % (ns_map[ns], tag))
259 return '/'.join(replaced)
261 def htmlentity_transform(matchobj):
262 """Transforms an HTML entity to a character.
264 This function receives a match object and is intended to be used with
265 the re.sub() function.
267 entity = matchobj.group(1)
269 # Known non-numeric HTML entity
270 if entity in compat_html_entities.name2codepoint:
271 return compat_chr(compat_html_entities.name2codepoint[entity])
273 mobj = re.match(u'(?u)#(x?\\d+)', entity)
275 numstr = mobj.group(1)
276 if numstr.startswith(u'x'):
278 numstr = u'0%s' % numstr
281 return compat_chr(int(numstr, base))
283 # Unknown entity in name, return its literal representation
284 return (u'&%s;' % entity)
286 compat_html_parser.locatestarttagend = re.compile(r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re.VERBOSE) # backport bugfix
287 class BaseHTMLParser(compat_html_parser.HTMLParser):
289 compat_html_parser.HTMLParser.__init__(self)
292 def loads(self, html):
297 class AttrParser(BaseHTMLParser):
298 """Modified HTMLParser that isolates a tag with the specified attribute"""
299 def __init__(self, attribute, value):
300 self.attribute = attribute
305 self.watch_startpos = False
307 BaseHTMLParser.__init__(self)
309 def error(self, message):
310 if self.error_count > 10 or self.started:
311 raise compat_html_parser.HTMLParseError(message, self.getpos())
312 self.rawdata = '\n'.join(self.html.split('\n')[self.getpos()[0]:]) # skip one line
313 self.error_count += 1
316 def handle_starttag(self, tag, attrs):
319 self.find_startpos(None)
320 if self.attribute in attrs and attrs[self.attribute] == self.value:
323 self.watch_startpos = True
325 if not tag in self.depth: self.depth[tag] = 0
328 def handle_endtag(self, tag):
330 if tag in self.depth: self.depth[tag] -= 1
331 if self.depth[self.result[0]] == 0:
333 self.result.append(self.getpos())
    def find_startpos(self, x):
        """Needed to put the start position of the result (self.result[1])
        after the opening tag with the requested id"""
        if self.watch_startpos:
            self.watch_startpos = False
            self.result.append(self.getpos())
    # The first parser event of any kind after the watched start tag should
    # record the position, so every handler is aliased to find_startpos;
    # the unused argument x absorbs whatever data each handler receives.
    handle_entityref = handle_charref = handle_data = handle_comment = \
        handle_decl = handle_pi = unknown_decl = find_startpos
344 def get_result(self):
345 if self.result is None:
347 if len(self.result) != 3:
349 lines = self.html.split('\n')
350 lines = lines[self.result[1][0]-1:self.result[2][0]]
351 lines[0] = lines[0][self.result[1][1]:]
353 lines[-1] = lines[-1][:self.result[2][1]-self.result[1][1]]
354 lines[-1] = lines[-1][:self.result[2][1]]
355 return '\n'.join(lines).strip()
# Hack for https://github.com/rg3/youtube-dl/issues/662
# On Python < 2.7.3, when the raw data at position i starts with the literal
# "</scr'+'ipt>" sequence, skip over it by length instead of delegating to
# HTMLParser.parse_endtag (which apparently mishandles it — see the issue).
if sys.version_info < (2, 7, 3):
    AttrParser.parse_endtag = (lambda self, i:
        i + len("</scr'+'ipt>")
        if self.rawdata[i:].startswith("</scr'+'ipt>")
        else compat_html_parser.HTMLParser.parse_endtag(self, i))
def get_element_by_id(id, html):
    """Return the content of the element carrying the given ID attribute.

    Thin wrapper around get_element_by_attribute with the attribute
    fixed to "id".
    """
    return get_element_by_attribute("id", id, html)
367 def get_element_by_attribute(attribute, value, html):
368 """Return the content of the tag with the specified attribute in the passed HTML document"""
369 parser = AttrParser(attribute, value)
372 except compat_html_parser.HTMLParseError:
374 return parser.get_result()
376 class MetaParser(BaseHTMLParser):
378 Modified HTMLParser that isolates a meta tag with the specified name
381 def __init__(self, name):
382 BaseHTMLParser.__init__(self)
387 def handle_starttag(self, tag, attrs):
391 if attrs.get('name') == self.name:
392 self.result = attrs.get('content')
394 def get_result(self):
397 def get_meta_content(name, html):
399 Return the content attribute from the meta tag with the given name attribute.
401 parser = MetaParser(name)
404 except compat_html_parser.HTMLParseError:
406 return parser.get_result()
409 def clean_html(html):
410 """Clean an HTML snippet into a readable string"""
412 html = html.replace('\n', ' ')
413 html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
414 html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
416 html = re.sub('<.*?>', '', html)
417 # Replace html entities
418 html = unescapeHTML(html)
422 def sanitize_open(filename, open_mode):
423 """Try to open the given filename, and slightly tweak it if this fails.
425 Attempts to open the given filename. If this fails, it tries to change
426 the filename slightly, step by step, until it's either able to open it
427 or it fails and raises a final exception, like the standard open()
430 It returns the tuple (stream, definitive_file_name).
434 if sys.platform == 'win32':
436 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
437 return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
438 stream = open(encodeFilename(filename), open_mode)
439 return (stream, filename)
440 except (IOError, OSError) as err:
441 if err.errno in (errno.EACCES,):
444 # In case of error, try to remove win32 forbidden chars
445 alt_filename = os.path.join(
446 re.sub(u'[/<>:"\\|\\\\?\\*]', u'#', path_part)
447 for path_part in os.path.split(filename)
449 if alt_filename == filename:
452 # An exception here should be caught in the caller
453 stream = open(encodeFilename(filename), open_mode)
454 return (stream, alt_filename)
457 def timeconvert(timestr):
458 """Convert RFC 2822 defined time string into system timestamp"""
460 timetuple = email.utils.parsedate_tz(timestr)
461 if timetuple is not None:
462 timestamp = email.utils.mktime_tz(timetuple)
465 def sanitize_filename(s, restricted=False, is_id=False):
466 """Sanitizes a string so it could be used as part of a filename.
467 If restricted is set, use a stricter subset of allowed characters.
468 Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
470 def replace_insane(char):
471 if char == '?' or ord(char) < 32 or ord(char) == 127:
474 return '' if restricted else '\''
476 return '_-' if restricted else ' -'
477 elif char in '\\/|*<>':
479 if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
481 if restricted and ord(char) > 127:
485 result = u''.join(map(replace_insane, s))
487 while '__' in result:
488 result = result.replace('__', '_')
489 result = result.strip('_')
490 # Common case of "Foreign band name - English song title"
491 if restricted and result.startswith('-_'):
497 def orderedSet(iterable):
498 """ Remove all duplicates from the input iterable """
509 assert type(s) == compat_str
511 result = re.sub(r'(?u)&(.+?);', htmlentity_transform, s)
515 def encodeFilename(s, for_subprocess=False):
517 @param s The name of the file
520 assert type(s) == compat_str
522 # Python 3 has a Unicode API
523 if sys.version_info >= (3, 0):
526 if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
527 # Pass u'' directly to use Unicode APIs on Windows 2000 and up
528 # (Detecting Windows NT 4 is tricky because 'major >= 4' would
529 # match Windows 9x series as well. Besides, NT 4 is obsolete.)
530 if not for_subprocess:
533 # For subprocess calls, encode with locale encoding
534 # Refer to http://stackoverflow.com/a/9951851/35070
535 encoding = preferredencoding()
537 encoding = sys.getfilesystemencoding()
540 return s.encode(encoding, 'ignore')
543 def decodeOption(optval):
546 if isinstance(optval, bytes):
547 optval = optval.decode(preferredencoding())
549 assert isinstance(optval, compat_str)
552 def formatSeconds(secs):
554 return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
556 return '%d:%02d' % (secs // 60, secs % 60)
561 def make_HTTPS_handler(opts_no_check_certificate, **kwargs):
562 if sys.version_info < (3, 2):
565 class HTTPSConnectionV3(httplib.HTTPSConnection):
566 def __init__(self, *args, **kwargs):
567 httplib.HTTPSConnection.__init__(self, *args, **kwargs)
570 sock = socket.create_connection((self.host, self.port), self.timeout)
571 if getattr(self, '_tunnel_host', False):
575 self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_SSLv3)
577 self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_SSLv23)
579 class HTTPSHandlerV3(compat_urllib_request.HTTPSHandler):
580 def https_open(self, req):
581 return self.do_open(HTTPSConnectionV3, req)
582 return HTTPSHandlerV3(**kwargs)
584 context = ssl.SSLContext(ssl.PROTOCOL_SSLv3)
585 context.verify_mode = (ssl.CERT_NONE
586 if opts_no_check_certificate
587 else ssl.CERT_REQUIRED)
588 context.set_default_verify_paths()
590 context.load_default_certs()
591 except AttributeError:
593 return compat_urllib_request.HTTPSHandler(context=context, **kwargs)
595 class ExtractorError(Exception):
596 """Error during info extraction."""
597 def __init__(self, msg, tb=None, expected=False, cause=None):
598 """ tb, if given, is the original traceback (so that it can be printed out).
599 If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
602 if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
605 msg = msg + u'; please report this issue on https://yt-dl.org/bug . Be sure to call youtube-dl with the --verbose flag and include its complete output. Make sure you are using the latest version; type youtube-dl -U to update.'
606 super(ExtractorError, self).__init__(msg)
609 self.exc_info = sys.exc_info() # preserve original exception
612 def format_traceback(self):
613 if self.traceback is None:
615 return u''.join(traceback.format_tb(self.traceback))
618 class RegexNotFoundError(ExtractorError):
619 """Error when a regex didn't match"""
623 class DownloadError(Exception):
624 """Download Error exception.
626 This exception may be thrown by FileDownloader objects if they are not
627 configured to continue on errors. They will contain the appropriate
630 def __init__(self, msg, exc_info=None):
631 """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
632 super(DownloadError, self).__init__(msg)
633 self.exc_info = exc_info
636 class SameFileError(Exception):
637 """Same File exception.
639 This exception will be thrown by FileDownloader objects if they detect
640 multiple files would have to be downloaded to the same file on disk.
645 class PostProcessingError(Exception):
646 """Post Processing exception.
648 This exception may be raised by PostProcessor's .run() method to
649 indicate an error in the postprocessing task.
651 def __init__(self, msg):
654 class MaxDownloadsReached(Exception):
655 """ --max-downloads limit has been reached. """
659 class UnavailableVideoError(Exception):
660 """Unavailable Format exception.
662 This exception will be thrown when a video is requested
663 in a format that is not available for that video.
668 class ContentTooShortError(Exception):
669 """Content Too Short exception.
671 This exception may be raised by FileDownloader objects when a file they
672 download is too small for what the server announced first, indicating
673 the connection was probably interrupted.
679 def __init__(self, downloaded, expected):
680 self.downloaded = downloaded
681 self.expected = expected
683 class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
684 """Handler for HTTP requests and responses.
686 This class, when installed with an OpenerDirector, automatically adds
687 the standard headers to every HTTP request and handles gzipped and
688 deflated responses from web servers. If compression is to be avoided in
689 a particular request, the original request in the program code only has
690 to include the HTTP header "Youtubedl-No-Compression", which will be
691 removed before making the real request.
693 Part of this code was copied from:
695 http://techknack.net/python-urllib2-handlers/
697 Andrew Rowls, the author of that code, agreed to release it to the
704 return zlib.decompress(data, -zlib.MAX_WBITS)
706 return zlib.decompress(data)
709 def addinfourl_wrapper(stream, headers, url, code):
710 if hasattr(compat_urllib_request.addinfourl, 'getcode'):
711 return compat_urllib_request.addinfourl(stream, headers, url, code)
712 ret = compat_urllib_request.addinfourl(stream, headers, url)
716 def http_request(self, req):
717 for h,v in std_headers.items():
721 if 'Youtubedl-no-compression' in req.headers:
722 if 'Accept-encoding' in req.headers:
723 del req.headers['Accept-encoding']
724 del req.headers['Youtubedl-no-compression']
725 if 'Youtubedl-user-agent' in req.headers:
726 if 'User-agent' in req.headers:
727 del req.headers['User-agent']
728 req.headers['User-agent'] = req.headers['Youtubedl-user-agent']
729 del req.headers['Youtubedl-user-agent']
732 def http_response(self, req, resp):
735 if resp.headers.get('Content-encoding', '') == 'gzip':
736 content = resp.read()
737 gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
739 uncompressed = io.BytesIO(gz.read())
740 except IOError as original_ioerror:
741 # There may be junk add the end of the file
742 # See http://stackoverflow.com/q/4928560/35070 for details
743 for i in range(1, 1024):
745 gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
746 uncompressed = io.BytesIO(gz.read())
751 raise original_ioerror
752 resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
753 resp.msg = old_resp.msg
755 if resp.headers.get('Content-encoding', '') == 'deflate':
756 gz = io.BytesIO(self.deflate(resp.read()))
757 resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
758 resp.msg = old_resp.msg
761 https_request = http_request
762 https_response = http_response
765 def parse_iso8601(date_str):
766 """ Return a UNIX timestamp from the given date """
772 r'Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$',
775 timezone = datetime.timedelta()
777 date_str = date_str[:-len(m.group(0))]
778 if not m.group('sign'):
779 timezone = datetime.timedelta()
781 sign = 1 if m.group('sign') == '+' else -1
782 timezone = datetime.timedelta(
783 hours=sign * int(m.group('hours')),
784 minutes=sign * int(m.group('minutes')))
786 dt = datetime.datetime.strptime(date_str, '%Y-%m-%dT%H:%M:%S') - timezone
787 return calendar.timegm(dt.timetuple())
790 def unified_strdate(date_str):
791 """Return a string with the date in the format YYYYMMDD"""
798 date_str = date_str.replace(',', ' ')
799 # %z (UTC offset) is only supported in python>=3.2
800 date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str)
801 format_expressions = [
813 '%Y-%m-%dT%H:%M:%SZ',
814 '%Y-%m-%dT%H:%M:%S.%fZ',
815 '%Y-%m-%dT%H:%M:%S.%f0Z',
817 '%Y-%m-%dT%H:%M:%S.%f',
820 for expression in format_expressions:
822 upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
825 if upload_date is None:
826 timetuple = email.utils.parsedate_tz(date_str)
828 upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
831 def determine_ext(url, default_ext=u'unknown_video'):
832 guess = url.partition(u'?')[0].rpartition(u'.')[2]
833 if re.match(r'^[A-Za-z0-9]+$', guess):
def subtitles_filename(filename, sub_lang, sub_format):
    """Build the subtitle file name: <base>.<language>.<format>."""
    base = filename.rsplit('.', 1)[0]
    return u'.'.join([base, sub_lang, sub_format])
841 def date_from_str(date_str):
843 Return a datetime object from a string in the format YYYYMMDD or
844 (now|today)[+-][0-9](day|week|month|year)(s)?"""
845 today = datetime.date.today()
846 if date_str == 'now'or date_str == 'today':
848 match = re.match('(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
849 if match is not None:
850 sign = match.group('sign')
851 time = int(match.group('time'))
854 unit = match.group('unit')
863 delta = datetime.timedelta(**{unit: time})
865 return datetime.datetime.strptime(date_str, "%Y%m%d").date()
867 def hyphenate_date(date_str):
869 Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
870 match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
871 if match is not None:
872 return '-'.join(match.groups())
876 class DateRange(object):
877 """Represents a time interval between two dates"""
878 def __init__(self, start=None, end=None):
879 """start and end must be strings in the format accepted by date"""
880 if start is not None:
881 self.start = date_from_str(start)
883 self.start = datetime.datetime.min.date()
885 self.end = date_from_str(end)
887 self.end = datetime.datetime.max.date()
888 if self.start > self.end:
889 raise ValueError('Date range: "%s" , the start date must be before the end date' % self)
892 """Returns a range that only contains the given day"""
894 def __contains__(self, date):
895 """Check if the date is in the range"""
896 if not isinstance(date, datetime.date):
897 date = date_from_str(date)
898 return self.start <= date <= self.end
900 return '%s - %s' % ( self.start.isoformat(), self.end.isoformat())
904 """ Returns the platform name as a compat_str """
905 res = platform.platform()
906 if isinstance(res, bytes):
907 res = res.decode(preferredencoding())
909 assert isinstance(res, compat_str)
913 def write_string(s, out=None):
916 assert type(s) == compat_str
918 if ('b' in getattr(out, 'mode', '') or
919 sys.version_info[0] < 3): # Python 2 lies about mode of sys.stderr
920 s = s.encode(preferredencoding(), 'ignore')
923 except UnicodeEncodeError:
924 # In Windows shells, this can fail even when the codec is just charmap!?
925 # See https://wiki.python.org/moin/PrintFails#Issue
926 if sys.platform == 'win32' and hasattr(out, 'encoding'):
927 s = s.encode(out.encoding, 'ignore').decode(out.encoding)
935 def bytes_to_intlist(bs):
938 if isinstance(bs[0], int): # Python 3
941 return [ord(c) for c in bs]
944 def intlist_to_bytes(xs):
947 if isinstance(chr(0), bytes): # Python 2
948 return ''.join([chr(x) for x in xs])
def get_cachedir(params=None):
    """Return the cache directory for youtube-dl.

    Uses params['cachedir'] when present, otherwise
    $XDG_CACHE_HOME/youtube-dl (with ~/.cache as the XDG fallback).
    """
    # None instead of a mutable {} default (shared-dict anti-pattern);
    # behavior for existing callers is unchanged.
    if params is None:
        params = {}
    cache_root = os.environ.get('XDG_CACHE_HOME',
                                os.path.expanduser('~/.cache'))
    return params.get('cachedir', os.path.join(cache_root, 'youtube-dl'))
959 # Cross-platform file locking
960 if sys.platform == 'win32':
961 import ctypes.wintypes
964 class OVERLAPPED(ctypes.Structure):
966 ('Internal', ctypes.wintypes.LPVOID),
967 ('InternalHigh', ctypes.wintypes.LPVOID),
968 ('Offset', ctypes.wintypes.DWORD),
969 ('OffsetHigh', ctypes.wintypes.DWORD),
970 ('hEvent', ctypes.wintypes.HANDLE),
973 kernel32 = ctypes.windll.kernel32
974 LockFileEx = kernel32.LockFileEx
975 LockFileEx.argtypes = [
976 ctypes.wintypes.HANDLE, # hFile
977 ctypes.wintypes.DWORD, # dwFlags
978 ctypes.wintypes.DWORD, # dwReserved
979 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
980 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
981 ctypes.POINTER(OVERLAPPED) # Overlapped
983 LockFileEx.restype = ctypes.wintypes.BOOL
984 UnlockFileEx = kernel32.UnlockFileEx
985 UnlockFileEx.argtypes = [
986 ctypes.wintypes.HANDLE, # hFile
987 ctypes.wintypes.DWORD, # dwReserved
988 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
989 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
990 ctypes.POINTER(OVERLAPPED) # Overlapped
992 UnlockFileEx.restype = ctypes.wintypes.BOOL
993 whole_low = 0xffffffff
994 whole_high = 0x7fffffff
    def _lock_file(f, exclusive):
        # The OVERLAPPED structure carries the lock's starting file offset
        # (0 here, i.e. lock from the beginning of the file).
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        # Stash the pointer on the file object so it stays alive until
        # _unlock_file passes the same OVERLAPPED to UnlockFileEx.
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)
        handle = msvcrt.get_osfhandle(f.fileno())
        # dwFlags: 0x2 requests an exclusive lock, 0x0 a shared one;
        # whole_low/whole_high give the byte range to lock.
        if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
                          whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Locking file failed: %r' % ctypes.FormatError())
    def _unlock_file(f):
        # Requires the OVERLAPPED pointer stored by _lock_file on this
        # same file object.
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        if not UnlockFileEx(handle, 0,
                            whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())
1017 def _lock_file(f, exclusive):
1018 fcntl.lockf(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)
1020 def _unlock_file(f):
1021 fcntl.lockf(f, fcntl.LOCK_UN)
1024 class locked_file(object):
1025 def __init__(self, filename, mode, encoding=None):
1026 assert mode in ['r', 'a', 'w']
1027 self.f = io.open(filename, mode, encoding=encoding)
1030 def __enter__(self):
1031 exclusive = self.mode != 'r'
1033 _lock_file(self.f, exclusive)
1039 def __exit__(self, etype, value, traceback):
1041 _unlock_file(self.f)
    def write(self, *args):
        # Forward directly to the wrapped file object.
        return self.f.write(*args)
    def read(self, *args):
        # Forward directly to the wrapped file object.
        return self.f.read(*args)
1055 def shell_quote(args):
1057 encoding = sys.getfilesystemencoding()
1058 if encoding is None:
1061 if isinstance(a, bytes):
1062 # We may get a filename encoded with 'encodeFilename'
1063 a = a.decode(encoding)
1064 quoted_args.append(pipes.quote(a))
1065 return u' '.join(quoted_args)
1068 def takewhile_inclusive(pred, seq):
1069 """ Like itertools.takewhile, but include the latest evaluated element
1070 (the first element so that Not pred(e)) """
1077 def smuggle_url(url, data):
1078 """ Pass additional data in a URL for internal use. """
1080 sdata = compat_urllib_parse.urlencode(
1081 {u'__youtubedl_smuggle': json.dumps(data)})
1082 return url + u'#' + sdata
1085 def unsmuggle_url(smug_url, default=None):
1086 if not '#__youtubedl_smuggle' in smug_url:
1087 return smug_url, default
1088 url, _, sdata = smug_url.rpartition(u'#')
1089 jsond = compat_parse_qs(sdata)[u'__youtubedl_smuggle'][0]
1090 data = json.loads(jsond)
1094 def format_bytes(bytes):
1097 if type(bytes) is str:
1098 bytes = float(bytes)
1102 exponent = int(math.log(bytes, 1024.0))
1103 suffix = [u'B', u'KiB', u'MiB', u'GiB', u'TiB', u'PiB', u'EiB', u'ZiB', u'YiB'][exponent]
1104 converted = float(bytes) / float(1024 ** exponent)
1105 return u'%.2f%s' % (converted, suffix)
1108 def str_to_int(int_str):
1109 int_str = re.sub(r'[,\.]', u'', int_str)
1113 def get_term_width():
1114 columns = os.environ.get('COLUMNS', None)
1119 sp = subprocess.Popen(
1121 stdout=subprocess.PIPE, stderr=subprocess.PIPE)
1122 out, err = sp.communicate()
1123 return int(out.split()[1])
1129 def month_by_name(name):
1130 """ Return the number of a month by (locale-independently) English name """
1133 u'January', u'February', u'March', u'April', u'May', u'June',
1134 u'July', u'August', u'September', u'October', u'November', u'December']
1136 return ENGLISH_NAMES.index(name) + 1
1141 def fix_xml_ampersands(xml_str):
1142 """Replace all the '&' by '&' in XML"""
1144 r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
1149 def setproctitle(title):
1150 assert isinstance(title, compat_str)
1152 libc = ctypes.cdll.LoadLibrary("libc.so.6")
1155 title_bytes = title.encode('utf-8')
1156 buf = ctypes.create_string_buffer(len(title_bytes))
1157 buf.value = title_bytes
1159 libc.prctl(15, buf, 0, 0, 0)
1160 except AttributeError:
1161 return # Strange libc, just skip this
1164 def remove_start(s, start):
1165 if s.startswith(start):
1166 return s[len(start):]
def url_basename(url):
    """Return the last path component of url ('' when the path is empty)."""
    parsed_path = compat_urlparse.urlparse(url).path
    return parsed_path.strip(u'/').split(u'/')[-1]
1175 class HEADRequest(compat_urllib_request.Request):
1176 def get_method(self):
def int_or_none(v, scale=1, default=None):
    """Convert v to an int divided by scale; return default when v is None.

    The new keyword-only-by-convention `default` parameter generalizes the
    original (which always yielded None for None input) and is fully
    backward-compatible.
    """
    return default if v is None else (int(v) // scale)
1184 def parse_duration(s):
1189 r'(?:(?:(?P<hours>[0-9]+)[:h])?(?P<mins>[0-9]+)[:m])?(?P<secs>[0-9]+)s?(?::[0-9]+)?$', s)
1192 res = int(m.group('secs'))
1194 res += int(m.group('mins')) * 60
1195 if m.group('hours'):
1196 res += int(m.group('hours')) * 60 * 60
def prepend_extension(filename, ext):
    """Insert ext before the real extension: ('a.mp4', 'temp') -> 'a.temp.mp4'."""
    base, orig_ext = os.path.splitext(filename)
    return u'{0}.{1}{2}'.format(base, ext, orig_ext)
1205 def check_executable(exe, args=[]):
1206 """ Checks if the given binary is installed somewhere in PATH, and returns its name.
1207 args can be a list of arguments for a short output (like -version) """
1209 subprocess.Popen([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
1215 class PagedList(object):
1216 def __init__(self, pagefunc, pagesize):
1217 self._pagefunc = pagefunc
1218 self._pagesize = pagesize
1221 # This is only useful for tests
1222 return len(self.getslice())
1224 def getslice(self, start=0, end=None):
1226 for pagenum in itertools.count(start // self._pagesize):
1227 firstid = pagenum * self._pagesize
1228 nextfirstid = pagenum * self._pagesize + self._pagesize
1229 if start >= nextfirstid:
1232 page_results = list(self._pagefunc(pagenum))
1235 start % self._pagesize
1236 if firstid <= start < nextfirstid
1240 ((end - 1) % self._pagesize) + 1
1241 if (end is not None and firstid <= end <= nextfirstid)
1244 if startv != 0 or endv is not None:
1245 page_results = page_results[startv:endv]
1246 res.extend(page_results)
1248 # A little optimization - if current page is not "full", ie. does
1249 # not contain page_size videos then we can assume that this page
1250 # is the last one - there are no more ids on further pages -
1251 # i.e. no need to query again.
1252 if len(page_results) + startv < self._pagesize:
1255 # If we got the whole page, but the next page is not interesting,
1256 # break out early as well
1257 if end == nextfirstid:
1262 def uppercase_escape(s):
1264 r'\\U([0-9a-fA-F]{8})',
1265 lambda m: compat_chr(int(m.group(1), base=16)), s)
1268 struct.pack(u'!I', 0)
1270 # In Python 2.6 (and some 2.7 versions), struct requires a bytes argument
    def struct_pack(spec, *args):
        # struct.pack on Python 2.6 (and some 2.7 versions) requires a bytes
        # format string (see the comment above), so encode unicode specs.
        if isinstance(spec, compat_str):
            spec = spec.encode('ascii')
        return struct.pack(spec, *args)
    def struct_unpack(spec, *args):
        # Mirror of struct_pack: encode unicode format strings to bytes
        # before delegating to struct.unpack.
        if isinstance(spec, compat_str):
            spec = spec.encode('ascii')
        return struct.unpack(spec, *args)
1281 struct_pack = struct.pack
1282 struct_unpack = struct.unpack
1285 def read_batch_urls(batch_fd):
1287 if not isinstance(url, compat_str):
1288 url = url.decode('utf-8', 'replace')
1289 BOM_UTF8 = u'\xef\xbb\xbf'
1290 if url.startswith(BOM_UTF8):
1291 url = url[len(BOM_UTF8):]
1293 if url.startswith(('#', ';', ']')):
1297 with contextlib.closing(batch_fd) as fd:
1298 return [url for url in map(fixup, fd) if url]
def urlencode_postdata(*args, **kargs):
    """URL-encode the given arguments and return ASCII bytes for a POST body."""
    encoded = compat_urllib_parse.urlencode(*args, **kargs)
    return encoded.encode('ascii')
    class TreeBuilder(xml.etree.ElementTree.TreeBuilder):
        def doctype(self, name, pubid, system):
            # Accept and discard DOCTYPE declarations instead of the base
            # TreeBuilder's handling of them.
            pass # Ignore doctypes
1310 parser = xml.etree.ElementTree.XMLParser(target=TreeBuilder())
1311 kwargs = {'parser': parser} if sys.version_info >= (2, 7) else {}
1312 return xml.etree.ElementTree.XML(s.encode('utf-8'), **kwargs)
1315 if sys.version_info < (3, 0) and sys.platform == 'win32':
    def compat_getpass(prompt, *args, **kwargs):
        # This branch only runs on Python 2 under win32 (see the guard
        # above); encode unicode prompts with the locale's preferred
        # encoding before delegating to getpass.
        if isinstance(prompt, compat_str):
            prompt = prompt.encode(preferredencoding())
        return getpass.getpass(prompt, *args, **kwargs)
1321 compat_getpass = getpass.getpass