2 # -*- coding: utf-8 -*-
27 import urllib.request as compat_urllib_request
28 except ImportError: # Python 2
29 import urllib2 as compat_urllib_request
32 import urllib.error as compat_urllib_error
33 except ImportError: # Python 2
34 import urllib2 as compat_urllib_error
37 import urllib.parse as compat_urllib_parse
38 except ImportError: # Python 2
39 import urllib as compat_urllib_parse
42 from urllib.parse import urlparse as compat_urllib_parse_urlparse
43 except ImportError: # Python 2
44 from urlparse import urlparse as compat_urllib_parse_urlparse
47 import urllib.parse as compat_urlparse
48 except ImportError: # Python 2
49 import urlparse as compat_urlparse
52 import http.cookiejar as compat_cookiejar
53 except ImportError: # Python 2
54 import cookielib as compat_cookiejar
57 import html.entities as compat_html_entities
58 except ImportError: # Python 2
59 import htmlentitydefs as compat_html_entities
62 import html.parser as compat_html_parser
63 except ImportError: # Python 2
64 import HTMLParser as compat_html_parser
67 import http.client as compat_http_client
68 except ImportError: # Python 2
69 import httplib as compat_http_client
72 from urllib.error import HTTPError as compat_HTTPError
73 except ImportError: # Python 2
74 from urllib2 import HTTPError as compat_HTTPError
77 from urllib.request import urlretrieve as compat_urlretrieve
78 except ImportError: # Python 2
79 from urllib import urlretrieve as compat_urlretrieve
83 from subprocess import DEVNULL
84 compat_subprocess_get_DEVNULL = lambda: DEVNULL
86 compat_subprocess_get_DEVNULL = lambda: open(os.path.devnull, 'w')
89 from urllib.parse import parse_qs as compat_parse_qs
90 except ImportError: # Python 2
91 # HACK: The following is the correct parse_qs implementation from cpython 3's stdlib.
92 # Python 2's version is apparently totally broken
93 def _unquote(string, encoding='utf-8', errors='replace'):
96 res = string.split('%')
103 # pct_sequence: contiguous sequence of percent-encoded bytes, decoded
110 pct_sequence += item[:2].decode('hex')
113 # This segment was just a single percent-encoded character.
114 # May be part of a sequence of code units, so delay decoding.
115 # (Stored in pct_sequence).
119 # Encountered non-percent-encoded characters. Flush the current
121 string += pct_sequence.decode(encoding, errors) + rest
124 # Flush the final pct_sequence
125 string += pct_sequence.decode(encoding, errors)
128 def _parse_qsl(qs, keep_blank_values=False, strict_parsing=False,
129 encoding='utf-8', errors='replace'):
130 qs, _coerce_result = qs, unicode
131 pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')]
133 for name_value in pairs:
134 if not name_value and not strict_parsing:
136 nv = name_value.split('=', 1)
139 raise ValueError("bad query field: %r" % (name_value,))
140 # Handle case of a control-name with no equal sign
141 if keep_blank_values:
145 if len(nv[1]) or keep_blank_values:
146 name = nv[0].replace('+', ' ')
147 name = _unquote(name, encoding=encoding, errors=errors)
148 name = _coerce_result(name)
149 value = nv[1].replace('+', ' ')
150 value = _unquote(value, encoding=encoding, errors=errors)
151 value = _coerce_result(value)
152 r.append((name, value))
155 def compat_parse_qs(qs, keep_blank_values=False, strict_parsing=False,
156 encoding='utf-8', errors='replace'):
158 pairs = _parse_qsl(qs, keep_blank_values, strict_parsing,
159 encoding=encoding, errors=errors)
160 for name, value in pairs:
161 if name in parsed_result:
162 parsed_result[name].append(value)
164 parsed_result[name] = [value]
168 compat_str = unicode # Python 2
173 compat_chr = unichr # Python 2
178 from xml.etree.ElementTree import ParseError as compat_xml_parse_error
179 except ImportError: # Python 2.6
180 from xml.parsers.expat import ExpatError as compat_xml_parse_error
183 if type(c) is int: return c
186 # This is not clearly defined otherwise
187 compiled_regex_type = type(re.compile(''))
190 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0 (Chrome)',
191 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
192 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
193 'Accept-Encoding': 'gzip, deflate',
194 'Accept-Language': 'en-us,en;q=0.5',
197 def preferredencoding():
198 """Get preferred encoding.
200 Returns the best encoding scheme for the system, based on
201 locale.getpreferredencoding() and some further tweaks.
204 pref = locale.getpreferredencoding()
211 if sys.version_info < (3,0):
213 print(s.encode(preferredencoding(), 'xmlcharrefreplace'))
216 assert type(s) == type(u'')
219 # In Python 2.x, json.dump expects a bytestream.
220 # In Python 3.x, it writes to a character stream
221 if sys.version_info < (3,0):
222 def write_json_file(obj, fn):
223 with open(fn, 'wb') as f:
226 def write_json_file(obj, fn):
227 with open(fn, 'w', encoding='utf-8') as f:
230 if sys.version_info >= (2,7):
def find_xpath_attr(node, xpath, key, val):
    """Locate the first element matching *xpath* whose attribute *key*
    equals *val* — i.e. the XPath expression ``xpath[@key='val']``."""
    # Only plain attribute names and simple values are accepted; anything
    # fancier would need quoting that ElementTree's XPath subset lacks.
    assert re.match(r'^[a-zA-Z]+$', key)
    assert re.match(r'^[a-zA-Z0-9@\s:._]*$', val)
    predicate = u"[@%s='%s']" % (key, val)
    return node.find(xpath + predicate)
238 def find_xpath_attr(node, xpath, key, val):
239 for f in node.findall(xpath):
240 if f.attrib.get(key) == val:
244 # On python2.6 the xml.etree.ElementTree.Element methods don't support
245 # the namespace parameter
246 def xpath_with_ns(path, ns_map):
247 components = [c.split(':') for c in path.split('/')]
251 replaced.append(c[0])
254 replaced.append('{%s}%s' % (ns_map[ns], tag))
255 return '/'.join(replaced)
257 def htmlentity_transform(matchobj):
258 """Transforms an HTML entity to a character.
260 This function receives a match object and is intended to be used with
261 the re.sub() function.
263 entity = matchobj.group(1)
265 # Known non-numeric HTML entity
266 if entity in compat_html_entities.name2codepoint:
267 return compat_chr(compat_html_entities.name2codepoint[entity])
269 mobj = re.match(u'(?u)#(x?\\d+)', entity)
271 numstr = mobj.group(1)
272 if numstr.startswith(u'x'):
274 numstr = u'0%s' % numstr
277 return compat_chr(int(numstr, base))
279 # Unknown entity in name, return its literal representation
280 return (u'&%s;' % entity)
282 compat_html_parser.locatestarttagend = re.compile(r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re.VERBOSE) # backport bugfix
283 class BaseHTMLParser(compat_html_parser.HTMLParser):
285 compat_html_parser.HTMLParser.__init__(self)
288 def loads(self, html):
293 class AttrParser(BaseHTMLParser):
294 """Modified HTMLParser that isolates a tag with the specified attribute"""
295 def __init__(self, attribute, value):
296 self.attribute = attribute
301 self.watch_startpos = False
303 BaseHTMLParser.__init__(self)
305 def error(self, message):
306 if self.error_count > 10 or self.started:
307 raise compat_html_parser.HTMLParseError(message, self.getpos())
308 self.rawdata = '\n'.join(self.html.split('\n')[self.getpos()[0]:]) # skip one line
309 self.error_count += 1
312 def handle_starttag(self, tag, attrs):
315 self.find_startpos(None)
316 if self.attribute in attrs and attrs[self.attribute] == self.value:
319 self.watch_startpos = True
321 if not tag in self.depth: self.depth[tag] = 0
324 def handle_endtag(self, tag):
326 if tag in self.depth: self.depth[tag] -= 1
327 if self.depth[self.result[0]] == 0:
329 self.result.append(self.getpos())
331 def find_startpos(self, x):
332 """Needed to put the start position of the result (self.result[1])
333 after the opening tag with the requested id"""
334 if self.watch_startpos:
335 self.watch_startpos = False
336 self.result.append(self.getpos())
337 handle_entityref = handle_charref = handle_data = handle_comment = \
338 handle_decl = handle_pi = unknown_decl = find_startpos
340 def get_result(self):
341 if self.result is None:
343 if len(self.result) != 3:
345 lines = self.html.split('\n')
346 lines = lines[self.result[1][0]-1:self.result[2][0]]
347 lines[0] = lines[0][self.result[1][1]:]
349 lines[-1] = lines[-1][:self.result[2][1]-self.result[1][1]]
350 lines[-1] = lines[-1][:self.result[2][1]]
351 return '\n'.join(lines).strip()
352 # Hack for https://github.com/rg3/youtube-dl/issues/662
353 if sys.version_info < (2, 7, 3):
354 AttrParser.parse_endtag = (lambda self, i:
355 i + len("</scr'+'ipt>")
356 if self.rawdata[i:].startswith("</scr'+'ipt>")
357 else compat_html_parser.HTMLParser.parse_endtag(self, i))
def get_element_by_id(id, html):
    """Return the content of the tag carrying the given ID attribute
    within the supplied HTML document."""
    # Thin convenience wrapper: an ID lookup is just an attribute lookup
    # with the attribute fixed to "id".
    attribute = "id"
    return get_element_by_attribute(attribute, id, html)
363 def get_element_by_attribute(attribute, value, html):
364 """Return the content of the tag with the specified attribute in the passed HTML document"""
365 parser = AttrParser(attribute, value)
368 except compat_html_parser.HTMLParseError:
370 return parser.get_result()
372 class MetaParser(BaseHTMLParser):
374 Modified HTMLParser that isolates a meta tag with the specified name
377 def __init__(self, name):
378 BaseHTMLParser.__init__(self)
383 def handle_starttag(self, tag, attrs):
387 if attrs.get('name') == self.name:
388 self.result = attrs.get('content')
390 def get_result(self):
393 def get_meta_content(name, html):
395 Return the content attribute from the meta tag with the given name attribute.
397 parser = MetaParser(name)
400 except compat_html_parser.HTMLParseError:
402 return parser.get_result()
405 def clean_html(html):
406 """Clean an HTML snippet into a readable string"""
408 html = html.replace('\n', ' ')
409 html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
410 html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
412 html = re.sub('<.*?>', '', html)
413 # Replace html entities
414 html = unescapeHTML(html)
418 def sanitize_open(filename, open_mode):
419 """Try to open the given filename, and slightly tweak it if this fails.
421 Attempts to open the given filename. If this fails, it tries to change
422 the filename slightly, step by step, until it's either able to open it
423 or it fails and raises a final exception, like the standard open()
426 It returns the tuple (stream, definitive_file_name).
430 if sys.platform == 'win32':
432 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
433 return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
434 stream = open(encodeFilename(filename), open_mode)
435 return (stream, filename)
436 except (IOError, OSError) as err:
437 if err.errno in (errno.EACCES,):
440 # In case of error, try to remove win32 forbidden chars
441 alt_filename = os.path.join(
442 re.sub(u'[/<>:"\\|\\\\?\\*]', u'#', path_part)
443 for path_part in os.path.split(filename)
445 if alt_filename == filename:
448 # An exception here should be caught in the caller
449 stream = open(encodeFilename(filename), open_mode)
450 return (stream, alt_filename)
453 def timeconvert(timestr):
454 """Convert RFC 2822 defined time string into system timestamp"""
456 timetuple = email.utils.parsedate_tz(timestr)
457 if timetuple is not None:
458 timestamp = email.utils.mktime_tz(timetuple)
461 def sanitize_filename(s, restricted=False, is_id=False):
462 """Sanitizes a string so it could be used as part of a filename.
463 If restricted is set, use a stricter subset of allowed characters.
464 Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
466 def replace_insane(char):
467 if char == '?' or ord(char) < 32 or ord(char) == 127:
470 return '' if restricted else '\''
472 return '_-' if restricted else ' -'
473 elif char in '\\/|*<>':
475 if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
477 if restricted and ord(char) > 127:
481 result = u''.join(map(replace_insane, s))
483 while '__' in result:
484 result = result.replace('__', '_')
485 result = result.strip('_')
486 # Common case of "Foreign band name - English song title"
487 if restricted and result.startswith('-_'):
493 def orderedSet(iterable):
494 """ Remove all duplicates from the input iterable """
505 assert type(s) == type(u'')
507 result = re.sub(u'(?u)&(.+?);', htmlentity_transform, s)
511 def encodeFilename(s, for_subprocess=False):
513 @param s The name of the file
516 assert type(s) == compat_str
518 # Python 3 has a Unicode API
519 if sys.version_info >= (3, 0):
522 if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
523 # Pass u'' directly to use Unicode APIs on Windows 2000 and up
524 # (Detecting Windows NT 4 is tricky because 'major >= 4' would
525 # match Windows 9x series as well. Besides, NT 4 is obsolete.)
526 if not for_subprocess:
529 # For subprocess calls, encode with locale encoding
530 # Refer to http://stackoverflow.com/a/9951851/35070
531 encoding = preferredencoding()
533 encoding = sys.getfilesystemencoding()
536 return s.encode(encoding, 'ignore')
539 def decodeOption(optval):
542 if isinstance(optval, bytes):
543 optval = optval.decode(preferredencoding())
545 assert isinstance(optval, compat_str)
548 def formatSeconds(secs):
550 return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
552 return '%d:%02d' % (secs // 60, secs % 60)
557 def make_HTTPS_handler(opts_no_check_certificate, **kwargs):
558 if sys.version_info < (3, 2):
561 class HTTPSConnectionV3(httplib.HTTPSConnection):
562 def __init__(self, *args, **kwargs):
563 httplib.HTTPSConnection.__init__(self, *args, **kwargs)
566 sock = socket.create_connection((self.host, self.port), self.timeout)
567 if getattr(self, '_tunnel_host', False):
571 self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_SSLv3)
573 self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_SSLv23)
575 class HTTPSHandlerV3(compat_urllib_request.HTTPSHandler):
576 def https_open(self, req):
577 return self.do_open(HTTPSConnectionV3, req)
578 return HTTPSHandlerV3(**kwargs)
580 context = ssl.SSLContext(ssl.PROTOCOL_SSLv3)
581 context.verify_mode = (ssl.CERT_NONE
582 if opts_no_check_certificate
583 else ssl.CERT_REQUIRED)
584 context.set_default_verify_paths()
586 context.load_default_certs()
587 except AttributeError:
589 return compat_urllib_request.HTTPSHandler(context=context, **kwargs)
591 class ExtractorError(Exception):
592 """Error during info extraction."""
593 def __init__(self, msg, tb=None, expected=False, cause=None):
594 """ tb, if given, is the original traceback (so that it can be printed out).
595 If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
598 if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
601 msg = msg + u'; please report this issue on https://yt-dl.org/bug . Be sure to call youtube-dl with the --verbose flag and include its complete output. Make sure you are using the latest version; type youtube-dl -U to update.'
602 super(ExtractorError, self).__init__(msg)
605 self.exc_info = sys.exc_info() # preserve original exception
608 def format_traceback(self):
609 if self.traceback is None:
611 return u''.join(traceback.format_tb(self.traceback))
614 class RegexNotFoundError(ExtractorError):
615 """Error when a regex didn't match"""
619 class DownloadError(Exception):
620 """Download Error exception.
622 This exception may be thrown by FileDownloader objects if they are not
623 configured to continue on errors. They will contain the appropriate
626 def __init__(self, msg, exc_info=None):
627 """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
628 super(DownloadError, self).__init__(msg)
629 self.exc_info = exc_info
632 class SameFileError(Exception):
633 """Same File exception.
635 This exception will be thrown by FileDownloader objects if they detect
636 multiple files would have to be downloaded to the same file on disk.
641 class PostProcessingError(Exception):
642 """Post Processing exception.
644 This exception may be raised by PostProcessor's .run() method to
645 indicate an error in the postprocessing task.
647 def __init__(self, msg):
650 class MaxDownloadsReached(Exception):
651 """ --max-downloads limit has been reached. """
655 class UnavailableVideoError(Exception):
656 """Unavailable Format exception.
658 This exception will be thrown when a video is requested
659 in a format that is not available for that video.
664 class ContentTooShortError(Exception):
665 """Content Too Short exception.
667 This exception may be raised by FileDownloader objects when a file they
668 download is too small for what the server announced first, indicating
669 the connection was probably interrupted.
675 def __init__(self, downloaded, expected):
676 self.downloaded = downloaded
677 self.expected = expected
679 class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
680 """Handler for HTTP requests and responses.
682 This class, when installed with an OpenerDirector, automatically adds
683 the standard headers to every HTTP request and handles gzipped and
684 deflated responses from web servers. If compression is to be avoided in
685 a particular request, the original request in the program code only has
686 to include the HTTP header "Youtubedl-No-Compression", which will be
687 removed before making the real request.
689 Part of this code was copied from:
691 http://techknack.net/python-urllib2-handlers/
693 Andrew Rowls, the author of that code, agreed to release it to the
700 return zlib.decompress(data, -zlib.MAX_WBITS)
702 return zlib.decompress(data)
705 def addinfourl_wrapper(stream, headers, url, code):
706 if hasattr(compat_urllib_request.addinfourl, 'getcode'):
707 return compat_urllib_request.addinfourl(stream, headers, url, code)
708 ret = compat_urllib_request.addinfourl(stream, headers, url)
712 def http_request(self, req):
713 for h,v in std_headers.items():
717 if 'Youtubedl-no-compression' in req.headers:
718 if 'Accept-encoding' in req.headers:
719 del req.headers['Accept-encoding']
720 del req.headers['Youtubedl-no-compression']
721 if 'Youtubedl-user-agent' in req.headers:
722 if 'User-agent' in req.headers:
723 del req.headers['User-agent']
724 req.headers['User-agent'] = req.headers['Youtubedl-user-agent']
725 del req.headers['Youtubedl-user-agent']
728 def http_response(self, req, resp):
731 if resp.headers.get('Content-encoding', '') == 'gzip':
732 content = resp.read()
733 gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
735 uncompressed = io.BytesIO(gz.read())
736 except IOError as original_ioerror:
737 # There may be junk add the end of the file
738 # See http://stackoverflow.com/q/4928560/35070 for details
739 for i in range(1, 1024):
741 gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
742 uncompressed = io.BytesIO(gz.read())
747 raise original_ioerror
748 resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
749 resp.msg = old_resp.msg
751 if resp.headers.get('Content-encoding', '') == 'deflate':
752 gz = io.BytesIO(self.deflate(resp.read()))
753 resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
754 resp.msg = old_resp.msg
757 https_request = http_request
758 https_response = http_response
761 def unified_strdate(date_str):
762 """Return a string with the date in the format YYYYMMDD"""
765 date_str = date_str.replace(',', ' ')
766 # %z (UTC offset) is only supported in python>=3.2
767 date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str)
768 format_expressions = [
778 '%Y-%m-%dT%H:%M:%SZ',
779 '%Y-%m-%dT%H:%M:%S.%fZ',
780 '%Y-%m-%dT%H:%M:%S.%f0Z',
782 '%Y-%m-%dT%H:%M:%S.%f',
785 for expression in format_expressions:
787 upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
790 if upload_date is None:
791 timetuple = email.utils.parsedate_tz(date_str)
793 upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
796 def determine_ext(url, default_ext=u'unknown_video'):
797 guess = url.partition(u'?')[0].rpartition(u'.')[2]
798 if re.match(r'^[A-Za-z0-9]+$', guess):
def subtitles_filename(filename, sub_lang, sub_format):
    """Build the subtitle file name for *filename*: the media extension is
    replaced by '<language>.<format>' (e.g. 'movie.mp4' -> 'movie.en.srt')."""
    base = filename.rsplit('.', 1)[0]
    return u'.'.join((base, sub_lang, sub_format))
806 def date_from_str(date_str):
808 Return a datetime object from a string in the format YYYYMMDD or
809 (now|today)[+-][0-9](day|week|month|year)(s)?"""
810 today = datetime.date.today()
811 if date_str == 'now'or date_str == 'today':
813 match = re.match('(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
814 if match is not None:
815 sign = match.group('sign')
816 time = int(match.group('time'))
819 unit = match.group('unit')
828 delta = datetime.timedelta(**{unit: time})
830 return datetime.datetime.strptime(date_str, "%Y%m%d").date()
832 def hyphenate_date(date_str):
834 Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
835 match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
836 if match is not None:
837 return '-'.join(match.groups())
841 class DateRange(object):
842 """Represents a time interval between two dates"""
843 def __init__(self, start=None, end=None):
844 """start and end must be strings in the format accepted by date"""
845 if start is not None:
846 self.start = date_from_str(start)
848 self.start = datetime.datetime.min.date()
850 self.end = date_from_str(end)
852 self.end = datetime.datetime.max.date()
853 if self.start > self.end:
854 raise ValueError('Date range: "%s" , the start date must be before the end date' % self)
857 """Returns a range that only contains the given day"""
859 def __contains__(self, date):
860 """Check if the date is in the range"""
861 if not isinstance(date, datetime.date):
862 date = date_from_str(date)
863 return self.start <= date <= self.end
865 return '%s - %s' % ( self.start.isoformat(), self.end.isoformat())
869 """ Returns the platform name as a compat_str """
870 res = platform.platform()
871 if isinstance(res, bytes):
872 res = res.decode(preferredencoding())
874 assert isinstance(res, compat_str)
878 def write_string(s, out=None):
881 assert type(s) == compat_str
883 if ('b' in getattr(out, 'mode', '') or
884 sys.version_info[0] < 3): # Python 2 lies about mode of sys.stderr
885 s = s.encode(preferredencoding(), 'ignore')
888 except UnicodeEncodeError:
889 # In Windows shells, this can fail even when the codec is just charmap!?
890 # See https://wiki.python.org/moin/PrintFails#Issue
891 if sys.platform == 'win32' and hasattr(out, 'encoding'):
892 s = s.encode(out.encoding, 'ignore').decode(out.encoding)
900 def bytes_to_intlist(bs):
903 if isinstance(bs[0], int): # Python 3
906 return [ord(c) for c in bs]
909 def intlist_to_bytes(xs):
912 if isinstance(chr(0), bytes): # Python 2
913 return ''.join([chr(x) for x in xs])
def get_cachedir(params=None):
    """Return the directory used for youtube-dl's cache files.

    Resolution order: the 'cachedir' entry of *params*, then the
    XDG_CACHE_HOME environment variable, then ~/.cache.

    @param params optional options dict; only its 'cachedir' key is read
    """
    # Fixed: the original signature used a mutable default (params={}).
    # It was never mutated here, but None is the safe idiom and behaves
    # identically for all callers.
    if params is None:
        params = {}
    cache_root = os.environ.get('XDG_CACHE_HOME',
                                os.path.expanduser('~/.cache'))
    return params.get('cachedir', os.path.join(cache_root, 'youtube-dl'))
924 # Cross-platform file locking
925 if sys.platform == 'win32':
926 import ctypes.wintypes
929 class OVERLAPPED(ctypes.Structure):
931 ('Internal', ctypes.wintypes.LPVOID),
932 ('InternalHigh', ctypes.wintypes.LPVOID),
933 ('Offset', ctypes.wintypes.DWORD),
934 ('OffsetHigh', ctypes.wintypes.DWORD),
935 ('hEvent', ctypes.wintypes.HANDLE),
938 kernel32 = ctypes.windll.kernel32
939 LockFileEx = kernel32.LockFileEx
940 LockFileEx.argtypes = [
941 ctypes.wintypes.HANDLE, # hFile
942 ctypes.wintypes.DWORD, # dwFlags
943 ctypes.wintypes.DWORD, # dwReserved
944 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
945 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
946 ctypes.POINTER(OVERLAPPED) # Overlapped
948 LockFileEx.restype = ctypes.wintypes.BOOL
949 UnlockFileEx = kernel32.UnlockFileEx
950 UnlockFileEx.argtypes = [
951 ctypes.wintypes.HANDLE, # hFile
952 ctypes.wintypes.DWORD, # dwReserved
953 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
954 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
955 ctypes.POINTER(OVERLAPPED) # Overlapped
957 UnlockFileEx.restype = ctypes.wintypes.BOOL
958 whole_low = 0xffffffff
959 whole_high = 0x7fffffff
961 def _lock_file(f, exclusive):
962 overlapped = OVERLAPPED()
963 overlapped.Offset = 0
964 overlapped.OffsetHigh = 0
965 overlapped.hEvent = 0
966 f._lock_file_overlapped_p = ctypes.pointer(overlapped)
967 handle = msvcrt.get_osfhandle(f.fileno())
968 if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
969 whole_low, whole_high, f._lock_file_overlapped_p):
970 raise OSError('Locking file failed: %r' % ctypes.FormatError())
973 assert f._lock_file_overlapped_p
974 handle = msvcrt.get_osfhandle(f.fileno())
975 if not UnlockFileEx(handle, 0,
976 whole_low, whole_high, f._lock_file_overlapped_p):
977 raise OSError('Unlocking file failed: %r' % ctypes.FormatError())
982 def _lock_file(f, exclusive):
983 fcntl.lockf(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)
986 fcntl.lockf(f, fcntl.LOCK_UN)
989 class locked_file(object):
990 def __init__(self, filename, mode, encoding=None):
991 assert mode in ['r', 'a', 'w']
992 self.f = io.open(filename, mode, encoding=encoding)
996 exclusive = self.mode != 'r'
998 _lock_file(self.f, exclusive)
1004 def __exit__(self, etype, value, traceback):
1006 _unlock_file(self.f)
1013 def write(self, *args):
1014 return self.f.write(*args)
1016 def read(self, *args):
1017 return self.f.read(*args)
1020 def shell_quote(args):
1022 encoding = sys.getfilesystemencoding()
1023 if encoding is None:
1026 if isinstance(a, bytes):
1027 # We may get a filename encoded with 'encodeFilename'
1028 a = a.decode(encoding)
1029 quoted_args.append(pipes.quote(a))
1030 return u' '.join(quoted_args)
1033 def takewhile_inclusive(pred, seq):
1034 """ Like itertools.takewhile, but include the latest evaluated element
1035 (the first element so that Not pred(e)) """
def smuggle_url(url, data):
    """Pass additional data in a URL for internal use.

    *data* is JSON-encoded and appended to *url* as a percent-encoded
    fragment; the inverse operation is performed by unsmuggle_url.
    """
    payload = {u'__youtubedl_smuggle': json.dumps(data)}
    return u'#'.join((url, compat_urllib_parse.urlencode(payload)))
1050 def unsmuggle_url(smug_url, default=None):
1051 if not '#__youtubedl_smuggle' in smug_url:
1052 return smug_url, default
1053 url, _, sdata = smug_url.rpartition(u'#')
1054 jsond = compat_parse_qs(sdata)[u'__youtubedl_smuggle'][0]
1055 data = json.loads(jsond)
1059 def format_bytes(bytes):
1062 if type(bytes) is str:
1063 bytes = float(bytes)
1067 exponent = int(math.log(bytes, 1024.0))
1068 suffix = [u'B', u'KiB', u'MiB', u'GiB', u'TiB', u'PiB', u'EiB', u'ZiB', u'YiB'][exponent]
1069 converted = float(bytes) / float(1024 ** exponent)
1070 return u'%.2f%s' % (converted, suffix)
1073 def str_to_int(int_str):
1074 int_str = re.sub(r'[,\.]', u'', int_str)
1078 def get_term_width():
1079 columns = os.environ.get('COLUMNS', None)
1084 sp = subprocess.Popen(
1086 stdout=subprocess.PIPE, stderr=subprocess.PIPE)
1087 out, err = sp.communicate()
1088 return int(out.split()[1])
1094 def month_by_name(name):
1095 """ Return the number of a month by (locale-independently) English name """
1098 u'January', u'February', u'March', u'April', u'May', u'June',
1099 u'July', u'August', u'September', u'October', u'November', u'December']
1101 return ENGLISH_NAMES.index(name) + 1
1106 def fix_xml_ampersands(xml_str):
1107 """Replace all the '&' by '&' in XML"""
1109 r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
1114 def setproctitle(title):
1115 assert isinstance(title, compat_str)
1117 libc = ctypes.cdll.LoadLibrary("libc.so.6")
1121 buf = ctypes.create_string_buffer(len(title) + 1)
1122 buf.value = title.encode('utf-8')
1124 libc.prctl(15, ctypes.byref(buf), 0, 0, 0)
1125 except AttributeError:
1126 return # Strange libc, just skip this
1129 def remove_start(s, start):
1130 if s.startswith(start):
1131 return s[len(start):]
def url_basename(url):
    """Return the last path component of *url*; query string, fragment
    and trailing slashes are ignored."""
    path = compat_urlparse.urlparse(url).path
    components = path.strip(u'/').split(u'/')
    return components[-1]
1140 class HEADRequest(compat_urllib_request.Request):
1141 def get_method(self):
def int_or_none(v, scale=1):
    """Convert *v* to an int divided (floor) by *scale*; a None input is
    passed through unchanged."""
    if v is None:
        return None
    return int(v) // scale
1149 def parse_duration(s):
1154 r'(?:(?:(?P<hours>[0-9]+)[:h])?(?P<mins>[0-9]+)[:m])?(?P<secs>[0-9]+)s?$', s)
1157 res = int(m.group('secs'))
1159 res += int(m.group('mins')) * 60
1160 if m.group('hours'):
1161 res += int(m.group('hours')) * 60 * 60
def prepend_extension(filename, ext):
    """Insert *ext* between the file name and its real extension:
    prepend_extension('clip.mp4', 'part') -> 'clip.part.mp4'."""
    root, real_ext = os.path.splitext(filename)
    return u'%s.%s%s' % (root, ext, real_ext)
1170 def check_executable(exe, args=[]):
1171 """ Checks if the given binary is installed somewhere in PATH, and returns its name.
1172 args can be a list of arguments for a short output (like -version) """
1174 subprocess.Popen([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
1180 class PagedList(object):
1181 def __init__(self, pagefunc, pagesize):
1182 self._pagefunc = pagefunc
1183 self._pagesize = pagesize
1186 # This is only useful for tests
1187 return len(self.getslice())
1189 def getslice(self, start=0, end=None):
1191 for pagenum in itertools.count(start // self._pagesize):
1192 firstid = pagenum * self._pagesize
1193 nextfirstid = pagenum * self._pagesize + self._pagesize
1194 if start >= nextfirstid:
1197 page_results = list(self._pagefunc(pagenum))
1200 start % self._pagesize
1201 if firstid <= start < nextfirstid
1205 ((end - 1) % self._pagesize) + 1
1206 if (end is not None and firstid <= end <= nextfirstid)
1209 if startv != 0 or endv is not None:
1210 page_results = page_results[startv:endv]
1211 res.extend(page_results)
1213 # A little optimization - if current page is not "full", ie. does
1214 # not contain page_size videos then we can assume that this page
1215 # is the last one - there are no more ids on further pages -
1216 # i.e. no need to query again.
1217 if len(page_results) + startv < self._pagesize:
1220 # If we got the whole page, but the next page is not interesting,
1221 # break out early as well
1222 if end == nextfirstid:
1227 def uppercase_escape(s):
1229 r'\\U([0-9a-fA-F]{8})',
1230 lambda m: compat_chr(int(m.group(1), base=16)), s)
1233 struct.pack(u'!I', 0)
1235 # In Python 2.6 (and some 2.7 versions), struct requires a bytes argument
def struct_pack(spec, *args):
    """struct.pack that tolerates a unicode format string — needed because
    some interpreters (Python 2.6 era) require the format to be bytes."""
    fmt = spec.encode('ascii') if isinstance(spec, compat_str) else spec
    return struct.pack(fmt, *args)
def struct_unpack(spec, *args):
    """struct.unpack that tolerates a unicode format string — mirrors
    struct_pack for the same byte-string-only interpreters."""
    fmt = spec.encode('ascii') if isinstance(spec, compat_str) else spec
    return struct.unpack(fmt, *args)
1246 struct_pack = struct.pack
1247 struct_unpack = struct.unpack