youtube_dl/utils.py

   1 #!/usr/bin/env python
   2 # -*- coding: utf-8 -*-
   3
   4 import gzip
   5 import io
   6 import json
   7 import locale
   8 import os
   9 import re
  10 import sys
  11 import traceback
  12 import zlib
  13 import email.utils
  14 import json
  15 import datetime
  16
  17 try:
  18     import urllib.request as compat_urllib_request
  19 except ImportError: # Python 2
  20     import urllib2 as compat_urllib_request
  21
  22 try:
  23     import urllib.error as compat_urllib_error
  24 except ImportError: # Python 2
  25     import urllib2 as compat_urllib_error
  26
  27 try:
  28     import urllib.parse as compat_urllib_parse
  29 except ImportError: # Python 2
  30     import urllib as compat_urllib_parse
  31
  32 try:
  33     from urllib.parse import urlparse as compat_urllib_parse_urlparse
  34 except ImportError: # Python 2
  35     from urlparse import urlparse as compat_urllib_parse_urlparse
  36
  37 try:
  38     import http.cookiejar as compat_cookiejar
  39 except ImportError: # Python 2
  40     import cookielib as compat_cookiejar
  41
  42 try:
  43     import html.entities as compat_html_entities
  44 except ImportError: # Python 2
  45     import htmlentitydefs as compat_html_entities
  46
  47 try:
  48     import html.parser as compat_html_parser
  49 except ImportError: # Python 2
  50     import HTMLParser as compat_html_parser
  51
  52 try:
  53     import http.client as compat_http_client
  54 except ImportError: # Python 2
  55     import httplib as compat_http_client
  56
  57 try:
  58     from subprocess import DEVNULL
  59     compat_subprocess_get_DEVNULL = lambda: DEVNULL
  60 except ImportError:
  61     compat_subprocess_get_DEVNULL = lambda: open(os.path.devnull, 'w')
  62
  63 try:
  64     from urllib.parse import parse_qs as compat_parse_qs
  65 except ImportError: # Python 2
  66     # HACK: The following is the correct parse_qs implementation from cpython 3's stdlib.
  67     # Python 2's version is apparently totally broken
  68     def _unquote(string, encoding='utf-8', errors='replace'):
  69         if string == '':
  70             return string
  71         res = string.split('%')
  72         if len(res) == 1:
  73             return string
  74         if encoding is None:
  75             encoding = 'utf-8'
  76         if errors is None:
  77             errors = 'replace'
  78         # pct_sequence: contiguous sequence of percent-encoded bytes, decoded
  79         pct_sequence = b''
  80         string = res[0]
  81         for item in res[1:]:
  82             try:
  83                 if not item:
  84                     raise ValueError
  85                 pct_sequence += item[:2].decode('hex')
  86                 rest = item[2:]
  87                 if not rest:
  88                     # This segment was just a single percent-encoded character.
  89                     # May be part of a sequence of code units, so delay decoding.
  90                     # (Stored in pct_sequence).
  91                     continue
  92             except ValueError:
  93                 rest = '%' + item
  94             # Encountered non-percent-encoded characters. Flush the current
  95             # pct_sequence.
  96             string += pct_sequence.decode(encoding, errors) + rest
  97             pct_sequence = b''
  98         if pct_sequence:
  99             # Flush the final pct_sequence
 100             string += pct_sequence.decode(encoding, errors)
 101         return string
 102
 103     def _parse_qsl(qs, keep_blank_values=False, strict_parsing=False,
 104                 encoding='utf-8', errors='replace'):
 105         qs, _coerce_result = qs, unicode
 106         pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')]
 107         r = []
 108         for name_value in pairs:
 109             if not name_value and not strict_parsing:
 110                 continue
 111             nv = name_value.split('=', 1)
 112             if len(nv) != 2:
 113                 if strict_parsing:
 114                     raise ValueError("bad query field: %r" % (name_value,))
 115                 # Handle case of a control-name with no equal sign
 116                 if keep_blank_values:
 117                     nv.append('')
 118                 else:
 119                     continue
 120             if len(nv[1]) or keep_blank_values:
 121                 name = nv[0].replace('+', ' ')
 122                 name = _unquote(name, encoding=encoding, errors=errors)
 123                 name = _coerce_result(name)
 124                 value = nv[1].replace('+', ' ')
 125                 value = _unquote(value, encoding=encoding, errors=errors)
 126                 value = _coerce_result(value)
 127                 r.append((name, value))
 128         return r
 129
 130     def compat_parse_qs(qs, keep_blank_values=False, strict_parsing=False,
 131                 encoding='utf-8', errors='replace'):
 132         parsed_result = {}
 133         pairs = _parse_qsl(qs, keep_blank_values, strict_parsing,
 134                         encoding=encoding, errors=errors)
 135         for name, value in pairs:
 136             if name in parsed_result:
 137                 parsed_result[name].append(value)
 138             else:
 139                 parsed_result[name] = [value]
 140         return parsed_result
 141
 142 try:
 143     compat_str = unicode # Python 2
 144 except NameError:
 145     compat_str = str
 146
 147 try:
 148     compat_chr = unichr # Python 2
 149 except NameError:
 150     compat_chr = chr
 151
 152 std_headers = {
 153     'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0',
 154     'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
 155     'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
 156     'Accept-Encoding': 'gzip, deflate',
 157     'Accept-Language': 'en-us,en;q=0.5',
 158 }
 159
 160 def preferredencoding():
 161     """Get preferred encoding.
 162
 163     Returns the best encoding scheme for the system, based on
 164     locale.getpreferredencoding() and some further tweaks.
 165     """
 166     try:
 167         pref = locale.getpreferredencoding()
 168         u'TEST'.encode(pref)
 169     except:
 170         pref = 'UTF-8'
 171
 172     return pref
 173
 174 if sys.version_info < (3,0):
 175     def compat_print(s):
 176         print(s.encode(preferredencoding(), 'xmlcharrefreplace'))
 177 else:
 178     def compat_print(s):
 179         assert type(s) == type(u'')
 180         print(s)
 181
 182 # In Python 2.x, json.dump expects a bytestream.
 183 # In Python 3.x, it writes to a character stream
 184 if sys.version_info < (3,0):
 185     def write_json_file(obj, fn):
 186         with open(fn, 'wb') as f:
 187             json.dump(obj, f)
 188 else:
 189     def write_json_file(obj, fn):
 190         with open(fn, 'w', encoding='utf-8') as f:
 191             json.dump(obj, f)
 192
 193 def htmlentity_transform(matchobj):
 194     """Transforms an HTML entity to a character.
 195
 196     This function receives a match object and is intended to be used with
 197     the re.sub() function.
 198     """
 199     entity = matchobj.group(1)
 200
 201     # Known non-numeric HTML entity
 202     if entity in compat_html_entities.name2codepoint:
 203         return compat_chr(compat_html_entities.name2codepoint[entity])
 204
 205     mobj = re.match(u'(?u)#(x?\\d+)', entity)
 206     if mobj is not None:
 207         numstr = mobj.group(1)
 208         if numstr.startswith(u'x'):
 209             base = 16
 210             numstr = u'0%s' % numstr
 211         else:
 212             base = 10
 213         return compat_chr(int(numstr, base))
 214
 215     # Unknown entity in name, return its literal representation
 216     return (u'&%s;' % entity)
 217
 218 compat_html_parser.locatestarttagend = re.compile(r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re.VERBOSE) # backport bugfix
 219 class AttrParser(compat_html_parser.HTMLParser):
 220     """Modified HTMLParser that isolates a tag with the specified attribute"""
 221     def __init__(self, attribute, value):
 222         self.attribute = attribute
 223         self.value = value
 224         self.result = None
 225         self.started = False
 226         self.depth = {}
 227         self.html = None
 228         self.watch_startpos = False
 229         self.error_count = 0
 230         compat_html_parser.HTMLParser.__init__(self)
 231
 232     def error(self, message):
 233         if self.error_count > 10 or self.started:
 234             raise compat_html_parser.HTMLParseError(message, self.getpos())
 235         self.rawdata = '\n'.join(self.html.split('\n')[self.getpos()[0]:]) # skip one line
 236         self.error_count += 1
 237         self.goahead(1)
 238
 239     def loads(self, html):
 240         self.html = html
 241         self.feed(html)
 242         self.close()
 243
 244     def handle_starttag(self, tag, attrs):
 245         attrs = dict(attrs)
 246         if self.started:
 247             self.find_startpos(None)
 248         if self.attribute in attrs and attrs[self.attribute] == self.value:
 249             self.result = [tag]
 250             self.started = True
 251             self.watch_startpos = True
 252         if self.started:
 253             if not tag in self.depth: self.depth[tag] = 0
 254             self.depth[tag] += 1
 255
 256     def handle_endtag(self, tag):
 257         if self.started:
 258             if tag in self.depth: self.depth[tag] -= 1
 259             if self.depth[self.result[0]] == 0:
 260                 self.started = False
 261                 self.result.append(self.getpos())
 262
 263     def find_startpos(self, x):
 264         """Needed to put the start position of the result (self.result[1])
 265         after the opening tag with the requested id"""
 266         if self.watch_startpos:
 267             self.watch_startpos = False
 268             self.result.append(self.getpos())
 269     handle_entityref = handle_charref = handle_data = handle_comment = \
 270     handle_decl = handle_pi = unknown_decl = find_startpos
 271
 272     def get_result(self):
 273         if self.result is None:
 274             return None
 275         if len(self.result) != 3:
 276             return None
 277         lines = self.html.split('\n')
 278         lines = lines[self.result[1][0]-1:self.result[2][0]]
 279         lines[0] = lines[0][self.result[1][1]:]
 280         if len(lines) == 1:
 281             lines[-1] = lines[-1][:self.result[2][1]-self.result[1][1]]
 282         lines[-1] = lines[-1][:self.result[2][1]]
 283         return '\n'.join(lines).strip()
 284 # Hack for https://github.com/rg3/youtube-dl/issues/662
 285 if sys.version_info < (2, 7, 3):
 286     AttrParser.parse_endtag = (lambda self, i:
 287         i + len("</scr'+'ipt>")
 288         if self.rawdata[i:].startswith("</scr'+'ipt>")
 289         else compat_html_parser.HTMLParser.parse_endtag(self, i))
 290
 291 def get_element_by_id(id, html):
 292     """Return the content of the tag with the specified ID in the passed HTML document"""
 293     return get_element_by_attribute("id", id, html)
 294
 295 def get_element_by_attribute(attribute, value, html):
 296     """Return the content of the tag with the specified attribute in the passed HTML document"""
 297     parser = AttrParser(attribute, value)
 298     try:
 299         parser.loads(html)
 300     except compat_html_parser.HTMLParseError:
 301         pass
 302     return parser.get_result()
 303
 304
 305 def clean_html(html):
 306     """Clean an HTML snippet into a readable string"""
 307     # Newline vs <br />
 308     html = html.replace('\n', ' ')
 309     html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
 310     html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
 311     # Strip html tags
 312     html = re.sub('<.*?>', '', html)
 313     # Replace html entities
 314     html = unescapeHTML(html)
 315     return html.strip()
 316
 317
 318 def sanitize_open(filename, open_mode):
 319     """Try to open the given filename, and slightly tweak it if this fails.
 320
 321     Attempts to open the given filename. If this fails, it tries to change
 322     the filename slightly, step by step, until it's either able to open it
 323     or it fails and raises a final exception, like the standard open()
 324     function.
 325
 326     It returns the tuple (stream, definitive_file_name).
 327     """
 328     try:
 329         if filename == u'-':
 330             if sys.platform == 'win32':
 331                 import msvcrt
 332                 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
 333             return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
 334         stream = open(encodeFilename(filename), open_mode)
 335         return (stream, filename)
 336     except (IOError, OSError) as err:
 337         # In case of error, try to remove win32 forbidden chars
 338         filename = re.sub(u'[/<>:"\\|\\\\?\\*]', u'#', filename)
 339
 340         # An exception here should be caught in the caller
 341         stream = open(encodeFilename(filename), open_mode)
 342         return (stream, filename)
 343
 344
 345 def timeconvert(timestr):
 346     """Convert RFC 2822 defined time string into system timestamp"""
 347     timestamp = None
 348     timetuple = email.utils.parsedate_tz(timestr)
 349     if timetuple is not None:
 350         timestamp = email.utils.mktime_tz(timetuple)
 351     return timestamp
 352
 353 def sanitize_filename(s, restricted=False, is_id=False):
 354     """Sanitizes a string so it could be used as part of a filename.
 355     If restricted is set, use a stricter subset of allowed characters.
 356     Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
 357     """
 358     def replace_insane(char):
 359         if char == '?' or ord(char) < 32 or ord(char) == 127:
 360             return ''
 361         elif char == '"':
 362             return '' if restricted else '\''
 363         elif char == ':':
 364             return '_-' if restricted else ' -'
 365         elif char in '\\/|*<>':
 366             return '_'
 367         if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
 368             return '_'
 369         if restricted and ord(char) > 127:
 370             return '_'
 371         return char
 372
 373     result = u''.join(map(replace_insane, s))
 374     if not is_id:
 375         while '__' in result:
 376             result = result.replace('__', '_')
 377         result = result.strip('_')
 378         # Common case of "Foreign band name - English song title"
 379         if restricted and result.startswith('-_'):
 380             result = result[2:]
 381         if not result:
 382             result = '_'
 383     return result
 384
 385 def orderedSet(iterable):
 386     """ Remove all duplicates from the input iterable """
 387     res = []
 388     for el in iterable:
 389         if el not in res:
 390             res.append(el)
 391     return res
 392
 393 def unescapeHTML(s):
 394     """
 395     @param s a string
 396     """
 397     assert type(s) == type(u'')
 398
 399     result = re.sub(u'(?u)&(.+?);', htmlentity_transform, s)
 400     return result
 401
 402 def encodeFilename(s):
 403     """
 404     @param s The name of the file
 405     """
 406
 407     assert type(s) == type(u'')
 408
 409     # Python 3 has a Unicode API
 410     if sys.version_info >= (3, 0):
 411         return s
 412
 413     if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
 414         # Pass u'' directly to use Unicode APIs on Windows 2000 and up
 415         # (Detecting Windows NT 4 is tricky because 'major >= 4' would
 416         # match Windows 9x series as well. Besides, NT 4 is obsolete.)
 417         return s
 418     else:
 419         encoding = sys.getfilesystemencoding()
 420         if encoding is None:
 421             encoding = 'utf-8'
 422         return s.encode(encoding, 'ignore')
 423
 424 def decodeOption(optval):
 425     if optval is None:
 426         return optval
 427     if isinstance(optval, bytes):
 428         optval = optval.decode(preferredencoding())
 429
 430     assert isinstance(optval, compat_str)
 431     return optval
 432
 433 def formatSeconds(secs):
 434     if secs > 3600:
 435         return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
 436     elif secs > 60:
 437         return '%d:%02d' % (secs // 60, secs % 60)
 438     else:
 439         return '%d' % secs
 440
 441 def make_HTTPS_handler(opts):
 442     if sys.version_info < (3,2):
 443         # Python's 2.x handler is very simplistic
 444         return compat_urllib_request.HTTPSHandler()
 445     else:
 446         import ssl
 447         context = ssl.SSLContext(ssl.PROTOCOL_SSLv23)
 448         context.set_default_verify_paths()
 449
 450         context.verify_mode = (ssl.CERT_NONE
 451                                if opts.no_check_certificate
 452                                else ssl.CERT_REQUIRED)
 453         return compat_urllib_request.HTTPSHandler(context=context)
 454
 455 class ExtractorError(Exception):
 456     """Error during info extraction."""
 457     def __init__(self, msg, tb=None):
 458         """ tb, if given, is the original traceback (so that it can be printed out). """
 459         super(ExtractorError, self).__init__(msg)
 460         self.traceback = tb
 461         self.exc_info = sys.exc_info()  # preserve original exception
 462
 463     def format_traceback(self):
 464         if self.traceback is None:
 465             return None
 466         return u''.join(traceback.format_tb(self.traceback))
 467
 468
 469 class DownloadError(Exception):
 470     """Download Error exception.
 471
 472     This exception may be thrown by FileDownloader objects if they are not
 473     configured to continue on errors. They will contain the appropriate
 474     error message.
 475     """
 476     def __init__(self, msg, exc_info=None):
 477         """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
 478         super(DownloadError, self).__init__(msg)
 479         self.exc_info = exc_info
 480
 481
 482 class SameFileError(Exception):
 483     """Same File exception.
 484
 485     This exception will be thrown by FileDownloader objects if they detect
 486     multiple files would have to be downloaded to the same file on disk.
 487     """
 488     pass
 489
 490
 491 class PostProcessingError(Exception):
 492     """Post Processing exception.
 493
 494     This exception may be raised by PostProcessor's .run() method to
 495     indicate an error in the postprocessing task.
 496     """
 497     def __init__(self, msg):
 498         self.msg = msg
 499
 500 class MaxDownloadsReached(Exception):
 501     """ --max-downloads limit has been reached. """
 502     pass
 503
 504
 505 class UnavailableVideoError(Exception):
 506     """Unavailable Format exception.
 507
 508     This exception will be thrown when a video is requested
 509     in a format that is not available for that video.
 510     """
 511     pass
 512
 513
 514 class ContentTooShortError(Exception):
 515     """Content Too Short exception.
 516
 517     This exception may be raised by FileDownloader objects when a file they
 518     download is too small for what the server announced first, indicating
 519     the connection was probably interrupted.
 520     """
 521     # Both in bytes
 522     downloaded = None
 523     expected = None
 524
 525     def __init__(self, downloaded, expected):
 526         self.downloaded = downloaded
 527         self.expected = expected
 528
 529 class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
 530     """Handler for HTTP requests and responses.
 531
 532     This class, when installed with an OpenerDirector, automatically adds
 533     the standard headers to every HTTP request and handles gzipped and
 534     deflated responses from web servers. If compression is to be avoided in
 535     a particular request, the original request in the program code only has
 536     to include the HTTP header "Youtubedl-No-Compression", which will be
 537     removed before making the real request.
 538
 539     Part of this code was copied from:
 540
 541     http://techknack.net/python-urllib2-handlers/
 542
 543     Andrew Rowls, the author of that code, agreed to release it to the
 544     public domain.
 545     """
 546
 547     @staticmethod
 548     def deflate(data):
 549         try:
 550             return zlib.decompress(data, -zlib.MAX_WBITS)
 551         except zlib.error:
 552             return zlib.decompress(data)
 553
 554     @staticmethod
 555     def addinfourl_wrapper(stream, headers, url, code):
 556         if hasattr(compat_urllib_request.addinfourl, 'getcode'):
 557             return compat_urllib_request.addinfourl(stream, headers, url, code)
 558         ret = compat_urllib_request.addinfourl(stream, headers, url)
 559         ret.code = code
 560         return ret
 561
 562     def http_request(self, req):
 563         for h,v in std_headers.items():
 564             if h in req.headers:
 565                 del req.headers[h]
 566             req.add_header(h, v)
 567         if 'Youtubedl-no-compression' in req.headers:
 568             if 'Accept-encoding' in req.headers:
 569                 del req.headers['Accept-encoding']
 570             del req.headers['Youtubedl-no-compression']
 571         if 'Youtubedl-user-agent' in req.headers:
 572             if 'User-agent' in req.headers:
 573                 del req.headers['User-agent']
 574             req.headers['User-agent'] = req.headers['Youtubedl-user-agent']
 575             del req.headers['Youtubedl-user-agent']
 576         return req
 577
 578     def http_response(self, req, resp):
 579         old_resp = resp
 580         # gzip
 581         if resp.headers.get('Content-encoding', '') == 'gzip':
 582             gz = gzip.GzipFile(fileobj=io.BytesIO(resp.read()), mode='r')
 583             resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
 584             resp.msg = old_resp.msg
 585         # deflate
 586         if resp.headers.get('Content-encoding', '') == 'deflate':
 587             gz = io.BytesIO(self.deflate(resp.read()))
 588             resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
 589             resp.msg = old_resp.msg
 590         return resp
 591
 592     https_request = http_request
 593     https_response = http_response
 594
 595 def unified_strdate(date_str):
 596     """Return a string with the date in the format YYYYMMDD"""
 597     upload_date = None
 598     #Replace commas
 599     date_str = date_str.replace(',',' ')
 600     # %z (UTC offset) is only supported in python>=3.2
 601     date_str = re.sub(r' (\+|-)[\d]*$', '', date_str)
 602     format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y', '%Y-%m-%d', '%d/%m/%Y', '%Y/%m/%d %H:%M:%S']
 603     for expression in format_expressions:
 604         try:
 605             upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
 606         except:
 607             pass
 608     return upload_date
 609
 610 def date_from_str(date_str):
 611     """
 612     Return a datetime object from a string in the format YYYYMMDD or
 613     (now|today)[+-][0-9](day|week|month|year)(s)?"""
 614     today = datetime.date.today()
 615     if date_str == 'now'or date_str == 'today':
 616         return today
 617     match = re.match('(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
 618     if match is not None:
 619         sign = match.group('sign')
 620         time = int(match.group('time'))
 621         if sign == '-':
 622             time = -time
 623         unit = match.group('unit')
 624         #A bad aproximation?
 625         if unit == 'month':
 626             unit = 'day'
 627             time *= 30
 628         elif unit == 'year':
 629             unit = 'day'
 630             time *= 365
 631         unit += 's'
 632         delta = datetime.timedelta(**{unit: time})
 633         return today + delta
 634     return datetime.datetime.strptime(date_str, "%Y%m%d").date()
 635
 636 class DateRange(object):
 637     """Represents a time interval between two dates"""
 638     def __init__(self, start=None, end=None):
 639         """start and end must be strings in the format accepted by date"""
 640         if start is not None:
 641             self.start = date_from_str(start)
 642         else:
 643             self.start = datetime.datetime.min.date()
 644         if end is not None:
 645             self.end = date_from_str(end)
 646         else:
 647             self.end = datetime.datetime.max.date()
 648         if self.start > self.end:
 649             raise ValueError('Date range: "%s" , the start date must be before the end date' % self)
 650     @classmethod
 651     def day(cls, day):
 652         """Returns a range that only contains the given day"""
 653         return cls(day,day)
 654     def __contains__(self, date):
 655         """Check if the date is in the range"""
 656         if not isinstance(date, datetime.date):
 657             date = date_from_str(date)
 658         return self.start <= date <= self.end
 659     def __str__(self):
 660         return '%s - %s' % ( self.start.isoformat(), self.end.isoformat())