youtube_dl/utils.py

   1 #!/usr/bin/env python
   2 # -*- coding: utf-8 -*-
   3
   4 import gzip
   5 import io
   6 import locale
   7 import os
   8 import re
   9 import sys
  10 import zlib
  11 import email.utils
  12 import json
  13
  14 try:
  15         import urllib.request as compat_urllib_request
  16 except ImportError: # Python 2
  17         import urllib2 as compat_urllib_request
  18
  19 try:
  20         import urllib.error as compat_urllib_error
  21 except ImportError: # Python 2
  22         import urllib2 as compat_urllib_error
  23
  24 try:
  25         import urllib.parse as compat_urllib_parse
  26 except ImportError: # Python 2
  27         import urllib as compat_urllib_parse
  28
  29 try:
  30         import http.cookiejar as compat_cookiejar
  31 except ImportError: # Python 2
  32         import cookielib as compat_cookiejar
  33
  34 try:
  35         import html.entities as compat_html_entities
  36 except NameError: # Python 2
  37         import htmlentitydefs as compat_html_entities
  38
  39 try:
  40         import html.parser as compat_html_parser
  41 except NameError: # Python 2
  42         import HTMLParser as compat_html_parser
  43
# Text type: use the name `unicode` when it exists (Python 2), else `str`.
try:
        compat_str = unicode # Python 2
except NameError:
        compat_str = str

# Code point -> character: use `unichr` when it exists (Python 2), else `chr`.
try:
        compat_chr = unichr # Python 2
except NameError:
        compat_chr = chr
  53
  54
# Default HTTP headers. YoutubeDLHandler.http_request() sets every one of
# them on each request it processes, replacing a header of the same name.
std_headers = {
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0',
        'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'en-us,en;q=0.5',
}
  62 def preferredencoding():
  63         """Get preferred encoding.
  64
  65         Returns the best encoding scheme for the system, based on
  66         locale.getpreferredencoding() and some further tweaks.
  67         """
  68         try:
  69                 pref = locale.getpreferredencoding()
  70                 u'TEST'.encode(pref)
  71         except:
  72                 pref = 'UTF-8'
  73
  74         return pref
  75
  76
  77 def htmlentity_transform(matchobj):
  78         """Transforms an HTML entity to a character.
  79
  80         This function receives a match object and is intended to be used with
  81         the re.sub() function.
  82         """
  83         entity = matchobj.group(1)
  84
  85         # Known non-numeric HTML entity
  86         if entity in compat_html_entities.name2codepoint:
  87                 return compat_chr(compat_html_entities.name2codepoint[entity])
  88
  89         mobj = re.match(u'(?u)#(x?\\d+)', entity)
  90         if mobj is not None:
  91                 numstr = mobj.group(1)
  92                 if numstr.startswith(u'x'):
  93                         base = 16
  94                         numstr = u'0%s' % numstr
  95                 else:
  96                         base = 10
  97                 return compat_chr(int(numstr, base))
  98
  99         # Unknown entity in name, return its literal representation
 100         return (u'&%s;' % entity)
 101
# Overwrite the `locatestarttagend` pattern on the HTML parser module with the
# regex below; the original author marks this as a backported bugfix.
compat_html_parser.locatestarttagend = re.compile(r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re.VERBOSE) # backport bugfix
 103 class IDParser(compat_html_parser.HTMLParser):
 104         """Modified HTMLParser that isolates a tag with the specified id"""
 105         def __init__(self, id):
 106                 self.id = id
 107                 self.result = None
 108                 self.started = False
 109                 self.depth = {}
 110                 self.html = None
 111                 self.watch_startpos = False
 112                 self.error_count = 0
 113                 compat_html_parser.HTMLParser.__init__(self)
 114
 115         def error(self, message):
 116                 if self.error_count > 10 or self.started:
 117                         raise compat_html_parser.HTMLParseError(message, self.getpos())
 118                 self.rawdata = '\n'.join(self.html.split('\n')[self.getpos()[0]:]) # skip one line
 119                 self.error_count += 1
 120                 self.goahead(1)
 121
 122         def loads(self, html):
 123                 self.html = html
 124                 self.feed(html)
 125                 self.close()
 126
 127         def handle_starttag(self, tag, attrs):
 128                 attrs = dict(attrs)
 129                 if self.started:
 130                         self.find_startpos(None)
 131                 if 'id' in attrs and attrs['id'] == self.id:
 132                         self.result = [tag]
 133                         self.started = True
 134                         self.watch_startpos = True
 135                 if self.started:
 136                         if not tag in self.depth: self.depth[tag] = 0
 137                         self.depth[tag] += 1
 138
 139         def handle_endtag(self, tag):
 140                 if self.started:
 141                         if tag in self.depth: self.depth[tag] -= 1
 142                         if self.depth[self.result[0]] == 0:
 143                                 self.started = False
 144                                 self.result.append(self.getpos())
 145
 146         def find_startpos(self, x):
 147                 """Needed to put the start position of the result (self.result[1])
 148                 after the opening tag with the requested id"""
 149                 if self.watch_startpos:
 150                         self.watch_startpos = False
 151                         self.result.append(self.getpos())
 152         handle_entityref = handle_charref = handle_data = handle_comment = \
 153         handle_decl = handle_pi = unknown_decl = find_startpos
 154
 155         def get_result(self):
 156                 if self.result is None:
 157                         return None
 158                 if len(self.result) != 3:
 159                         return None
 160                 lines = self.html.split('\n')
 161                 lines = lines[self.result[1][0]-1:self.result[2][0]]
 162                 lines[0] = lines[0][self.result[1][1]:]
 163                 if len(lines) == 1:
 164                         lines[-1] = lines[-1][:self.result[2][1]-self.result[1][1]]
 165                 lines[-1] = lines[-1][:self.result[2][1]]
 166                 return '\n'.join(lines).strip()
 167
 168 def get_element_by_id(id, html):
 169         """Return the content of the tag with the specified id in the passed HTML document"""
 170         parser = IDParser(id)
 171         try:
 172                 parser.loads(html)
 173         except compat_html_parser.HTMLParseError:
 174                 pass
 175         return parser.get_result()
 176
 177
 178 def clean_html(html):
 179         """Clean an HTML snippet into a readable string"""
 180         # Newline vs <br />
 181         html = html.replace('\n', ' ')
 182         html = re.sub('\s*<\s*br\s*/?\s*>\s*', '\n', html)
 183         # Strip html tags
 184         html = re.sub('<.*?>', '', html)
 185         # Replace html entities
 186         html = unescapeHTML(html)
 187         return html
 188
 189
 190 def sanitize_open(filename, open_mode):
 191         """Try to open the given filename, and slightly tweak it if this fails.
 192
 193         Attempts to open the given filename. If this fails, it tries to change
 194         the filename slightly, step by step, until it's either able to open it
 195         or it fails and raises a final exception, like the standard open()
 196         function.
 197
 198         It returns the tuple (stream, definitive_file_name).
 199         """
 200         try:
 201                 if filename == u'-':
 202                         if sys.platform == 'win32':
 203                                 import msvcrt
 204                                 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
 205                         return (sys.stdout, filename)
 206                 stream = open(encodeFilename(filename), open_mode)
 207                 return (stream, filename)
 208         except (IOError, OSError) as err:
 209                 # In case of error, try to remove win32 forbidden chars
 210                 filename = re.sub(u'[/<>:"\\|\\\\?\\*]', u'#', filename)
 211
 212                 # An exception here should be caught in the caller
 213                 stream = open(encodeFilename(filename), open_mode)
 214                 return (stream, filename)
 215
 216
 217 def timeconvert(timestr):
 218         """Convert RFC 2822 defined time string into system timestamp"""
 219         timestamp = None
 220         timetuple = email.utils.parsedate_tz(timestr)
 221         if timetuple is not None:
 222                 timestamp = email.utils.mktime_tz(timetuple)
 223         return timestamp
 224
 225 def sanitize_filename(s, restricted=False):
 226         """Sanitizes a string so it could be used as part of a filename.
 227         If restricted is set, use a stricter subset of allowed characters.
 228         """
 229         def replace_insane(char):
 230                 if char == '?' or ord(char) < 32 or ord(char) == 127:
 231                         return ''
 232                 elif char == '"':
 233                         return '' if restricted else '\''
 234                 elif char == ':':
 235                         return '_-' if restricted else ' -'
 236                 elif char in '\\/|*<>':
 237                         return '_'
 238                 if restricted and (char in '!&\'' or char.isspace()):
 239                         return '_'
 240                 if restricted and ord(char) > 127:
 241                         return '_'
 242                 return char
 243
 244         result = u''.join(map(replace_insane, s))
 245         while '__' in result:
 246                 result = result.replace('__', '_')
 247         result = result.strip('_')
 248         # Common case of "Foreign band name - English song title"
 249         if restricted and result.startswith('-_'):
 250                 result = result[2:]
 251         if not result:
 252                 result = '_'
 253         return result
 254
 255 def orderedSet(iterable):
 256         """ Remove all duplicates from the input iterable """
 257         res = []
 258         for el in iterable:
 259                 if el not in res:
 260                         res.append(el)
 261         return res
 262
 263 def unescapeHTML(s):
 264         """
 265         @param s a string
 266         """
 267         assert type(s) == type(u'')
 268
 269         result = re.sub(u'(?u)&(.+?);', htmlentity_transform, s)
 270         return result
 271
 272 def encodeFilename(s):
 273         """
 274         @param s The name of the file
 275         """
 276
 277         assert type(s) == type(u'')
 278
 279         if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
 280                 # Pass u'' directly to use Unicode APIs on Windows 2000 and up
 281                 # (Detecting Windows NT 4 is tricky because 'major >= 4' would
 282                 # match Windows 9x series as well. Besides, NT 4 is obsolete.)
 283                 return s
 284         else:
 285                 return s.encode(sys.getfilesystemencoding(), 'ignore')
 286
 287 class DownloadError(Exception):
 288         """Download Error exception.
 289
 290         This exception may be thrown by FileDownloader objects if they are not
 291         configured to continue on errors. They will contain the appropriate
 292         error message.
 293         """
 294         pass
 295
 296
 297 class SameFileError(Exception):
 298         """Same File exception.
 299
 300         This exception will be thrown by FileDownloader objects if they detect
 301         multiple files would have to be downloaded to the same file on disk.
 302         """
 303         pass
 304
 305
 306 class PostProcessingError(Exception):
 307         """Post Processing exception.
 308
 309         This exception may be raised by PostProcessor's .run() method to
 310         indicate an error in the postprocessing task.
 311         """
 312         pass
 313
 314 class MaxDownloadsReached(Exception):
 315         """ --max-downloads limit has been reached. """
 316         pass
 317
 318
 319 class UnavailableVideoError(Exception):
 320         """Unavailable Format exception.
 321
 322         This exception will be thrown when a video is requested
 323         in a format that is not available for that video.
 324         """
 325         pass
 326
 327
 328 class ContentTooShortError(Exception):
 329         """Content Too Short exception.
 330
 331         This exception may be raised by FileDownloader objects when a file they
 332         download is too small for what the server announced first, indicating
 333         the connection was probably interrupted.
 334         """
 335         # Both in bytes
 336         downloaded = None
 337         expected = None
 338
 339         def __init__(self, downloaded, expected):
 340                 self.downloaded = downloaded
 341                 self.expected = expected
 342
 343
class Trouble(Exception):
        """Trouble helper exception.

        This is an exception meant to be handled with
        FileDownloader.trouble.
        """
 350
 351 class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
 352         """Handler for HTTP requests and responses.
 353
 354         This class, when installed with an OpenerDirector, automatically adds
 355         the standard headers to every HTTP request and handles gzipped and
 356         deflated responses from web servers. If compression is to be avoided in
 357         a particular request, the original request in the program code only has
 358         to include the HTTP header "Youtubedl-No-Compression", which will be
 359         removed before making the real request.
 360
 361         Part of this code was copied from:
 362
 363         http://techknack.net/python-urllib2-handlers/
 364
 365         Andrew Rowls, the author of that code, agreed to release it to the
 366         public domain.
 367         """
 368
 369         @staticmethod
 370         def deflate(data):
 371                 try:
 372                         return zlib.decompress(data, -zlib.MAX_WBITS)
 373                 except zlib.error:
 374                         return zlib.decompress(data)
 375
 376         @staticmethod
 377         def addinfourl_wrapper(stream, headers, url, code):
 378                 if hasattr(compat_urllib_request.addinfourl, 'getcode'):
 379                         return compat_urllib_request.addinfourl(stream, headers, url, code)
 380                 ret = compat_urllib_request.addinfourl(stream, headers, url)
 381                 ret.code = code
 382                 return ret
 383
 384         def http_request(self, req):
 385                 for h in std_headers:
 386                         if h in req.headers:
 387                                 del req.headers[h]
 388                         req.add_header(h, std_headers[h])
 389                 if 'Youtubedl-no-compression' in req.headers:
 390                         if 'Accept-encoding' in req.headers:
 391                                 del req.headers['Accept-encoding']
 392                         del req.headers['Youtubedl-no-compression']
 393                 return req
 394
 395         def http_response(self, req, resp):
 396                 old_resp = resp
 397                 # gzip
 398                 if resp.headers.get('Content-encoding', '') == 'gzip':
 399                         gz = gzip.GzipFile(fileobj=io.BytesIO(resp.read()), mode='r')
 400                         resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
 401                         resp.msg = old_resp.msg
 402                 # deflate
 403                 if resp.headers.get('Content-encoding', '') == 'deflate':
 404                         gz = io.BytesIO(self.deflate(resp.read()))
 405                         resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
 406                         resp.msg = old_resp.msg
 407                 return resp