youtube_dl/utils.py

   1 #!/usr/bin/env python
   2 # -*- coding: utf-8 -*-
   3
   4 import gzip
   5 import htmlentitydefs
   6 import HTMLParser
   7 import locale
   8 import os
   9 import re
  10 import sys
  11 import zlib
  12 import urllib2
  13 import email.utils
  14 import json
  15
  16 try:
  17         import cStringIO as StringIO
  18 except ImportError:
  19         import StringIO
  20
# Default HTTP headers. YoutubeDLHandler.http_request() copies every entry of
# this dict onto each outgoing request, replacing a same-named header if present.
std_headers = {
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0',
        'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        # Compressed bodies are unpacked again in YoutubeDLHandler.http_response()
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'en-us,en;q=0.5',
}
  28
  29 def preferredencoding():
  30         """Get preferred encoding.
  31
  32         Returns the best encoding scheme for the system, based on
  33         locale.getpreferredencoding() and some further tweaks.
  34         """
  35         def yield_preferredencoding():
  36                 try:
  37                         pref = locale.getpreferredencoding()
  38                         u'TEST'.encode(pref)
  39                 except:
  40                         pref = 'UTF-8'
  41                 while True:
  42                         yield pref
  43         return yield_preferredencoding().next()
  44
  45
  46 def htmlentity_transform(matchobj):
  47         """Transforms an HTML entity to a Unicode character.
  48
  49         This function receives a match object and is intended to be used with
  50         the re.sub() function.
  51         """
  52         entity = matchobj.group(1)
  53
  54         # Known non-numeric HTML entity
  55         if entity in htmlentitydefs.name2codepoint:
  56                 return unichr(htmlentitydefs.name2codepoint[entity])
  57
  58         # Unicode character
  59         mobj = re.match(ur'(?u)#(x?\d+)', entity)
  60         if mobj is not None:
  61                 numstr = mobj.group(1)
  62                 if numstr.startswith(u'x'):
  63                         base = 16
  64                         numstr = u'0%s' % numstr
  65                 else:
  66                         base = 10
  67                 return unichr(long(numstr, base))
  68
  69         # Unknown entity in name, return its literal representation
  70         return (u'&%s;' % entity)
  71
# Replace the module-level start-tag pattern of the standard HTMLParser module
# with this one. The change is global: it applies to every user of HTMLParser
# in the process, not only to IDParser below.
HTMLParser.locatestarttagend = re.compile(r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re.VERBOSE) # backport bugfix
class IDParser(HTMLParser.HTMLParser):
        """Modified HTMLParser that isolates a tag with the specified id

        Usage: create the parser with the wanted id, pass the document to
        loads() and read the outcome with get_result().

        While parsing, self.result grows from None to
        [tag, start position, end position]; both positions are getpos()
        values, which get_result() treats as (1-based line, column).
        """
        def __init__(self, id):
                self.id = id
                # None until the element is found, then [tag, startpos, endpos]
                self.result = None
                # True while the parser is inside the requested element
                self.started = False
                # Number of currently open tags per tag name, counted from the
                # requested element onwards
                self.depth = {}
                # The complete document as handed to loads()
                self.html = None
                # True between the matching start tag and the next parser event
                self.watch_startpos = False
                # Number of parse errors recovered from so far
                self.error_count = 0
                HTMLParser.HTMLParser.__init__(self)

        def error(self, message):
                """Try to recover from a parse error by skipping the offending line.

                Overrides the error hook of HTMLParser (presumably called by the
                base class on markup it cannot parse). Recovery is given up, and
                HTMLParseError raised, once more than 10 errors were recovered
                from or when the error happens inside the requested element.
                """
                if self.error_count > 10 or self.started:
                        raise HTMLParser.HTMLParseError(message, self.getpos())
                # Restart on the remainder of the document: getpos()[0] is used as
                # an index into the list of lines, so everything up to that index
                # is dropped from the data still to be parsed.
                self.rawdata = '\n'.join(self.html.split('\n')[self.getpos()[0]:]) # skip one line
                self.error_count += 1
                self.goahead(1)

        def loads(self, html):
                """Parse the complete document html in one go."""
                # The text is kept because error() and get_result() slice it later
                self.html = html
                self.feed(html)
                self.close()

        def handle_starttag(self, tag, attrs):
                """Detect the start tag carrying the requested id and track nesting."""
                attrs = dict(attrs)
                if self.started:
                        # A nested tag is also the first event after the opening tag
                        self.find_startpos(None)
                if 'id' in attrs and attrs['id'] == self.id:
                        self.result = [tag]
                        self.started = True
                        self.watch_startpos = True
                if self.started:
                        if not tag in self.depth: self.depth[tag] = 0
                        self.depth[tag] += 1

        def handle_endtag(self, tag):
                """Track nesting and record the end position of the requested element."""
                if self.started:
                        if tag in self.depth: self.depth[tag] -= 1
                        # The element is complete as soon as every tag with the same
                        # name as the requested element has been closed again
                        if self.depth[self.result[0]] == 0:
                                self.started = False
                                self.result.append(self.getpos())

        def find_startpos(self, x):
                """Needed to put the start position of the result (self.result[1])
                after the opening tag with the requested id"""
                if self.watch_startpos:
                        self.watch_startpos = False
                        self.result.append(self.getpos())
        # Every other parser event can be the first one after the opening tag,
        # so all of these handlers are aliases of find_startpos
        handle_entityref = handle_charref = handle_data = handle_comment = \
        handle_decl = handle_pi = unknown_decl = find_startpos

        def get_result(self):
                """Return the text between the recorded start and end positions.

                Returns None unless tag, start position and end position were all
                recorded. The returned text is stripped of surrounding whitespace.
                """
                if self.result == None: return None
                if len(self.result) != 3: return None
                lines = self.html.split('\n')
                # Line numbers are 1-based, list indices are 0-based
                lines = lines[self.result[1][0]-1:self.result[2][0]]
                lines[0] = lines[0][self.result[1][1]:]
                if len(lines) == 1:
                        # Start and end are on the same line: that line has already
                        # been cut at the start column, so shift the end column
                        lines[-1] = lines[-1][:self.result[2][1]-self.result[1][1]]
                lines[-1] = lines[-1][:self.result[2][1]]
                return '\n'.join(lines).strip()
 135
 136 def get_element_by_id(id, html):
 137         """Return the content of the tag with the specified id in the passed HTML document"""
 138         parser = IDParser(id)
 139         try:
 140                 parser.loads(html)
 141         except HTMLParser.HTMLParseError:
 142                 pass
 143         return parser.get_result()
 144
 145
 146 def clean_html(html):
 147         """Clean an HTML snippet into a readable string"""
 148         # Newline vs <br />
 149         html = html.replace('\n', ' ')
 150         html = re.sub('\s*<\s*br\s*/?\s*>\s*', '\n', html)
 151         # Strip html tags
 152         html = re.sub('<.*?>', '', html)
 153         # Replace html entities
 154         html = unescapeHTML(html)
 155         return html
 156
 157
 158 def sanitize_open(filename, open_mode):
 159         """Try to open the given filename, and slightly tweak it if this fails.
 160
 161         Attempts to open the given filename. If this fails, it tries to change
 162         the filename slightly, step by step, until it's either able to open it
 163         or it fails and raises a final exception, like the standard open()
 164         function.
 165
 166         It returns the tuple (stream, definitive_file_name).
 167         """
 168         try:
 169                 if filename == u'-':
 170                         if sys.platform == 'win32':
 171                                 import msvcrt
 172                                 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
 173                         return (sys.stdout, filename)
 174                 stream = open(encodeFilename(filename), open_mode)
 175                 return (stream, filename)
 176         except (IOError, OSError), err:
 177                 # In case of error, try to remove win32 forbidden chars
 178                 filename = re.sub(ur'[/<>:"\|\?\*]', u'#', filename)
 179
 180                 # An exception here should be caught in the caller
 181                 stream = open(encodeFilename(filename), open_mode)
 182                 return (stream, filename)
 183
 184
 185 def timeconvert(timestr):
 186         """Convert RFC 2822 defined time string into system timestamp"""
 187         timestamp = None
 188         timetuple = email.utils.parsedate_tz(timestr)
 189         if timetuple is not None:
 190                 timestamp = email.utils.mktime_tz(timetuple)
 191         return timestamp
 192
 193 def sanitize_filename(s):
 194         """Sanitizes a string so it could be used as part of a filename."""
 195         def replace_insane(char):
 196                 if char == '?' or ord(char) < 32 or ord(char) == 127:
 197                         return ''
 198                 elif char == '"':
 199                         return '\''
 200                 elif char == ':':
 201                         return ' -'
 202                 elif char in '\\/|*<>':
 203                         return '-'
 204                 return char
 205
 206         result = u''.join(map(replace_insane, s))
 207         while '--' in result:
 208                 result = result.replace('--', '-')
 209         return result.strip('-')
 210
 211 def orderedSet(iterable):
 212         """ Remove all duplicates from the input iterable """
 213         res = []
 214         for el in iterable:
 215                 if el not in res:
 216                         res.append(el)
 217         return res
 218
 219 def unescapeHTML(s):
 220         """
 221         @param s a string (of type unicode)
 222         """
 223         assert type(s) == type(u'')
 224
 225         result = re.sub(ur'(?u)&(.+?);', htmlentity_transform, s)
 226         return result
 227
 228 def encodeFilename(s):
 229         """
 230         @param s The name of the file (of type unicode)
 231         """
 232
 233         assert type(s) == type(u'')
 234
 235         if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
 236                 # Pass u'' directly to use Unicode APIs on Windows 2000 and up
 237                 # (Detecting Windows NT 4 is tricky because 'major >= 4' would
 238                 # match Windows 9x series as well. Besides, NT 4 is obsolete.)
 239                 return s
 240         else:
 241                 return s.encode(sys.getfilesystemencoding(), 'ignore')
 242
 243 class DownloadError(Exception):
 244         """Download Error exception.
 245
 246         This exception may be thrown by FileDownloader objects if they are not
 247         configured to continue on errors. They will contain the appropriate
 248         error message.
 249         """
 250         pass
 251
 252
 253 class SameFileError(Exception):
 254         """Same File exception.
 255
 256         This exception will be thrown by FileDownloader objects if they detect
 257         multiple files would have to be downloaded to the same file on disk.
 258         """
 259         pass
 260
 261
 262 class PostProcessingError(Exception):
 263         """Post Processing exception.
 264
 265         This exception may be raised by PostProcessor's .run() method to
 266         indicate an error in the postprocessing task.
 267         """
 268         pass
 269
 270 class MaxDownloadsReached(Exception):
 271         """ --max-downloads limit has been reached. """
 272         pass
 273
 274
 275 class UnavailableVideoError(Exception):
 276         """Unavailable Format exception.
 277
 278         This exception will be thrown when a video is requested
 279         in a format that is not available for that video.
 280         """
 281         pass
 282
 283
 284 class ContentTooShortError(Exception):
 285         """Content Too Short exception.
 286
 287         This exception may be raised by FileDownloader objects when a file they
 288         download is too small for what the server announced first, indicating
 289         the connection was probably interrupted.
 290         """
 291         # Both in bytes
 292         downloaded = None
 293         expected = None
 294
 295         def __init__(self, downloaded, expected):
 296                 self.downloaded = downloaded
 297                 self.expected = expected
 298
 299
 300 class Trouble(Exception):
 301         """Trouble helper exception
 302
 303         This is an exception to be handled with
 304         FileDownloader.trouble
 305         """
 306
 307 class YoutubeDLHandler(urllib2.HTTPHandler):
 308         """Handler for HTTP requests and responses.
 309
 310         This class, when installed with an OpenerDirector, automatically adds
 311         the standard headers to every HTTP request and handles gzipped and
 312         deflated responses from web servers. If compression is to be avoided in
 313         a particular request, the original request in the program code only has
 314         to include the HTTP header "Youtubedl-No-Compression", which will be
 315         removed before making the real request.
 316
 317         Part of this code was copied from:
 318
 319         http://techknack.net/python-urllib2-handlers/
 320
 321         Andrew Rowls, the author of that code, agreed to release it to the
 322         public domain.
 323         """
 324
 325         @staticmethod
 326         def deflate(data):
 327                 try:
 328                         return zlib.decompress(data, -zlib.MAX_WBITS)
 329                 except zlib.error:
 330                         return zlib.decompress(data)
 331
 332         @staticmethod
 333         def addinfourl_wrapper(stream, headers, url, code):
 334                 if hasattr(urllib2.addinfourl, 'getcode'):
 335                         return urllib2.addinfourl(stream, headers, url, code)
 336                 ret = urllib2.addinfourl(stream, headers, url)
 337                 ret.code = code
 338                 return ret
 339
 340         def http_request(self, req):
 341                 for h in std_headers:
 342                         if h in req.headers:
 343                                 del req.headers[h]
 344                         req.add_header(h, std_headers[h])
 345                 if 'Youtubedl-no-compression' in req.headers:
 346                         if 'Accept-encoding' in req.headers:
 347                                 del req.headers['Accept-encoding']
 348                         del req.headers['Youtubedl-no-compression']
 349                 return req
 350
 351         def http_response(self, req, resp):
 352                 old_resp = resp
 353                 # gzip
 354                 if resp.headers.get('Content-encoding', '') == 'gzip':
 355                         gz = gzip.GzipFile(fileobj=StringIO.StringIO(resp.read()), mode='r')
 356                         resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
 357                         resp.msg = old_resp.msg
 358                 # deflate
 359                 if resp.headers.get('Content-encoding', '') == 'deflate':
 360                         gz = StringIO.StringIO(self.deflate(resp.read()))
 361                         resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
 362                         resp.msg = old_resp.msg
 363                 return resp