youtube_dl/utils.py

   1 #!/usr/bin/env python
   2 # -*- coding: utf-8 -*-
   3
   4 import gzip
   5 import htmlentitydefs
   6 import HTMLParser
   7 import locale
   8 import os
   9 import re
  10 import sys
  11 import zlib
  12 import urllib2
  13 import email.utils
  14
  15 try:
  16         import cStringIO as StringIO
  17 except ImportError:
  18         import StringIO
  19
  20 try:
  21         import json
  22 except ImportError: # Python <2.6, use trivialjson (https://github.com/phihag/trivialjson):
  23         import trivialjson as json
  24
# Default HTTP headers. YoutubeDLHandler.http_request() copies every entry
# into each outgoing request, replacing a header of the same name.
std_headers = {
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:5.0.1) Gecko/20100101 Firefox/5.0.1',
        'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        # gzip and deflate bodies are decoded in YoutubeDLHandler.http_response()
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'en-us,en;q=0.5',
}
  32
  33 def preferredencoding():
  34         """Get preferred encoding.
  35
  36         Returns the best encoding scheme for the system, based on
  37         locale.getpreferredencoding() and some further tweaks.
  38         """
  39         def yield_preferredencoding():
  40                 try:
  41                         pref = locale.getpreferredencoding()
  42                         u'TEST'.encode(pref)
  43                 except:
  44                         pref = 'UTF-8'
  45                 while True:
  46                         yield pref
  47         return yield_preferredencoding().next()
  48
  49
  50 def htmlentity_transform(matchobj):
  51         """Transforms an HTML entity to a Unicode character.
  52
  53         This function receives a match object and is intended to be used with
  54         the re.sub() function.
  55         """
  56         entity = matchobj.group(1)
  57
  58         # Known non-numeric HTML entity
  59         if entity in htmlentitydefs.name2codepoint:
  60                 return unichr(htmlentitydefs.name2codepoint[entity])
  61
  62         # Unicode character
  63         mobj = re.match(ur'(?u)#(x?\d+)', entity)
  64         if mobj is not None:
  65                 numstr = mobj.group(1)
  66                 if numstr.startswith(u'x'):
  67                         base = 16
  68                         numstr = u'0%s' % numstr
  69                 else:
  70                         base = 10
  71                 return unichr(long(numstr, base))
  72
  73         # Unknown entity in name, return its literal representation
  74         return (u'&%s;' % entity)
  75
  76 HTMLParser.locatestarttagend = re.compile(r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re.VERBOSE) # backport bugfix
  77 class IDParser(HTMLParser.HTMLParser):
  78         """Modified HTMLParser that isolates a tag with the specified id"""
  79         def __init__(self, id):
  80                 self.id = id
  81                 self.result = None
  82                 self.started = False
  83                 self.depth = {}
  84                 self.html = None
  85                 self.watch_startpos = False
  86                 self.error_count = 0
  87                 HTMLParser.HTMLParser.__init__(self)
  88
  89         def error(self, message):
  90                 print self.getpos()
  91                 if self.error_count > 10 or self.started:
  92                         raise HTMLParser.HTMLParseError(message, self.getpos())
  93                 self.rawdata = '\n'.join(self.html.split('\n')[self.getpos()[0]:]) # skip one line
  94                 self.error_count += 1
  95                 self.goahead(1)
  96
  97         def loads(self, html):
  98                 self.html = html
  99                 self.feed(html)
 100                 self.close()
 101
 102         def handle_starttag(self, tag, attrs):
 103                 attrs = dict(attrs)
 104                 if self.started:
 105                         self.find_startpos(None)
 106                 if 'id' in attrs and attrs['id'] == self.id:
 107                         self.result = [tag]
 108                         self.started = True
 109                         self.watch_startpos = True
 110                 if self.started:
 111                         if not tag in self.depth: self.depth[tag] = 0
 112                         self.depth[tag] += 1
 113
 114         def handle_endtag(self, tag):
 115                 if self.started:
 116                         if tag in self.depth: self.depth[tag] -= 1
 117                         if self.depth[self.result[0]] == 0:
 118                                 self.started = False
 119                                 self.result.append(self.getpos())
 120
 121         def find_startpos(self, x):
 122                 """Needed to put the start position of the result (self.result[1])
 123                 after the opening tag with the requested id"""
 124                 if self.watch_startpos:
 125                         self.watch_startpos = False
 126                         self.result.append(self.getpos())
 127         handle_entityref = handle_charref = handle_data = handle_comment = \
 128         handle_decl = handle_pi = unknown_decl = find_startpos
 129
 130         def get_result(self):
 131                 if self.result == None: return None
 132                 if len(self.result) != 3: return None
 133                 lines = self.html.split('\n')
 134                 lines = lines[self.result[1][0]-1:self.result[2][0]]
 135                 lines[0] = lines[0][self.result[1][1]:]
 136                 if len(lines) == 1:
 137                         lines[-1] = lines[-1][:self.result[2][1]-self.result[1][1]]
 138                 lines[-1] = lines[-1][:self.result[2][1]]
 139                 return '\n'.join(lines).strip()
 140
 141 def get_element_by_id(id, html):
 142         """Return the content of the tag with the specified id in the passed HTML document"""
 143         parser = IDParser(id)
 144         try:
 145                 parser.loads(html)
 146         except HTMLParser.HTMLParseError:
 147                 pass
 148         return parser.get_result()
 149
 150
 151 def clean_html(html):
 152         """Clean an HTML snippet into a readable string"""
 153         # Newline vs <br />
 154         html = html.replace('\n', ' ')
 155         html = re.sub('\s*<\s*br\s*/?\s*>\s*', '\n', html)
 156         # Strip html tags
 157         html = re.sub('<.*?>', '', html)
 158         # Replace html entities
 159         html = unescapeHTML(html)
 160         return html
 161
 162
 163 def sanitize_title(utitle):
 164         """Sanitizes a video title so it could be used as part of a filename."""
 165         utitle = unescapeHTML(utitle)
 166         return utitle.replace(unicode(os.sep), u'%')
 167
 168
 169 def sanitize_open(filename, open_mode):
 170         """Try to open the given filename, and slightly tweak it if this fails.
 171
 172         Attempts to open the given filename. If this fails, it tries to change
 173         the filename slightly, step by step, until it's either able to open it
 174         or it fails and raises a final exception, like the standard open()
 175         function.
 176
 177         It returns the tuple (stream, definitive_file_name).
 178         """
 179         try:
 180                 if filename == u'-':
 181                         if sys.platform == 'win32':
 182                                 import msvcrt
 183                                 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
 184                         return (sys.stdout, filename)
 185                 stream = open(encodeFilename(filename), open_mode)
 186                 return (stream, filename)
 187         except (IOError, OSError), err:
 188                 # In case of error, try to remove win32 forbidden chars
 189                 filename = re.sub(ur'[/<>:"\|\?\*]', u'#', filename)
 190
 191                 # An exception here should be caught in the caller
 192                 stream = open(encodeFilename(filename), open_mode)
 193                 return (stream, filename)
 194
 195
 196 def timeconvert(timestr):
 197         """Convert RFC 2822 defined time string into system timestamp"""
 198         timestamp = None
 199         timetuple = email.utils.parsedate_tz(timestr)
 200         if timetuple is not None:
 201                 timestamp = email.utils.mktime_tz(timetuple)
 202         return timestamp
 203
 204 def simplify_title(title):
 205         expr = re.compile(ur'[^\w\d_\-]+', flags=re.UNICODE)
 206         return expr.sub(u'_', title).strip(u'_')
 207
 208 def orderedSet(iterable):
 209         """ Remove all duplicates from the input iterable """
 210         res = []
 211         for el in iterable:
 212                 if el not in res:
 213                         res.append(el)
 214         return res
 215
 216 def unescapeHTML(s):
 217         """
 218         @param s a string (of type unicode)
 219         """
 220         assert type(s) == type(u'')
 221
 222         result = re.sub(ur'(?u)&(.+?);', htmlentity_transform, s)
 223         return result
 224
 225 def encodeFilename(s):
 226         """
 227         @param s The name of the file (of type unicode)
 228         """
 229
 230         assert type(s) == type(u'')
 231
 232         if sys.platform == 'win32' and sys.getwindowsversion().major >= 5:
 233                 # Pass u'' directly to use Unicode APIs on Windows 2000 and up
 234                 # (Detecting Windows NT 4 is tricky because 'major >= 4' would
 235                 # match Windows 9x series as well. Besides, NT 4 is obsolete.)
 236                 return s
 237         else:
 238                 return s.encode(sys.getfilesystemencoding(), 'ignore')
 239
 240 class DownloadError(Exception):
 241         """Download Error exception.
 242
 243         This exception may be thrown by FileDownloader objects if they are not
 244         configured to continue on errors. They will contain the appropriate
 245         error message.
 246         """
 247         pass
 248
 249
 250 class SameFileError(Exception):
 251         """Same File exception.
 252
 253         This exception will be thrown by FileDownloader objects if they detect
 254         multiple files would have to be downloaded to the same file on disk.
 255         """
 256         pass
 257
 258
 259 class PostProcessingError(Exception):
 260         """Post Processing exception.
 261
 262         This exception may be raised by PostProcessor's .run() method to
 263         indicate an error in the postprocessing task.
 264         """
 265         pass
 266
 267 class MaxDownloadsReached(Exception):
 268         """ --max-downloads limit has been reached. """
 269         pass
 270
 271
 272 class UnavailableVideoError(Exception):
 273         """Unavailable Format exception.
 274
 275         This exception will be thrown when a video is requested
 276         in a format that is not available for that video.
 277         """
 278         pass
 279
 280
 281 class ContentTooShortError(Exception):
 282         """Content Too Short exception.
 283
 284         This exception may be raised by FileDownloader objects when a file they
 285         download is too small for what the server announced first, indicating
 286         the connection was probably interrupted.
 287         """
 288         # Both in bytes
 289         downloaded = None
 290         expected = None
 291
 292         def __init__(self, downloaded, expected):
 293                 self.downloaded = downloaded
 294                 self.expected = expected
 295
 296
 297 class YoutubeDLHandler(urllib2.HTTPHandler):
 298         """Handler for HTTP requests and responses.
 299
 300         This class, when installed with an OpenerDirector, automatically adds
 301         the standard headers to every HTTP request and handles gzipped and
 302         deflated responses from web servers. If compression is to be avoided in
 303         a particular request, the original request in the program code only has
 304         to include the HTTP header "Youtubedl-No-Compression", which will be
 305         removed before making the real request.
 306
 307         Part of this code was copied from:
 308
 309         http://techknack.net/python-urllib2-handlers/
 310
 311         Andrew Rowls, the author of that code, agreed to release it to the
 312         public domain.
 313         """
 314
 315         @staticmethod
 316         def deflate(data):
 317                 try:
 318                         return zlib.decompress(data, -zlib.MAX_WBITS)
 319                 except zlib.error:
 320                         return zlib.decompress(data)
 321
 322         @staticmethod
 323         def addinfourl_wrapper(stream, headers, url, code):
 324                 if hasattr(urllib2.addinfourl, 'getcode'):
 325                         return urllib2.addinfourl(stream, headers, url, code)
 326                 ret = urllib2.addinfourl(stream, headers, url)
 327                 ret.code = code
 328                 return ret
 329
 330         def http_request(self, req):
 331                 for h in std_headers:
 332                         if h in req.headers:
 333                                 del req.headers[h]
 334                         req.add_header(h, std_headers[h])
 335                 if 'Youtubedl-no-compression' in req.headers:
 336                         if 'Accept-encoding' in req.headers:
 337                                 del req.headers['Accept-encoding']
 338                         del req.headers['Youtubedl-no-compression']
 339                 return req
 340
 341         def http_response(self, req, resp):
 342                 old_resp = resp
 343                 # gzip
 344                 if resp.headers.get('Content-encoding', '') == 'gzip':
 345                         gz = gzip.GzipFile(fileobj=StringIO.StringIO(resp.read()), mode='r')
 346                         resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
 347                         resp.msg = old_resp.msg
 348                 # deflate
 349                 if resp.headers.get('Content-encoding', '') == 'deflate':
 350                         gz = StringIO.StringIO(self.deflate(resp.read()))
 351                         resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
 352                         resp.msg = old_resp.msg
 353                 return resp