youtube_dl/utils.py

   1 #!/usr/bin/env python
   2 # -*- coding: utf-8 -*-
   3
   4 import gzip
   5 import htmlentitydefs
   6 import HTMLParser
   7 import locale
   8 import os
   9 import re
  10 import sys
  11 import zlib
  12 import urllib2
  13 import email.utils
  14 import json
  15
  16 try:
  17         import cStringIO as StringIO
  18 except ImportError:
  19         import StringIO
  20
# Default HTTP headers. YoutubeDLHandler.http_request() sets every one of
# these on each outgoing request, replacing any header of the same name that
# the request already carried.
std_headers = {
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:5.0.1) Gecko/20100101 Firefox/5.0.1',
        'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'en-us,en;q=0.5',
}
  28
  29 def preferredencoding():
  30         """Get preferred encoding.
  31
  32         Returns the best encoding scheme for the system, based on
  33         locale.getpreferredencoding() and some further tweaks.
  34         """
  35         try:
  36                 pref = locale.getpreferredencoding()
  37                 u'TEST'.encode(pref)
  38         except:
  39                 pref = 'UTF-8'
  40
  41         return pref
  42
  43
  44 def htmlentity_transform(matchobj):
  45         """Transforms an HTML entity to a Unicode character.
  46
  47         This function receives a match object and is intended to be used with
  48         the re.sub() function.
  49         """
  50         entity = matchobj.group(1)
  51
  52         # Known non-numeric HTML entity
  53         if entity in htmlentitydefs.name2codepoint:
  54                 return unichr(htmlentitydefs.name2codepoint[entity])
  55
  56         # Unicode character
  57         mobj = re.match(ur'(?u)#(x?\d+)', entity)
  58         if mobj is not None:
  59                 numstr = mobj.group(1)
  60                 if numstr.startswith(u'x'):
  61                         base = 16
  62                         numstr = u'0%s' % numstr
  63                 else:
  64                         base = 10
  65                 return unichr(long(numstr, base))
  66
  67         # Unknown entity in name, return its literal representation
  68         return (u'&%s;' % entity)
  69
# Rebind the module-level regular expression HTMLParser.locatestarttagend.
# Because this assigns an attribute of the HTMLParser module itself, the
# replacement pattern is seen by everything in this process that reads
# HTMLParser.locatestarttagend, not only by IDParser below.
HTMLParser.locatestarttagend = re.compile(r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re.VERBOSE) # backport bugfix
class IDParser(HTMLParser.HTMLParser):
        """Modified HTMLParser that isolates a tag with the specified id.

        Feed a document with loads() and read the extracted text with
        get_result(). While parsing, self.result grows from None to [tag],
        then [tag, start_pos], then [tag, start_pos, end_pos], where both
        positions are values returned by getpos().
        """
        def __init__(self, id):
                # Value of the "id" attribute to look for
                self.id = id
                # See the class docstring for the stages this goes through
                self.result = None
                # True while the parser is inside the matched element
                self.started = False
                # Tag name -> number of currently open tags of that name;
                # only maintained while self.started is True
                self.depth = {}
                # The complete document handed to loads()
                self.html = None
                # True from the matching start tag until the next parser event
                self.watch_startpos = False
                # Number of parse errors recovered from so far
                self.error_count = 0
                HTMLParser.HTMLParser.__init__(self)

        def error(self, message):
                """Try to recover from a parse error by skipping ahead.

                Prints the current position to stderr. Raises HTMLParseError
                once more than 10 errors have been recovered from, or when the
                error happens inside the matched element; otherwise the
                remaining input is replaced and parsing is resumed.
                """
                print >> sys.stderr, self.getpos()
                if self.error_count > 10 or self.started:
                        raise HTMLParser.HTMLParseError(message, self.getpos())
                # Rebuild the pending input from the original document,
                # starting at the line index given by the current line number
                self.rawdata = '\n'.join(self.html.split('\n')[self.getpos()[0]:]) # skip one line
                self.error_count += 1
                self.goahead(1)

        def loads(self, html):
                """Parse the whole document `html` in one go."""
                # Kept for error() and get_result(), which slice the raw text
                self.html = html
                self.feed(html)
                self.close()

        def handle_starttag(self, tag, attrs):
                """Start tracking at the tag whose id matches; count open tags."""
                attrs = dict(attrs)
                if self.started:
                        # A start tag inside the element is also a parser
                        # event that can fix the start position
                        self.find_startpos(None)
                if 'id' in attrs and attrs['id'] == self.id:
                        self.result = [tag]
                        self.started = True
                        self.watch_startpos = True
                if self.started:
                        if not tag in self.depth: self.depth[tag] = 0
                        self.depth[tag] += 1

        def handle_endtag(self, tag):
                """Stop tracking once the matched element's tag count is back at 0."""
                if self.started:
                        if tag in self.depth: self.depth[tag] -= 1
                        if self.depth[self.result[0]] == 0:
                                self.started = False
                                # NOTE(review): if this end tag directly follows
                                # the matching start tag, no start position seems
                                # to have been recorded yet, so result would end
                                # up with 2 items and get_result() would return
                                # None -- confirm.
                                self.result.append(self.getpos())

        def find_startpos(self, x):
                """Needed to put the start position of the result (self.result[1])
                after the opening tag with the requested id"""
                if self.watch_startpos:
                        self.watch_startpos = False
                        self.result.append(self.getpos())
        # Every other handler is routed to find_startpos, so whichever event
        # comes first after the matching start tag records the start position
        handle_entityref = handle_charref = handle_data = handle_comment = \
        handle_decl = handle_pi = unknown_decl = find_startpos

        def get_result(self):
                """Return the stripped text between the recorded start and end
                positions, or None if both positions were not recorded."""
                if self.result == None: return None
                if len(self.result) != 3: return None
                lines = self.html.split('\n')
                # Keep the lines from the start line through the end line
                lines = lines[self.result[1][0]-1:self.result[2][0]]
                # Cut everything before the start column off the first line
                lines[0] = lines[0][self.result[1][1]:]
                if len(lines) == 1:
                        # Start and end are on the same line: the end column
                        # must be shifted by what was just cut off the front
                        lines[-1] = lines[-1][:self.result[2][1]-self.result[1][1]]
                # Cut the last line at the end column (in the single-line case
                # the line is already at most that long)
                lines[-1] = lines[-1][:self.result[2][1]]
                return '\n'.join(lines).strip()
 134
 135 def get_element_by_id(id, html):
 136         """Return the content of the tag with the specified id in the passed HTML document"""
 137         parser = IDParser(id)
 138         try:
 139                 parser.loads(html)
 140         except HTMLParser.HTMLParseError:
 141                 pass
 142         return parser.get_result()
 143
 144
 145 def clean_html(html):
 146         """Clean an HTML snippet into a readable string"""
 147         # Newline vs <br />
 148         html = html.replace('\n', ' ')
 149         html = re.sub('\s*<\s*br\s*/?\s*>\s*', '\n', html)
 150         # Strip html tags
 151         html = re.sub('<.*?>', '', html)
 152         # Replace html entities
 153         html = unescapeHTML(html)
 154         return html
 155
 156
 157 def sanitize_open(filename, open_mode):
 158         """Try to open the given filename, and slightly tweak it if this fails.
 159
 160         Attempts to open the given filename. If this fails, it tries to change
 161         the filename slightly, step by step, until it's either able to open it
 162         or it fails and raises a final exception, like the standard open()
 163         function.
 164
 165         It returns the tuple (stream, definitive_file_name).
 166         """
 167         try:
 168                 if filename == u'-':
 169                         if sys.platform == 'win32':
 170                                 import msvcrt
 171                                 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
 172                         return (sys.stdout, filename)
 173                 stream = open(encodeFilename(filename), open_mode)
 174                 return (stream, filename)
 175         except (IOError, OSError), err:
 176                 # In case of error, try to remove win32 forbidden chars
 177                 filename = re.sub(ur'[/<>:"\|\?\*]', u'#', filename)
 178
 179                 # An exception here should be caught in the caller
 180                 stream = open(encodeFilename(filename), open_mode)
 181                 return (stream, filename)
 182
 183
 184 def timeconvert(timestr):
 185         """Convert RFC 2822 defined time string into system timestamp"""
 186         timestamp = None
 187         timetuple = email.utils.parsedate_tz(timestr)
 188         if timetuple is not None:
 189                 timestamp = email.utils.mktime_tz(timetuple)
 190         return timestamp
 191
 192 def sanitize_filename(s):
 193         """Sanitizes a string so it could be used as part of a filename."""
 194         def replace_insane(char):
 195                 if char in u' .\\/|?*<>:"' or ord(char) < 32:
 196                         return '_'
 197                 return char
 198         return u''.join(map(replace_insane, s)).strip('_')
 199
 200 def orderedSet(iterable):
 201         """ Remove all duplicates from the input iterable """
 202         res = []
 203         for el in iterable:
 204                 if el not in res:
 205                         res.append(el)
 206         return res
 207
 208 def unescapeHTML(s):
 209         """
 210         @param s a string (of type unicode)
 211         """
 212         assert type(s) == type(u'')
 213
 214         result = re.sub(ur'(?u)&(.+?);', htmlentity_transform, s)
 215         return result
 216
 217 def encodeFilename(s):
 218         """
 219         @param s The name of the file (of type unicode)
 220         """
 221
 222         assert type(s) == type(u'')
 223
 224         if sys.platform == 'win32' and sys.getwindowsversion().major >= 5:
 225                 # Pass u'' directly to use Unicode APIs on Windows 2000 and up
 226                 # (Detecting Windows NT 4 is tricky because 'major >= 4' would
 227                 # match Windows 9x series as well. Besides, NT 4 is obsolete.)
 228                 return s
 229         else:
 230                 return s.encode(sys.getfilesystemencoding(), 'ignore')
 231
 232 class DownloadError(Exception):
 233         """Download Error exception.
 234
 235         This exception may be thrown by FileDownloader objects if they are not
 236         configured to continue on errors. They will contain the appropriate
 237         error message.
 238         """
 239         pass
 240
 241
 242 class SameFileError(Exception):
 243         """Same File exception.
 244
 245         This exception will be thrown by FileDownloader objects if they detect
 246         multiple files would have to be downloaded to the same file on disk.
 247         """
 248         pass
 249
 250
 251 class PostProcessingError(Exception):
 252         """Post Processing exception.
 253
 254         This exception may be raised by PostProcessor's .run() method to
 255         indicate an error in the postprocessing task.
 256         """
 257         pass
 258
 259 class MaxDownloadsReached(Exception):
 260         """ --max-downloads limit has been reached. """
 261         pass
 262
 263
 264 class UnavailableVideoError(Exception):
 265         """Unavailable Format exception.
 266
 267         This exception will be thrown when a video is requested
 268         in a format that is not available for that video.
 269         """
 270         pass
 271
 272
 273 class ContentTooShortError(Exception):
 274         """Content Too Short exception.
 275
 276         This exception may be raised by FileDownloader objects when a file they
 277         download is too small for what the server announced first, indicating
 278         the connection was probably interrupted.
 279         """
 280         # Both in bytes
 281         downloaded = None
 282         expected = None
 283
 284         def __init__(self, downloaded, expected):
 285                 self.downloaded = downloaded
 286                 self.expected = expected
 287
 288
 289 class Trouble(Exception):
 290         """Trouble helper exception
 291
 292         This is an exception to be handled with
 293         FileDownloader.trouble
 294         """
 295
 296 class YoutubeDLHandler(urllib2.HTTPHandler):
 297         """Handler for HTTP requests and responses.
 298
 299         This class, when installed with an OpenerDirector, automatically adds
 300         the standard headers to every HTTP request and handles gzipped and
 301         deflated responses from web servers. If compression is to be avoided in
 302         a particular request, the original request in the program code only has
 303         to include the HTTP header "Youtubedl-No-Compression", which will be
 304         removed before making the real request.
 305
 306         Part of this code was copied from:
 307
 308         http://techknack.net/python-urllib2-handlers/
 309
 310         Andrew Rowls, the author of that code, agreed to release it to the
 311         public domain.
 312         """
 313
 314         @staticmethod
 315         def deflate(data):
 316                 try:
 317                         return zlib.decompress(data, -zlib.MAX_WBITS)
 318                 except zlib.error:
 319                         return zlib.decompress(data)
 320
 321         @staticmethod
 322         def addinfourl_wrapper(stream, headers, url, code):
 323                 if hasattr(urllib2.addinfourl, 'getcode'):
 324                         return urllib2.addinfourl(stream, headers, url, code)
 325                 ret = urllib2.addinfourl(stream, headers, url)
 326                 ret.code = code
 327                 return ret
 328
 329         def http_request(self, req):
 330                 for h in std_headers:
 331                         if h in req.headers:
 332                                 del req.headers[h]
 333                         req.add_header(h, std_headers[h])
 334                 if 'Youtubedl-no-compression' in req.headers:
 335                         if 'Accept-encoding' in req.headers:
 336                                 del req.headers['Accept-encoding']
 337                         del req.headers['Youtubedl-no-compression']
 338                 return req
 339
 340         def http_response(self, req, resp):
 341                 old_resp = resp
 342                 # gzip
 343                 if resp.headers.get('Content-encoding', '') == 'gzip':
 344                         gz = gzip.GzipFile(fileobj=StringIO.StringIO(resp.read()), mode='r')
 345                         resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
 346                         resp.msg = old_resp.msg
 347                 # deflate
 348                 if resp.headers.get('Content-encoding', '') == 'deflate':
 349                         gz = StringIO.StringIO(self.deflate(resp.read()))
 350                         resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
 351                         resp.msg = old_resp.msg
 352                 return resp