youtube_dl/utils.py

   1 #!/usr/bin/env python
   2 # -*- coding: utf-8 -*-
   3
   4 import gzip
   5 import htmlentitydefs
   6 import HTMLParser
   7 import locale
   8 import os
   9 import re
  10 import sys
  11 import zlib
  12 import urllib2
  13 import email.utils
  14
  15 try:
  16         import cStringIO as StringIO
  17 except ImportError:
  18         import StringIO
  19
  20 try:
  21         import json
  22 except ImportError: # Python <2.6, use trivialjson (https://github.com/phihag/trivialjson):
  23         import trivialjson as json
  24
# Default HTTP headers. YoutubeDLHandler.http_request() sets every entry of
# this dict on each outgoing request, replacing any same-named header.
std_headers = {
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:5.0.1) Gecko/20100101 Firefox/5.0.1',
        'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        # Compressed responses are decoded again in YoutubeDLHandler.http_response()
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'en-us,en;q=0.5',
}
  32
  33 def preferredencoding():
  34         """Get preferred encoding.
  35
  36         Returns the best encoding scheme for the system, based on
  37         locale.getpreferredencoding() and some further tweaks.
  38         """
  39         def yield_preferredencoding():
  40                 try:
  41                         pref = locale.getpreferredencoding()
  42                         u'TEST'.encode(pref)
  43                 except:
  44                         pref = 'UTF-8'
  45                 while True:
  46                         yield pref
  47         return yield_preferredencoding().next()
  48
  49
  50 def htmlentity_transform(matchobj):
  51         """Transforms an HTML entity to a Unicode character.
  52
  53         This function receives a match object and is intended to be used with
  54         the re.sub() function.
  55         """
  56         entity = matchobj.group(1)
  57
  58         # Known non-numeric HTML entity
  59         if entity in htmlentitydefs.name2codepoint:
  60                 return unichr(htmlentitydefs.name2codepoint[entity])
  61
  62         # Unicode character
  63         mobj = re.match(ur'(?u)#(x?\d+)', entity)
  64         if mobj is not None:
  65                 numstr = mobj.group(1)
  66                 if numstr.startswith(u'x'):
  67                         base = 16
  68                         numstr = u'0%s' % numstr
  69                 else:
  70                         base = 10
  71                 return unichr(long(numstr, base))
  72
  73         # Unknown entity in name, return its literal representation
  74         return (u'&%s;' % entity)
  75
  76
  77 def sanitize_title(utitle):
  78         """Sanitizes a video title so it could be used as part of a filename."""
  79         utitle = re.sub(ur'(?u)&(.+?);', htmlentity_transform, utitle)
  80         return utitle.replace(unicode(os.sep), u'%')
  81
  82
  83 def sanitize_open(filename, open_mode):
  84         """Try to open the given filename, and slightly tweak it if this fails.
  85
  86         Attempts to open the given filename. If this fails, it tries to change
  87         the filename slightly, step by step, until it's either able to open it
  88         or it fails and raises a final exception, like the standard open()
  89         function.
  90
  91         It returns the tuple (stream, definitive_file_name).
  92         """
  93         try:
  94                 if filename == u'-':
  95                         if sys.platform == 'win32':
  96                                 import msvcrt
  97                                 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
  98                         return (sys.stdout, filename)
  99                 stream = open(encodeFilename(filename), open_mode)
 100                 return (stream, filename)
 101         except (IOError, OSError), err:
 102                 # In case of error, try to remove win32 forbidden chars
 103                 filename = re.sub(ur'[/<>:"\|\?\*]', u'#', filename)
 104
 105                 # An exception here should be caught in the caller
 106                 stream = open(encodeFilename(filename), open_mode)
 107                 return (stream, filename)
 108
 109
 110 def timeconvert(timestr):
 111         """Convert RFC 2822 defined time string into system timestamp"""
 112         timestamp = None
 113         timetuple = email.utils.parsedate_tz(timestr)
 114         if timetuple is not None:
 115                 timestamp = email.utils.mktime_tz(timetuple)
 116         return timestamp
 117
 118 def simplify_title(title):
 119         expr = re.compile(ur'[^\w\d_\-]+', flags=re.UNICODE)
 120         return expr.sub(u'_', title).strip(u'_')
 121
 122 def orderedSet(iterable):
 123         """ Remove all duplicates from the input iterable """
 124         res = []
 125         for el in iterable:
 126                 if el not in res:
 127                         res.append(el)
 128         return res
 129
 130 def unescapeHTML(s):
 131         """
 132         @param s a string (of type unicode)
 133         """
 134         assert type(s) == type(u'')
 135
 136         htmlParser = HTMLParser.HTMLParser()
 137         return htmlParser.unescape(s)
 138
 139 def encodeFilename(s):
 140         """
 141         @param s The name of the file (of type unicode)
 142         """
 143
 144         assert type(s) == type(u'')
 145
 146         if sys.platform == 'win32' and sys.getwindowsversion().major >= 5:
 147                 # Pass u'' directly to use Unicode APIs on Windows 2000 and up
 148                 # (Detecting Windows NT 4 is tricky because 'major >= 4' would
 149                 # match Windows 9x series as well. Besides, NT 4 is obsolete.)
 150                 return s
 151         else:
 152                 return s.encode(sys.getfilesystemencoding(), 'ignore')
 153
 154 class DownloadError(Exception):
 155         """Download Error exception.
 156
 157         This exception may be thrown by FileDownloader objects if they are not
 158         configured to continue on errors. They will contain the appropriate
 159         error message.
 160         """
 161         pass
 162
 163
 164 class SameFileError(Exception):
 165         """Same File exception.
 166
 167         This exception will be thrown by FileDownloader objects if they detect
 168         multiple files would have to be downloaded to the same file on disk.
 169         """
 170         pass
 171
 172
 173 class PostProcessingError(Exception):
 174         """Post Processing exception.
 175
 176         This exception may be raised by PostProcessor's .run() method to
 177         indicate an error in the postprocessing task.
 178         """
 179         pass
 180
 181 class MaxDownloadsReached(Exception):
 182         """ --max-downloads limit has been reached. """
 183         pass
 184
 185
 186 class UnavailableVideoError(Exception):
 187         """Unavailable Format exception.
 188
 189         This exception will be thrown when a video is requested
 190         in a format that is not available for that video.
 191         """
 192         pass
 193
 194
 195 class ContentTooShortError(Exception):
 196         """Content Too Short exception.
 197
 198         This exception may be raised by FileDownloader objects when a file they
 199         download is too small for what the server announced first, indicating
 200         the connection was probably interrupted.
 201         """
 202         # Both in bytes
 203         downloaded = None
 204         expected = None
 205
 206         def __init__(self, downloaded, expected):
 207                 self.downloaded = downloaded
 208                 self.expected = expected
 209
 210
 211 class YoutubeDLHandler(urllib2.HTTPHandler):
 212         """Handler for HTTP requests and responses.
 213
 214         This class, when installed with an OpenerDirector, automatically adds
 215         the standard headers to every HTTP request and handles gzipped and
 216         deflated responses from web servers. If compression is to be avoided in
 217         a particular request, the original request in the program code only has
 218         to include the HTTP header "Youtubedl-No-Compression", which will be
 219         removed before making the real request.
 220
 221         Part of this code was copied from:
 222
 223         http://techknack.net/python-urllib2-handlers/
 224
 225         Andrew Rowls, the author of that code, agreed to release it to the
 226         public domain.
 227         """
 228
 229         @staticmethod
 230         def deflate(data):
 231                 try:
 232                         return zlib.decompress(data, -zlib.MAX_WBITS)
 233                 except zlib.error:
 234                         return zlib.decompress(data)
 235
 236         @staticmethod
 237         def addinfourl_wrapper(stream, headers, url, code):
 238                 if hasattr(urllib2.addinfourl, 'getcode'):
 239                         return urllib2.addinfourl(stream, headers, url, code)
 240                 ret = urllib2.addinfourl(stream, headers, url)
 241                 ret.code = code
 242                 return ret
 243
 244         def http_request(self, req):
 245                 for h in std_headers:
 246                         if h in req.headers:
 247                                 del req.headers[h]
 248                         req.add_header(h, std_headers[h])
 249                 if 'Youtubedl-no-compression' in req.headers:
 250                         if 'Accept-encoding' in req.headers:
 251                                 del req.headers['Accept-encoding']
 252                         del req.headers['Youtubedl-no-compression']
 253                 return req
 254
 255         def http_response(self, req, resp):
 256                 old_resp = resp
 257                 # gzip
 258                 if resp.headers.get('Content-encoding', '') == 'gzip':
 259                         gz = gzip.GzipFile(fileobj=StringIO.StringIO(resp.read()), mode='r')
 260                         resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
 261                         resp.msg = old_resp.msg
 262                 # deflate
 263                 if resp.headers.get('Content-encoding', '') == 'deflate':
 264                         gz = StringIO.StringIO(self.deflate(resp.read()))
 265                         resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
 266                         resp.msg = old_resp.msg
 267                 return resp