2 # -*- coding: utf-8 -*-
17 import cStringIO as StringIO
# Default HTTP headers; the request handler adds each of them to outgoing
# requests.
std_headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:5.0.1) Gecko/20100101 Firefox/5.0.1',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
}
def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks: when the locale
    reports an encoding that cannot actually be used, 'UTF-8' is returned
    instead.
    """
    try:
        pref = locale.getpreferredencoding()
        # Make sure the reported codec exists and is able to encode text.
        u'TEST'.encode(pref)
    except Exception:
        pref = 'UTF-8'
    return pref
def htmlentity_transform(matchobj):
    """Transforms an HTML entity to a Unicode character.

    This function receives a match object and is intended to be used with
    the re.sub() function.

    @param matchobj match object whose group(1) is the entity text found
                    between '&' and ';' (e.g. u'amp', u'#38', u'#x26')
    @return the decoded character, or the literal u'&...;' text when the
            entity is unknown, malformed or out of range
    """
    entity = matchobj.group(1)

    # unichr() only exists on Python 2; chr() is its Python 3 counterpart.
    try:
        to_char = unichr
    except NameError:
        to_char = chr

    # Known non-numeric HTML entity
    if entity in htmlentitydefs.name2codepoint:
        return to_char(htmlentitydefs.name2codepoint[entity])

    # Numeric character reference: decimal (#38) or hexadecimal (#x26).
    # Hex digits a-f must be accepted, and the whole entity has to match so
    # that trailing garbage is not silently dropped.
    mobj = re.match(r'#([xX][0-9a-fA-F]+|[0-9]+)$', entity)
    if mobj is not None:
        numstr = mobj.group(1)
        if numstr[0] in u'xX':
            base = 16
            numstr = u'0%s' % numstr
        else:
            base = 10
        try:
            return to_char(int(numstr, base))
        except (ValueError, OverflowError):
            # Code point this interpreter cannot represent: keep the text.
            pass

    # Unknown entity in name, return its literal representation
    return (u'&%s;' % entity)
# Module-wide monkeypatch: replace the compiled regex stored in
# HTMLParser.locatestarttagend with the pattern below (marked by the
# original author as a backported bugfix).
HTMLParser.locatestarttagend = re.compile(r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re.VERBOSE) # backport bugfix
class IDParser(HTMLParser.HTMLParser):
    """Modified HTMLParser that isolates a tag with the specified id

    Create the parser with the wanted id, pass the document to loads() and
    read the content of the element from get_result().
    """

    def __init__(self, id):
        self.id = id
        # None until the element is found, then grows to
        # [tag name, start position, end position]; positions are the
        # (line, column) tuples returned by getpos().
        self.result = None
        self.started = False         # True while inside the wanted element
        self.depth = {}              # tag name -> currently open count
        self.html = None             # document text, set by loads()
        self.watch_startpos = False  # next event marks the content start
        self.error_count = 0
        HTMLParser.HTMLParser.__init__(self)

    def error(self, message):
        """Try to recover from a parse error by skipping to the next line.

        Raises HTMLParseError when more than ten errors were already seen
        or when the error happens inside the wanted element.
        """
        # Report the failing position on stderr.
        sys.stderr.write('%s\n' % (self.getpos(),))
        if self.error_count > 10 or self.started:
            raise HTMLParser.HTMLParseError(message, self.getpos())
        self.rawdata = '\n'.join(self.html.split('\n')[self.getpos()[0]:])  # skip one line
        self.error_count += 1
        self.goahead(1)

    def loads(self, html):
        """Parse the whole document given as the string html."""
        self.html = html
        self.feed(html)
        self.close()

    def handle_starttag(self, tag, attrs):
        attrs = dict(attrs)
        if self.started:
            self.find_startpos(None)
        if 'id' in attrs and attrs['id'] == self.id:
            self.result = [tag]
            self.started = True
            self.watch_startpos = True
        if self.started:
            self.depth[tag] = self.depth.get(tag, 0) + 1

    def handle_endtag(self, tag):
        if self.started:
            if tag in self.depth:
                self.depth[tag] -= 1
            # The element is closed once every tag of its own kind that was
            # opened since its start has been closed again.
            if self.depth[self.result[0]] == 0:
                self.started = False
                self.result.append(self.getpos())

    def find_startpos(self, x):
        """Needed to put the start position of the result (self.result[1])
        after the opening tag with the requested id"""
        if self.watch_startpos:
            self.watch_startpos = False
            self.result.append(self.getpos())
    handle_entityref = handle_charref = handle_data = handle_comment = \
    handle_decl = handle_pi = unknown_decl = find_startpos

    def get_result(self):
        """Return the stripped content of the element, or None when the
        element was not found or its start/end could not both be located."""
        if self.result is None or len(self.result) != 3:
            return None
        _, (start_line, start_col), (end_line, end_col) = self.result
        lines = self.html.split('\n')[start_line - 1:end_line]
        lines[0] = lines[0][start_col:]
        if len(lines) == 1:
            # Start and end share one line: end_col refers to the untrimmed
            # line, so compensate for the prefix removed just above.
            lines[-1] = lines[-1][:end_col - start_col]
        else:
            lines[-1] = lines[-1][:end_col]
        return '\n'.join(lines).strip()
def get_element_by_id(id, html):
    """Return the content of the tag with the specified id in the passed HTML document"""
    parser = IDParser(id)
    try:
        parser.loads(html)
    except HTMLParser.HTMLParseError:
        # Best effort: whatever was located before the parse error is
        # still reported by get_result() (None if nothing usable).
        pass
    return parser.get_result()
def clean_html(html):
    """Clean an HTML snippet into a readable string"""
    # Source newlines carry no meaning in HTML; only <br> tags become
    # line breaks in the output.
    html = html.replace('\n', ' ')
    html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
    # Strip the remaining html tags
    html = re.sub('<.*?>', '', html)
    # Replace html entities
    html = unescapeHTML(html)
    return html
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    It returns the tuple (stream, definitive_file_name).
    """
    try:
        if filename == u'-':
            # Write to standard output; on Windows it has to be switched
            # to binary mode first.
            if sys.platform == 'win32':
                import msvcrt
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
            return (sys.stdout, filename)
        stream = open(encodeFilename(filename), open_mode)
        return (stream, filename)
    except (IOError, OSError):
        # In case of error, try to remove win32 forbidden chars
        filename = re.sub(r'[/<>:"\|\?\*]', u'#', filename)

        # An exception here should be caught in the caller
        stream = open(encodeFilename(filename), open_mode)
        return (stream, filename)
def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp

    Returns None when the string cannot be parsed as a date.
    """
    timestamp = None
    timetuple = email.utils.parsedate_tz(timestr)
    if timetuple is not None:
        timestamp = email.utils.mktime_tz(timetuple)
    return timestamp
def sanitize_filename(s):
    """Sanitizes a string so it could be used as part of a filename.

    Spaces, dots, path separators, the characters |?*<>:" and control
    characters are replaced by '_'; leading and trailing underscores are
    then stripped.
    """
    def replace_insane(char):
        if char in u' .\\/|?*<>:"' or ord(char) < 32:
            return '_'
        return char
    return u''.join(map(replace_insane, s)).strip('_')
def orderedSet(iterable):
    """ Remove all duplicates from the input iterable

    Returns a list holding the first occurrence of every element, in the
    order of appearance. Elements only need to support equality, so
    unhashable items are accepted.
    """
    res = []
    for el in iterable:
        if el not in res:
            res.append(el)
    return res
def unescapeHTML(s):
    """
    @param s a string (of type unicode)
    @return s with every '&...;' sequence passed through
            htmlentity_transform
    """
    assert isinstance(s, type(u''))

    result = re.sub(r'(?u)&(.+?);', htmlentity_transform, s)
    return result
def encodeFilename(s):
    """
    @param s The name of the file (of type unicode)
    @return s itself on Windows 2000 and later, otherwise s encoded with
            the filesystem encoding (unencodable characters are dropped)
    """
    assert isinstance(s, type(u''))

    if sys.platform == 'win32' and sys.getwindowsversion().major >= 5:
        # Pass u'' directly to use Unicode APIs on Windows 2000 and up
        # (Detecting Windows NT 4 is tricky because 'major >= 4' would
        # match Windows 9x series as well. Besides, NT 4 is obsolete.)
        return s

    # getfilesystemencoding() can return None when the interpreter is
    # unable to determine the encoding; fall back to UTF-8 in that case.
    encoding = sys.getfilesystemencoding()
    if encoding is None:
        encoding = 'utf-8'
    return s.encode(encoding, 'ignore')
class DownloadError(Exception):
    """Download Error exception.

    FileDownloader objects may throw this exception when they are not
    configured to continue on errors. It carries the appropriate error
    message.
    """
class SameFileError(Exception):
    """Same File exception.

    FileDownloader objects throw this exception when they detect that
    multiple files would have to be downloaded to the same file on disk.
    """
class PostProcessingError(Exception):
    """Post Processing exception.

    A PostProcessor's .run() method may raise this exception to indicate
    an error in the postprocessing task.
    """
class MaxDownloadsReached(Exception):
    """Raised when the --max-downloads limit has been reached."""
class UnavailableVideoError(Exception):
    """Unavailable Format exception.

    Thrown when a video is requested in a format that is not available
    for that video.
    """
class ContentTooShortError(Exception):
    """Content Too Short exception.

    FileDownloader objects may raise this exception when a downloaded file
    is smaller than what the server announced first, which indicates the
    connection was probably interrupted.
    """
    # Both in bytes
    downloaded = None
    expected = None

    def __init__(self, downloaded, expected):
        # Keep both sizes on the instance so handlers can report them.
        self.downloaded, self.expected = downloaded, expected
class Trouble(Exception):
    """Trouble helper exception

    An exception meant to be handled with FileDownloader.trouble
    """
296 class YoutubeDLHandler(urllib2.HTTPHandler):
297 """Handler for HTTP requests and responses.
299 This class, when installed with an OpenerDirector, automatically adds
300 the standard headers to every HTTP request and handles gzipped and
301 deflated responses from web servers. If compression is to be avoided in
302 a particular request, the original request in the program code only has
303 to include the HTTP header "Youtubedl-No-Compression", which will be
304 removed before making the real request.
306 Part of this code was copied from:
308 http://techknack.net/python-urllib2-handlers/
310 Andrew Rowls, the author of that code, agreed to release it to the
317 return zlib.decompress(data, -zlib.MAX_WBITS)
319 return zlib.decompress(data)
322 def addinfourl_wrapper(stream, headers, url, code):
323 if hasattr(urllib2.addinfourl, 'getcode'):
324 return urllib2.addinfourl(stream, headers, url, code)
325 ret = urllib2.addinfourl(stream, headers, url)
329 def http_request(self, req):
330 for h in std_headers:
333 req.add_header(h, std_headers[h])
334 if 'Youtubedl-no-compression' in req.headers:
335 if 'Accept-encoding' in req.headers:
336 del req.headers['Accept-encoding']
337 del req.headers['Youtubedl-no-compression']
340 def http_response(self, req, resp):
343 if resp.headers.get('Content-encoding', '') == 'gzip':
344 gz = gzip.GzipFile(fileobj=StringIO.StringIO(resp.read()), mode='r')
345 resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
346 resp.msg = old_resp.msg
348 if resp.headers.get('Content-encoding', '') == 'deflate':
349 gz = StringIO.StringIO(self.deflate(resp.read()))
350 resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
351 resp.msg = old_resp.msg