2 # -*- coding: utf-8 -*-
17 import cStringIO as StringIO
22 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0',
23 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
24 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
25 'Accept-Encoding': 'gzip, deflate',
26 'Accept-Language': 'en-us,en;q=0.5',
29 def preferredencoding():
30 """Get preferred encoding.
32 Returns the best encoding scheme for the system, based on
33 locale.getpreferredencoding() and some further tweaks.
35 def yield_preferredencoding():
37 pref = locale.getpreferredencoding()
43 return yield_preferredencoding().next()
def htmlentity_transform(matchobj):
    """Transform an HTML entity into the Unicode character it stands for.

    This function receives a match object and is intended to be used as
    the replacement callback of re.sub().  Group 1 of the match must hold
    the entity body, i.e. the text between '&' and ';'.

    Named entities are looked up in htmlentitydefs.name2codepoint.
    Numeric character references are accepted in decimal ('#955') and in
    hexadecimal ('#x3bb' or '#X3BB') notation.  Everything else, including
    numbers that do not map to a character, is returned literally as
    u'&<entity>;'.
    """
    entity = matchobj.group(1)

    # Known non-numeric HTML entity
    if entity in htmlentitydefs.name2codepoint:
        return unichr(htmlentitydefs.name2codepoint[entity])

    # Numeric character reference.  The hexadecimal branch has to accept
    # the digits a-f as well, and the pattern is anchored at the end so
    # that a partial match (e.g. only the '3' of '#x3bb') is never taken
    # for the whole number.
    mobj = re.match(u'#(?:[xX]([0-9a-fA-F]+)|([0-9]+))$', entity)
    if mobj is not None:
        hexdigits, decdigits = mobj.groups()
        if hexdigits is not None:
            codepoint = long(hexdigits, 16)
        else:
            codepoint = long(decdigits, 10)
        try:
            return unichr(codepoint)
        except (ValueError, OverflowError):
            # Number outside the range unichr() can represent: treat the
            # entity as unknown instead of aborting the whole substitution.
            pass

    # Unknown entity in name, return its literal representation
    return (u'&%s;' % entity)
72 HTMLParser.locatestarttagend = re.compile(r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re.VERBOSE) # backport bugfix
73 class IDParser(HTMLParser.HTMLParser):
74 """Modified HTMLParser that isolates a tag with the specified id"""
75 def __init__(self, id):
81 self.watch_startpos = False
83 HTMLParser.HTMLParser.__init__(self)
85 def error(self, message):
86 if self.error_count > 10 or self.started:
87 raise HTMLParser.HTMLParseError(message, self.getpos())
88 self.rawdata = '\n'.join(self.html.split('\n')[self.getpos()[0]:]) # skip one line
92 def loads(self, html):
97 def handle_starttag(self, tag, attrs):
100 self.find_startpos(None)
101 if 'id' in attrs and attrs['id'] == self.id:
104 self.watch_startpos = True
106 if not tag in self.depth: self.depth[tag] = 0
109 def handle_endtag(self, tag):
111 if tag in self.depth: self.depth[tag] -= 1
112 if self.depth[self.result[0]] == 0:
114 self.result.append(self.getpos())
def find_startpos(self, x):
    """Store the start position of the result (self.result[1]).

    The position has to lie after the opening tag with the requested
    id, so it is taken from getpos() on the first call made while
    self.watch_startpos is set; the flag is cleared again so that the
    position is recorded only once.  The argument x is ignored.
    """
    if not self.watch_startpos:
        return
    self.watch_startpos = False
    self.result.append(self.getpos())
122 handle_entityref = handle_charref = handle_data = handle_comment = \
123 handle_decl = handle_pi = unknown_decl = find_startpos
def get_result(self):
    """Return the text found inside the matched tag, or None.

    None is returned unless self.result holds exactly three items.  Its
    second and third items are used as the (line, column) positions where
    the wanted text starts and ends; line numbers are 1-based, which is
    why the start line is shifted by one when slicing self.html.

    The returned text has leading and trailing whitespace stripped.
    """
    if self.result is None or len(self.result) != 3:
        return None

    start_line, start_col = self.result[1]
    end_line, end_col = self.result[2]

    lines = self.html.split('\n')[start_line - 1:end_line]
    lines[0] = lines[0][start_col:]
    if len(lines) == 1:
        # Start and end share one line, and that line has just lost its
        # first start_col characters: shift the end column accordingly.
        end_col -= start_col
    lines[-1] = lines[-1][:end_col]
    return '\n'.join(lines).strip()
136 def get_element_by_id(id, html):
137 """Return the content of the tag with the specified id in the passed HTML document"""
138 parser = IDParser(id)
141 except HTMLParser.HTMLParseError:
143 return parser.get_result()
def clean_html(html):
    """Clean an HTML snippet into a readable string.

    Newlines of the source are turned into spaces, every <br> tag
    (together with the whitespace around it) becomes a newline, all
    remaining tags are removed and the result is passed through
    unescapeHTML() to replace the HTML entities.
    """
    # Newlines in the markup are only whitespace; <br> is the real break
    flattened = html.replace('\n', ' ')
    with_breaks = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', flattened)
    # Strip the remaining tags
    text_only = re.sub(r'<.*?>', '', with_breaks)
    # Replace html entities
    return unescapeHTML(text_only)
158 def sanitize_open(filename, open_mode):
159 """Try to open the given filename, and slightly tweak it if this fails.
161 Attempts to open the given filename. If this fails, it tries to change
162 the filename slightly, step by step, until it's either able to open it
163 or it fails and raises a final exception, like the standard open()
166 It returns the tuple (stream, definitive_file_name).
170 if sys.platform == 'win32':
172 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
173 return (sys.stdout, filename)
174 stream = open(encodeFilename(filename), open_mode)
175 return (stream, filename)
176 except (IOError, OSError), err:
177 # In case of error, try to remove win32 forbidden chars
178 filename = re.sub(ur'[/<>:"\|\?\*]', u'#', filename)
180 # An exception here should be caught in the caller
181 stream = open(encodeFilename(filename), open_mode)
182 return (stream, filename)
def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp.

    Returns None when the string cannot be parsed by
    email.utils.parsedate_tz().
    """
    parsed = email.utils.parsedate_tz(timestr)
    if parsed is None:
        return None
    return email.utils.mktime_tz(parsed)
193 def sanitize_filename(s):
194 """Sanitizes a string so it could be used as part of a filename."""
195 def replace_insane(char):
196 if char == '?' or ord(char) < 32 or ord(char) == 127:
202 elif char in '\\/|*<>':
206 result = u''.join(map(replace_insane, s))
207 while '--' in result:
208 result = result.replace('--', '-')
209 return result.strip('-')
211 def orderedSet(iterable):
212 """ Remove all duplicates from the input iterable """
221 @param s a string (of type unicode)
223 assert type(s) == type(u'')
225 result = re.sub(ur'(?u)&(.+?);', htmlentity_transform, s)
def encodeFilename(s):
    """Return the file name in the form to hand to the file functions.

    @param s The name of the file (of type unicode)
    """
    assert type(s) == type(u'')

    unicode_capable_windows = (sys.platform == 'win32'
                               and sys.getwindowsversion()[0] >= 5)
    if not unicode_capable_windows:
        return s.encode(sys.getfilesystemencoding(), 'ignore')

    # Pass u'' directly to use Unicode APIs on Windows 2000 and up
    # (Detecting Windows NT 4 is tricky because 'major >= 4' would
    # match Windows 9x series as well. Besides, NT 4 is obsolete.)
    return s
class DownloadError(Exception):
    """Download Error exception.

    FileDownloader objects may throw this exception if they are not
    configured to continue on errors.
    """
class SameFileError(Exception):
    """Same File exception.

    FileDownloader objects throw this exception if they detect that
    multiple files would have to be downloaded to the same file on disk.
    """
class PostProcessingError(Exception):
    """Post Processing exception.

    A PostProcessor's .run() method may raise this exception to indicate
    an error in the postprocessing task.
    """
class MaxDownloadsReached(Exception):
    """Signals that the --max-downloads limit has been reached."""
class UnavailableVideoError(Exception):
    """Unavailable Format exception.

    Thrown when a video is requested in a format that is not available
    for that video.
    """
284 class ContentTooShortError(Exception):
285 """Content Too Short exception.
287 This exception may be raised by FileDownloader objects when a file they
288 download is too small for what the server announced first, indicating
289 the connection was probably interrupted.
295 def __init__(self, downloaded, expected):
296 self.downloaded = downloaded
297 self.expected = expected
class Trouble(Exception):
    """Trouble helper exception.

    An exception of this type is meant to be handled with
    FileDownloader.trouble
    """
307 class YoutubeDLHandler(urllib2.HTTPHandler):
308 """Handler for HTTP requests and responses.
310 This class, when installed with an OpenerDirector, automatically adds
311 the standard headers to every HTTP request and handles gzipped and
312 deflated responses from web servers. If compression is to be avoided in
313 a particular request, the original request in the program code only has
314 to include the HTTP header "Youtubedl-No-Compression", which will be
315 removed before making the real request.
317 Part of this code was copied from:
319 http://techknack.net/python-urllib2-handlers/
321 Andrew Rowls, the author of that code, agreed to release it to the
328 return zlib.decompress(data, -zlib.MAX_WBITS)
330 return zlib.decompress(data)
333 def addinfourl_wrapper(stream, headers, url, code):
334 if hasattr(urllib2.addinfourl, 'getcode'):
335 return urllib2.addinfourl(stream, headers, url, code)
336 ret = urllib2.addinfourl(stream, headers, url)
340 def http_request(self, req):
341 for h in std_headers:
344 req.add_header(h, std_headers[h])
345 if 'Youtubedl-no-compression' in req.headers:
346 if 'Accept-encoding' in req.headers:
347 del req.headers['Accept-encoding']
348 del req.headers['Youtubedl-no-compression']
351 def http_response(self, req, resp):
354 if resp.headers.get('Content-encoding', '') == 'gzip':
355 gz = gzip.GzipFile(fileobj=StringIO.StringIO(resp.read()), mode='r')
356 resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
357 resp.msg = old_resp.msg
359 if resp.headers.get('Content-encoding', '') == 'deflate':
360 gz = StringIO.StringIO(self.deflate(resp.read()))
361 resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
362 resp.msg = old_resp.msg