2 # -*- coding: utf-8 -*-
17 import cStringIO as StringIO
22 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:5.0.1) Gecko/20100101 Firefox/5.0.1',
23 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
24 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
25 'Accept-Encoding': 'gzip, deflate',
26 'Accept-Language': 'en-us,en;q=0.5',
def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks: if the locale
    lookup fails, or names a codec that cannot encode text, 'UTF-8' is
    returned instead.
    """
    try:
        pref = locale.getpreferredencoding()
        # Probe the codec; the locale may report a name Python cannot use.
        u'TEST'.encode(pref)
    except Exception:
        pref = 'UTF-8'
    return pref
def htmlentity_transform(matchobj):
    """Transform an HTML entity into the Unicode character it denotes.

    This function receives a match object and is intended to be used with
    the re.sub() function. Group 1 of the match must hold the entity text
    without the surrounding '&' and ';'.

    Named entities are looked up in htmlentitydefs.name2codepoint. Numeric
    references are accepted in decimal ('#38') and hexadecimal ('#x2F')
    form. Anything else, including a numeric reference that is malformed or
    out of range, is returned unchanged as u'&<entity>;'.
    """
    entity = matchobj.group(1)

    # Known non-numeric HTML entity
    if entity in htmlentitydefs.name2codepoint:
        return unichr(htmlentitydefs.name2codepoint[entity])

    # Numeric character reference. The pattern is anchored at the end so
    # that trailing junk ('#38abc') is not silently discarded.
    mobj = re.match(u'#([xX][0-9a-fA-F]+|[0-9]+)\\Z', entity)
    if mobj is not None:
        numstr = mobj.group(1)
        if numstr[0] in u'xX':
            codepoint = int(numstr[1:], 16)
        else:
            codepoint = int(numstr, 10)
        try:
            return unichr(codepoint)
        except (ValueError, OverflowError):
            # Code point not representable; fall through to the literal.
            pass

    # Unknown entity in name, return its literal representation
    return (u'&%s;' % entity)
# Replace the HTMLParser module's `locatestarttagend` pattern with a newer
# one (backported bug fix). The pattern matches an opening tag name followed
# by its attributes, whose values may be single-quoted, double-quoted or
# unquoted.
# NOTE(review): presumably copied from a later CPython release - confirm.
HTMLParser.locatestarttagend = re.compile(r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re.VERBOSE) # backport bugfix
class IDParser(HTMLParser.HTMLParser):
    """Modified HTMLParser that isolates a tag with the specified id.

    Feed a document with loads(), then read the markup enclosed by the
    matching element with get_result().

    While parsing, self.result grows from None to
    [tag, start_pos, end_pos]; both positions are the (line, column) pairs
    reported by getpos().
    """

    def __init__(self, id):
        self.id = id
        self.result = None
        self.started = False        # True while inside the wanted element
        self.depth = {}             # tag name -> currently open count
        self.html = None
        self.watch_startpos = False
        self.error_count = 0
        HTMLParser.HTMLParser.__init__(self)

    def error(self, message):
        """Recover from a parse error by dropping the offending line.

        Raises HTMLParseError once more than 10 errors were skipped, or if
        the error occurs inside the wanted element.
        """
        if self.error_count > 10 or self.started:
            raise HTMLParser.HTMLParseError(message, self.getpos())
        # getpos()[0] is the 1-based current line, so slicing from it keeps
        # only the lines after the one that failed to parse.
        self.rawdata = '\n'.join(self.html.split('\n')[self.getpos()[0]:]) # skip one line
        self.error_count += 1
        # NOTE(review): resumes parsing on the truncated buffer - confirm
        # this matches the intended recovery.
        self.goahead(1)

    def loads(self, html):
        """Parse the complete document *html*."""
        self.html = html
        self.feed(html)
        self.close()

    def handle_starttag(self, tag, attrs):
        attrs = dict(attrs)
        if self.started:
            self.find_startpos(None)
        if 'id' in attrs and attrs['id'] == self.id:
            self.result = [tag]
            self.started = True
            self.watch_startpos = True
        if self.started:
            if tag not in self.depth:
                self.depth[tag] = 0
            self.depth[tag] += 1

    def handle_endtag(self, tag):
        if self.started:
            if tag in self.depth:
                self.depth[tag] -= 1
            # The wanted element is closed once its own tag count is zero.
            if self.depth[self.result[0]] == 0:
                self.started = False
                self.result.append(self.getpos())

    def find_startpos(self, x):
        """Needed to put the start position of the result (self.result[1])
        after the opening tag with the requested id"""
        if self.watch_startpos:
            self.watch_startpos = False
            self.result.append(self.getpos())
    handle_entityref = handle_charref = handle_data = handle_comment = \
    handle_decl = handle_pi = unknown_decl = find_startpos

    def get_result(self):
        """Return the markup between the matched element's opening and
        closing tags, stripped, or None if no complete match was recorded."""
        if self.result is None or len(self.result) != 3:
            return None
        (start_line, start_col), (end_line, end_col) = self.result[1:]
        lines = self.html.split('\n')[start_line - 1:end_line]
        if len(lines) == 1:
            # Start and end share a line: both columns index the same text.
            lines[0] = lines[0][start_col:end_col]
        else:
            lines[0] = lines[0][start_col:]
            lines[-1] = lines[-1][:end_col]
        return '\n'.join(lines).strip()
def get_element_by_id(id, html):
    """Return the content of the tag with the specified id in the passed HTML document"""
    parser = IDParser(id)
    try:
        parser.loads(html)
    except HTMLParser.HTMLParseError:
        # Best effort: report whatever was collected before the error.
        pass
    return parser.get_result()
def clean_html(html):
    """Clean an HTML snippet into a readable string"""
    # Newline vs <br />: source newlines are insignificant, <br> is a break
    html = html.replace('\n', ' ')
    html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
    # Strip html tags
    html = re.sub('<.*?>', '', html)
    # Replace html entities
    html = unescapeHTML(html)
    return html
def sanitize_title(utitle):
    """Make a video title usable as part of a filename.

    HTML entities are decoded first, then every path separator of the
    current platform is replaced with u'%'.
    """
    decoded = unescapeHTML(utitle)
    separator = unicode(os.sep)
    return decoded.replace(separator, u'%')
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    It returns the tuple (stream, definitive_file_name).
    """
    try:
        # NOTE(review): u'-' is assumed to be the marker that selects
        # standard output - confirm against the callers.
        if filename == u'-':
            if sys.platform == 'win32':
                import msvcrt
                # Keep Windows from translating line endings in the data.
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
            return (sys.stdout, filename)
        stream = open(encodeFilename(filename), open_mode)
        return (stream, filename)
    except (IOError, OSError):
        # In case of error, try to remove win32 forbidden chars
        filename = re.sub(u'[/<>:"\\|\\?\\*]', u'#', filename)

        # An exception here should be caught in the caller
        stream = open(encodeFilename(filename), open_mode)
        return (stream, filename)
def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp

    Returns None when the string cannot be parsed.
    """
    timetuple = email.utils.parsedate_tz(timestr)
    if timetuple is None:
        return None
    return email.utils.mktime_tz(timetuple)
def simplify_title(title):
    """Replace each run of characters other than word characters and '-'
    with a single u'_', then trim underscores from both ends."""
    collapsed = re.sub(u'[^\\w\\d_\\-]+', u'_', title, flags=re.UNICODE)
    return collapsed.strip(u'_')
def orderedSet(iterable):
    """ Remove all duplicates from the input iterable

    Returns a list holding the first occurrence of every element, in the
    order the elements were first seen.
    """
    res = []
    for el in iterable:
        # Membership on the list (not a set) keeps unhashable items usable.
        if el not in res:
            res.append(el)
    return res
214 @param s a string (of type unicode)
216 assert type(s) == type(u'')
218 result = re.sub(ur'(?u)&(.+?);', htmlentity_transform, s)
def encodeFilename(s):
    """Return the filename in the form expected by the file system APIs.

    @param s The name of the file (of type unicode)
    """

    assert type(s) == type(u'')

    if sys.platform == 'win32' and sys.getwindowsversion().major >= 5:
        # Pass u'' directly to use Unicode APIs on Windows 2000 and up
        # (Detecting Windows NT 4 is tricky because 'major >= 4' would
        # match Windows 9x series as well. Besides, NT 4 is obsolete.)
        return s
    # Elsewhere, hand over bytes; unencodable characters are dropped.
    return s.encode(sys.getfilesystemencoding(), 'ignore')
class DownloadError(Exception):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """
class SameFileError(Exception):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
class PostProcessingError(Exception):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """
class MaxDownloadsReached(Exception):
    """ --max-downloads limit has been reached.

    Adds nothing to Exception; it exists so that this condition can be
    caught separately from other errors.
    """
class UnavailableVideoError(Exception):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """
class ContentTooShortError(Exception):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.

    Attributes:
        downloaded: amount of data actually received.
        expected: amount of data that was announced.
    NOTE(review): both are presumably byte counts - confirm with callers.
    """

    def __init__(self, downloaded, expected):
        # Hand the values to Exception so that args, repr() and
        # copying/pickling carry them too.
        Exception.__init__(self, downloaded, expected)
        self.downloaded = downloaded
        self.expected = expected
293 class YoutubeDLHandler(urllib2.HTTPHandler):
294 """Handler for HTTP requests and responses.
296 This class, when installed with an OpenerDirector, automatically adds
297 the standard headers to every HTTP request and handles gzipped and
298 deflated responses from web servers. If compression is to be avoided in
299 a particular request, the original request in the program code only has
300 to include the HTTP header "Youtubedl-No-Compression", which will be
301 removed before making the real request.
303 Part of this code was copied from:
305 http://techknack.net/python-urllib2-handlers/
307 Andrew Rowls, the author of that code, agreed to release it to the
314 return zlib.decompress(data, -zlib.MAX_WBITS)
316 return zlib.decompress(data)
319 def addinfourl_wrapper(stream, headers, url, code):
320 if hasattr(urllib2.addinfourl, 'getcode'):
321 return urllib2.addinfourl(stream, headers, url, code)
322 ret = urllib2.addinfourl(stream, headers, url)
326 def http_request(self, req):
327 for h in std_headers:
330 req.add_header(h, std_headers[h])
331 if 'Youtubedl-no-compression' in req.headers:
332 if 'Accept-encoding' in req.headers:
333 del req.headers['Accept-encoding']
334 del req.headers['Youtubedl-no-compression']
337 def http_response(self, req, resp):
340 if resp.headers.get('Content-encoding', '') == 'gzip':
341 gz = gzip.GzipFile(fileobj=StringIO.StringIO(resp.read()), mode='r')
342 resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
343 resp.msg = old_resp.msg
345 if resp.headers.get('Content-encoding', '') == 'deflate':
346 gz = StringIO.StringIO(self.deflate(resp.read()))
347 resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
348 resp.msg = old_resp.msg