2 # -*- coding: utf-8 -*-
# Python 2/3 compatibility aliases: every compat_* name is bound to the
# Python 3 object when it is available and to the Python 2 one otherwise.
# A missing module raises ImportError (not NameError), so that is what
# each import fallback has to catch.
try:
    import urllib.request as compat_urllib_request
except ImportError: # Python 2
    import urllib2 as compat_urllib_request

try:
    import urllib.error as compat_urllib_error
except ImportError: # Python 2
    import urllib2 as compat_urllib_error

try:
    import urllib.parse as compat_urllib_parse
except ImportError: # Python 2
    import urllib as compat_urllib_parse

try:
    import http.cookiejar as compat_cookiejar
except ImportError: # Python 2
    import cookielib as compat_cookiejar

try:
    import html.entities as compat_html_entities
except ImportError: # Python 2
    import htmlentitydefs as compat_html_entities

try:
    import html.parser as compat_html_parser
except ImportError: # Python 2
    import HTMLParser as compat_html_parser

# Looking up a missing builtin raises NameError, hence the different
# exception type for the two aliases below.
try:
    compat_str = unicode # Python 2
except NameError:
    compat_str = str

try:
    compat_chr = unichr # Python 2
except NameError:
    compat_chr = chr
# Standard request headers; YoutubeDLHandler.http_request() copies every
# entry of this mapping onto each request it processes.
std_headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
}
def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks: when the locale
    lookup fails, or names a codec that cannot encode text, 'UTF-8' is
    returned instead.
    """
    try:
        pref = locale.getpreferredencoding()
        # Probe the codec: an unknown or broken name must not be handed out.
        u'TEST'.encode(pref)
    except Exception:
        # Deliberately broad (any locale/codec failure), but unlike a bare
        # "except:" it lets KeyboardInterrupt and SystemExit through.
        pref = 'UTF-8'
    return pref
def htmlentity_transform(matchobj):
    """Transforms an HTML entity to a character.

    This function receives a match object and is intended to be used with
    the re.sub() function. Group 1 of the match must hold the entity text
    without the surrounding '&' and ';'.

    Named entities are looked up in compat_html_entities.name2codepoint;
    numeric references are accepted in decimal ('#38') and hexadecimal
    ('#x2F') form. Anything else, including a numeric reference outside
    the range compat_chr() accepts, is returned unchanged as '&...;'.
    """
    entity = matchobj.group(1)

    # Known non-numeric HTML entity
    if entity in compat_html_entities.name2codepoint:
        return compat_chr(compat_html_entities.name2codepoint[entity])

    # Numeric character reference. The hexadecimal form needs the digits
    # a-f, and the whole entity must be consumed so that trailing garbage
    # is not silently dropped.
    mobj = re.match(u'#(?:[xX]([0-9a-fA-F]+)|([0-9]+))$', entity)
    if mobj is not None:
        hexstr, decstr = mobj.groups()
        if hexstr is not None:
            codepoint = int(hexstr, 16)
        else:
            codepoint = int(decstr, 10)
        try:
            return compat_chr(codepoint)
        except (ValueError, OverflowError):
            # Not a representable code point: fall through to the literal
            pass

    # Unknown entity in name, return its literal representation
    return (u'&%s;' % entity)
compat_html_parser.locatestarttagend = re.compile(r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re.VERBOSE) # backport bugfix


class IDParser(compat_html_parser.HTMLParser):
    """Modified HTMLParser that isolates a tag with the specified id

    Usage: create the parser with the wanted id, pass the document to
    loads(), then read the element's content with get_result().

    While parsing, self.result grows into [tag, start_pos, end_pos], where
    the positions are (line, column) pairs as returned by getpos().
    """
    def __init__(self, id):
        self.id = id
        self.result = None
        self.started = False       # True while inside the wanted element
        self.depth = {}            # open-tag counter per tag name
        self.html = None
        self.watch_startpos = False
        self.error_count = 0
        compat_html_parser.HTMLParser.__init__(self)

    def error(self, message):
        """Recover from a parse error by skipping input and resuming.

        Raises HTMLParseError once more than 10 errors were seen, or when
        the error occurs inside the wanted element.
        """
        if self.error_count > 10 or self.started:
            raise compat_html_parser.HTMLParseError(message, self.getpos())
        self.rawdata = '\n'.join(self.html.split('\n')[self.getpos()[0]:]) # skip one line
        self.error_count += 1
        self.goahead(1)

    def loads(self, html):
        """Parse the complete document html."""
        self.html = html
        self.feed(html)
        self.close()

    def handle_starttag(self, tag, attrs):
        attrs = dict(attrs)
        if self.started:
            self.find_startpos(None)
        if 'id' in attrs and attrs['id'] == self.id:
            self.result = [tag]
            self.started = True
            self.watch_startpos = True
        if self.started:
            self.depth[tag] = self.depth.get(tag, 0) + 1

    def handle_endtag(self, tag):
        if self.started:
            if tag in self.depth:
                self.depth[tag] -= 1
            # The element ends when its own tag name is balanced again
            if self.depth[self.result[0]] == 0:
                self.started = False
                self.result.append(self.getpos())

    def find_startpos(self, x):
        """Needed to put the start position of the result (self.result[1])
        after the opening tag with the requested id"""
        if self.watch_startpos:
            self.watch_startpos = False
            self.result.append(self.getpos())
    # Whatever event follows the opening tag records the start position
    handle_entityref = handle_charref = handle_data = handle_comment = \
    handle_decl = handle_pi = unknown_decl = find_startpos

    def get_result(self):
        """Return the stripped text between the opening and closing tag of
        the wanted element, or None if it was not found or never closed."""
        if self.result is None or len(self.result) != 3:
            return None
        (start_line, start_col), (end_line, end_col) = self.result[1:]
        lines = self.html.split('\n')[start_line - 1:end_line]
        lines[0] = lines[0][start_col:]
        if len(lines) == 1:
            # The first line was just shortened, so the end column moved
            end_col -= start_col
        lines[-1] = lines[-1][:end_col]
        return '\n'.join(lines).strip()
def get_element_by_id(id, html):
    """Return the content of the tag with the specified id in the passed HTML document"""
    parser = IDParser(id)
    try:
        parser.loads(html)
    except compat_html_parser.HTMLParseError:
        # Best effort: keep whatever the parser collected before failing
        pass
    return parser.get_result()
def clean_html(html):
    """Clean an HTML snippet into a readable string

    Existing newlines become spaces, every <br> variant becomes a newline,
    all remaining tags are removed and HTML entities are decoded with
    unescapeHTML().
    """
    # Newline vs <br />
    html = html.replace('\n', ' ')
    # Raw string: '\s' is not a valid escape in an ordinary string literal
    html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
    # Strip html tags
    html = re.sub(r'<.*?>', '', html)
    # Replace html entities
    html = unescapeHTML(html)
    return html
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    The special name u'-' selects standard output instead of a file; for a
    binary open_mode the underlying byte stream is returned when
    sys.stdout has one.

    It returns the tuple (stream, definitive_file_name).
    """
    try:
        if filename == u'-':
            if sys.platform == 'win32':
                import msvcrt
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
            stream = sys.stdout
            if 'b' in open_mode:
                # On Python 3 sys.stdout only accepts text; bytes have to
                # go to its .buffer (absent on Python 2 and on some
                # replacement streams, hence the getattr fallback).
                stream = getattr(sys.stdout, 'buffer', sys.stdout)
            return (stream, filename)
        stream = open(encodeFilename(filename), open_mode)
        return (stream, filename)
    except (IOError, OSError):
        # In case of error, try to remove win32 forbidden chars
        filename = re.sub(u'[/<>:"\\|\\\\?\\*]', u'#', filename)

        # An exception here should be caught in the caller
        stream = open(encodeFilename(filename), open_mode)
        return (stream, filename)
def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp

    Returns None when email.utils.parsedate_tz() cannot parse timestr.
    """
    timetuple = email.utils.parsedate_tz(timestr)
    if timetuple is None:
        return None
    return email.utils.mktime_tz(timetuple)
def sanitize_filename(s, restricted=False):
    """Sanitizes a string so it could be used as part of a filename.
    If restricted is set, use a stricter subset of allowed characters.

    The result is never empty: when nothing is left, '_' is returned.
    """
    def replace_insane(char):
        """Return the replacement text for a single character."""
        code = ord(char)
        if char == '?' or code < 32 or code == 127:
            return ''
        if char == '"':
            return '' if restricted else '\''
        if char == ':':
            return '_-' if restricted else ' -'
        if char in '\\/|*<>':
            return '_'
        if restricted and (char in '!&\'' or char.isspace() or code > 127):
            return '_'
        return char

    result = u''.join(replace_insane(char) for char in s)
    # Collapse every run of underscores into a single one
    while '__' in result:
        result = result.replace('__', '_')
    result = result.strip('_')
    # Common case of "Foreign band name - English song title"
    if restricted and result.startswith('-_'):
        result = result[2:]
    if not result:
        result = '_'
    return result
def orderedSet(iterable):
    """ Remove all duplicates from the input iterable

    Returns a list holding the first occurrence of every element, in the
    order the elements were first seen.
    """
    res = []
    for el in iterable:
        # List membership keeps this usable for unhashable elements too
        if el not in res:
            res.append(el)
    return res
def unescapeHTML(s):
    """Replace the HTML entities in s by the characters they stand for.

    Every '&...;' sequence is passed to htmlentity_transform().

    @param s a unicode string
    """
    assert isinstance(s, type(u''))

    result = re.sub(u'(?u)&(.+?);', htmlentity_transform, s)
    return result
def encodeFilename(s):
    """Return s in the form that should be handed to file system calls.

    The name is returned unchanged on Python 3 and on Windows 2000 and
    later; otherwise it is encoded with the file system encoding, and
    characters that cannot be encoded are dropped.

    @param s The name of the file (a unicode string)
    """

    assert isinstance(s, type(u''))

    # Python 3 has a Unicode API: encoding here would only lose characters
    if sys.version_info >= (3, 0):
        return s

    if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
        # Pass u'' directly to use Unicode APIs on Windows 2000 and up
        # (Detecting Windows NT 4 is tricky because 'major >= 4' would
        # match Windows 9x series as well. Besides, NT 4 is obsolete.)
        return s

    # getfilesystemencoding() may return None on Python 2
    encoding = sys.getfilesystemencoding() or 'utf-8'
    return s.encode(encoding, 'ignore')
class DownloadError(Exception):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """
    pass
class SameFileError(Exception):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
    pass
class PostProcessingError(Exception):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """
    pass
class MaxDownloadsReached(Exception):
    """ --max-downloads limit has been reached. """
    pass
class UnavailableVideoError(Exception):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """
    pass
class ContentTooShortError(Exception):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """
    # Amount actually received and amount the server announced
    downloaded = None
    expected = None

    def __init__(self, downloaded, expected):
        # Hand both values to Exception so that args, repr() and pickling
        # carry them instead of an empty tuple.
        Exception.__init__(self, downloaded, expected)
        self.downloaded = downloaded
        self.expected = expected
class Trouble(Exception):
    """Trouble helper exception

    This is an exception to be handled with
    FileDownloader.trouble
    """
    pass
class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-No-Compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    @staticmethod
    def deflate(data):
        """Decompress a "deflate" encoded body.

        The data is first treated as a raw deflate stream; if zlib rejects
        it, it is decompressed again as a zlib-wrapped stream.
        """
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def addinfourl_wrapper(stream, headers, url, code):
        """Build an addinfourl response object carrying the status code,
        whether or not the constructor accepts it as an argument."""
        if hasattr(compat_urllib_request.addinfourl, 'getcode'):
            return compat_urllib_request.addinfourl(stream, headers, url, code)
        ret = compat_urllib_request.addinfourl(stream, headers, url)
        ret.code = code
        return ret

    def http_request(self, req):
        """Apply std_headers to req and honour Youtubedl-no-compression."""
        for name, value in std_headers.items():
            # Drop an entry stored under the exact same key first, so that
            # the standard value is the only one left.
            if name in req.headers:
                del req.headers[name]
            req.add_header(name, value)
        if 'Youtubedl-no-compression' in req.headers:
            if 'Accept-encoding' in req.headers:
                del req.headers['Accept-encoding']
            # Internal marker only: it must not be sent to the server
            del req.headers['Youtubedl-no-compression']
        return req

    def http_response(self, req, resp):
        """Return resp with a gzip or deflate encoded body decoded.

        Responses with any other Content-encoding are returned unchanged.
        The replacement object keeps the original headers, url, code and
        msg.
        """
        encoding = resp.headers.get('Content-encoding', '')
        if encoding == 'gzip':
            body = gzip.GzipFile(fileobj=io.BytesIO(resp.read()), mode='r')
        elif encoding == 'deflate':
            body = io.BytesIO(self.deflate(resp.read()))
        else:
            return resp
        decoded = self.addinfourl_wrapper(body, resp.headers, resp.url, resp.code)
        decoded.msg = resp.msg
        return decoded

    # HTTPS traffic gets exactly the same treatment
    https_request = http_request
    https_response = http_response