2 # -*- coding: utf-8 -*-
15 import urllib.request as compat_urllib_request
16 except ImportError: # Python 2
17 import urllib2 as compat_urllib_request
20 import urllib.error as compat_urllib_error
21 except ImportError: # Python 2
22 import urllib2 as compat_urllib_error
25 import urllib.parse as compat_urllib_parse
26 except ImportError: # Python 2
27 import urllib as compat_urllib_parse
30 import http.cookiejar as compat_cookiejar
31 except ImportError: # Python 2
32 import cookielib as compat_cookiejar
35 import html.entities as compat_html_entities
36 except ImportError: # Python 2
37 import htmlentitydefs as compat_html_entities
40 import html.parser as compat_html_parser
41 except ImportError: # Python 2
42 import HTMLParser as compat_html_parser
45 import http.client as compat_http_client
46 except ImportError: # Python 2
47 import httplib as compat_http_client
50 from urllib.parse import parse_qs as compat_parse_qs
51 except ImportError: # Python 2
52 from urlparse import parse_qs as compat_parse_qs
# Text type and code-point-to-character function under a single name on
# both major Python versions: the Python 2 builtins are tried first and the
# Python 3 equivalents are used when those names do not exist.
try:
    compat_str = unicode # Python 2
except NameError: # Python 3
    compat_str = str

try:
    compat_chr = unichr # Python 2
except NameError: # Python 3
    compat_chr = chr
# Default HTTP headers, keyed by header name. YoutubeDLHandler.http_request
# iterates over this mapping and passes each entry to req.add_header().
std_headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
}
def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks.
    """
    # NOTE(review): only the getpreferredencoding() call is visible in the
    # damaged source; the usability probe and the UTF-8 fallback are a
    # reconstruction of the "further tweaks" -- confirm against history.
    try:
        pref = locale.getpreferredencoding()
        # Reject names the codec machinery does not know or cannot use.
        u'TEST'.encode(pref)
    except Exception:
        pref = 'UTF-8'

    return pref
def htmlentity_transform(matchobj):
    """Transforms an HTML entity to a character.

    This function receives a match object and is intended to be used with
    the re.sub() function. Group 1 of the match must hold the entity text
    without the leading '&' and the trailing ';'.

    Returns the decoded character, or the literal '&...;' text when the
    entity is unknown or names a code point that cannot be represented.
    """
    entity = matchobj.group(1)

    # Known non-numeric HTML entity
    if entity in compat_html_entities.name2codepoint:
        return compat_chr(compat_html_entities.name2codepoint[entity])

    # Numeric entity: decimal (#233) or hexadecimal (#xE9). The pattern is
    # anchored and accepts the hex digits a-f, so '#x2F' is read as 0x2F
    # instead of stopping after the first decimal digit.
    mobj = re.match(u'(?u)#([xX][0-9a-fA-F]+|[0-9]+)$', entity)
    if mobj is not None:
        numstr = mobj.group(1)
        if numstr[0] in u'xX':
            base = 16
            numstr = u'0%s' % numstr
        else:
            base = 10
        try:
            return compat_chr(int(numstr, base))
        except (ValueError, OverflowError):
            # Code point outside the representable range: keep it literal
            pass

    # Unknown entity in name, return its literal representation
    return (u'&%s;' % entity)
# Replace the start-tag locating regex exposed by the HTML parser module
# with this pattern before IDParser is defined (backport bugfix).
compat_html_parser.locatestarttagend = re.compile(r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re.VERBOSE) # backport bugfix
class IDParser(compat_html_parser.HTMLParser):
    """Modified HTMLParser that isolates a tag with the specified id"""

    def __init__(self, id):
        self.id = id
        # None until the wanted tag is met; afterwards a list that grows to
        # [tag name, start position, end position] (positions as returned
        # by getpos(): 1-based line, 0-based column).
        self.result = None
        # True while the parser is inside the wanted tag
        self.started = False
        # Open-tag counters per tag name, maintained while started
        self.depth = {}
        # The complete document passed to loads()
        self.html = None
        # True right after the wanted opening tag, until the next event
        self.watch_startpos = False
        self.error_count = 0
        compat_html_parser.HTMLParser.__init__(self)

    def error(self, message):
        """Tolerate a limited number of parse errors before the wanted tag.

        Raises HTMLParseError once more than 10 errors were counted or when
        the error happens inside the wanted tag.
        """
        if self.error_count > 10 or self.started:
            raise compat_html_parser.HTMLParseError(message, self.getpos())
        self.rawdata = '\n'.join(self.html.split('\n')[self.getpos()[0]:]) # skip one line
        self.error_count += 1
        # NOTE(review): restarting the parse on the shortened buffer is
        # reconstructed; the line is missing from the damaged source.
        self.goahead(1)

    def loads(self, html):
        """Parse the whole document; read the outcome with get_result()."""
        # NOTE(review): body reconstructed -- get_result() and error() need
        # self.html, and the document has to be fed to the parser.
        self.html = html
        self.feed(html)
        self.close()

    def handle_starttag(self, tag, attrs):
        attrs = dict(attrs)
        if self.started:
            self.find_startpos(None)
        if 'id' in attrs and attrs['id'] == self.id:
            self.result = [tag]
            self.started = True
            self.watch_startpos = True
        if self.started:
            if tag not in self.depth:
                self.depth[tag] = 0
            self.depth[tag] += 1

    def handle_endtag(self, tag):
        if self.started:
            if tag in self.depth:
                self.depth[tag] -= 1
            # The wanted element ends when its own tag name is balanced
            if self.depth[self.result[0]] == 0:
                self.started = False
                self.result.append(self.getpos())

    def find_startpos(self, x):
        """Needed to put the start position of the result (self.result[1])
        after the opening tag with the requested id"""
        if self.watch_startpos:
            self.watch_startpos = False
            self.result.append(self.getpos())
    # Whatever follows the opening tag first records the start position
    handle_entityref = handle_charref = handle_data = handle_comment = \
        handle_decl = handle_pi = unknown_decl = find_startpos

    def get_result(self):
        """Return the stripped text between the wanted tag's opening and
        closing tags, or None when the tag was not fully isolated."""
        if self.result is None:
            return None
        if len(self.result) != 3:
            return None
        start_line, start_col = self.result[1]
        end_line, end_col = self.result[2]
        lines = self.html.split('\n')
        lines = lines[start_line-1:end_line]
        lines[0] = lines[0][start_col:]
        if len(lines) == 1:
            # Same line: the start column has already been cut away
            lines[-1] = lines[-1][:end_col-start_col]
        else:
            lines[-1] = lines[-1][:end_col]
        return '\n'.join(lines).strip()
def get_element_by_id(id, html):
    """Return the content of the tag with the specified id in the passed HTML document"""
    parser = IDParser(id)
    try:
        parser.loads(html)
    except compat_html_parser.HTMLParseError:
        # Best effort: fall through and report whatever the parser holds
        pass
    return parser.get_result()
def clean_html(html):
    """Clean an HTML snippet into a readable string"""
    # Newlines carry no meaning in HTML; <br> tags become the line breaks
    html = html.replace('\n', ' ')
    html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
    # Strip the remaining tags
    html = re.sub(r'<.*?>', '', html)
    # Replace html entities
    html = unescapeHTML(html)
    # NOTE(review): the return statement is missing from the damaged source;
    # the processed text is returned as is.
    return html
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    It returns the tuple (stream, definitive_file_name).
    """
    try:
        # NOTE(review): the condition selecting sys.stdout is missing from
        # the damaged source; u'-' is assumed -- confirm against callers.
        if filename == u'-':
            if sys.platform == 'win32':
                import msvcrt
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
            return (sys.stdout, filename)
        stream = open(encodeFilename(filename), open_mode)
        return (stream, filename)
    except (IOError, OSError):
        # In case of error, try to remove win32 forbidden chars
        filename = re.sub(u'[/<>:"\\|\\\\?\\*]', u'#', filename)

        # An exception here should be caught in the caller
        stream = open(encodeFilename(filename), open_mode)
        return (stream, filename)
def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp

    Returns None when email.utils.parsedate_tz() cannot parse the string.
    """
    timestamp = None
    timetuple = email.utils.parsedate_tz(timestr)
    if timetuple is not None:
        timestamp = email.utils.mktime_tz(timetuple)
    return timestamp
def sanitize_filename(s, restricted=False):
    """Sanitizes a string so it could be used as part of a filename.
    If restricted is set, use a stricter subset of allowed characters.
    """
    def replace_insane(char):
        # Map one character of the input to its replacement text
        if char == '?' or ord(char) < 32 or ord(char) == 127:
            return ''
        elif char == '"':
            return '' if restricted else '\''
        elif char == ':':
            return '_-' if restricted else ' -'
        elif char in '\\/|*<>':
            return '_'
        if restricted and (char in '!&\'' or char.isspace() or ord(char) > 127):
            return '_'
        return char

    result = u''.join(map(replace_insane, s))
    # Collapse the underscore runs produced by adjacent replacements
    while '__' in result:
        result = result.replace('__', '_')
    result = result.strip('_')
    # Common case of "Foreign band name - English song title"
    if restricted and result.startswith('-_'):
        result = result[2:]
    # Never hand back an empty name
    if not result:
        result = '_'
    return result
def orderedSet(iterable):
    """ Remove all duplicates from the input iterable """
    # NOTE(review): the body is missing from the damaged source. This keeps
    # the first occurrence of every element, in input order, and returns a
    # list -- confirm that callers expect a list.
    res = []
    for el in iterable:
        # Equality test instead of hashing, so unhashable elements work too
        if el not in res:
            res.append(el)
    return res
# NOTE(review): the def line is missing from the damaged source; the name
# is taken from the call in clean_html() and the parameter from the body.
def unescapeHTML(s):
    """
    @param s a string

    Returns s with every '&...;' sequence passed through
    htmlentity_transform().
    """
    assert isinstance(s, type(u''))

    result = re.sub(u'(?u)&(.+?);', htmlentity_transform, s)
    return result
def encodeFilename(s):
    """
    @param s The name of the file

    Returns s unchanged on Windows 2000 and up; elsewhere s encoded with
    the file system encoding, silently dropping unencodable characters.
    """

    assert isinstance(s, type(u''))

    if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
        # Pass u'' directly to use Unicode APIs on Windows 2000 and up
        # (Detecting Windows NT 4 is tricky because 'major >= 4' would
        # match Windows 9x series as well. Besides, NT 4 is obsolete.)
        return s
    else:
        return s.encode(sys.getfilesystemencoding(), 'ignore')
class DownloadError(Exception):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """
    pass
class SameFileError(Exception):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
    pass
class PostProcessingError(Exception):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """
    pass
class MaxDownloadsReached(Exception):
    """ --max-downloads limit has been reached. """
    pass
class UnavailableVideoError(Exception):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """
    pass
class ContentTooShortError(Exception):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """
    # Amount received and amount announced by the server.
    # NOTE(review): presumably byte counts; these class-level defaults are
    # reconstructed, the lines are missing from the damaged source.
    downloaded = None
    expected = None

    def __init__(self, downloaded, expected):
        self.downloaded = downloaded
        self.expected = expected
class Trouble(Exception):
    """Trouble helper exception

    This is an exception to be handled with
    FileDownloader.trouble
    """
361 class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
362 """Handler for HTTP requests and responses.
364 This class, when installed with an OpenerDirector, automatically adds
365 the standard headers to every HTTP request and handles gzipped and
366 deflated responses from web servers. If compression is to be avoided in
367 a particular request, the original request in the program code only has
368 to include the HTTP header "Youtubedl-No-Compression", which will be
369 removed before making the real request.
371 Part of this code was copied from:
373 http://techknack.net/python-urllib2-handlers/
375 Andrew Rowls, the author of that code, agreed to release it to the
382 return zlib.decompress(data, -zlib.MAX_WBITS)
384 return zlib.decompress(data)
387 def addinfourl_wrapper(stream, headers, url, code):
388 if hasattr(compat_urllib_request.addinfourl, 'getcode'):
389 return compat_urllib_request.addinfourl(stream, headers, url, code)
390 ret = compat_urllib_request.addinfourl(stream, headers, url)
394 def http_request(self, req):
395 for h in std_headers:
398 req.add_header(h, std_headers[h])
399 if 'Youtubedl-no-compression' in req.headers:
400 if 'Accept-encoding' in req.headers:
401 del req.headers['Accept-encoding']
402 del req.headers['Youtubedl-no-compression']
405 def http_response(self, req, resp):
408 if resp.headers.get('Content-encoding', '') == 'gzip':
409 gz = gzip.GzipFile(fileobj=io.BytesIO(resp.read()), mode='r')
410 resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
411 resp.msg = old_resp.msg
413 if resp.headers.get('Content-encoding', '') == 'deflate':
414 gz = io.BytesIO(self.deflate(resp.read()))
415 resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
416 resp.msg = old_resp.msg