2 # -*- coding: utf-8 -*-
15 import urllib.request as compat_urllib_request
16 except ImportError: # Python 2
17 import urllib2 as compat_urllib_request
20 import urllib.error as compat_urllib_error
21 except ImportError: # Python 2
22 import urllib2 as compat_urllib_error
25 import urllib.parse as compat_urllib_parse
26 except ImportError: # Python 2
27 import urllib as compat_urllib_parse
30 import http.cookiejar as compat_cookiejar
31 except ImportError: # Python 2
32 import cookielib as compat_cookiejar
35 import html.entities as compat_html_entities
36 except ImportError: # Python 2
37 import htmlentitydefs as compat_html_entities
40 import html.parser as compat_html_parser
41 except ImportError: # Python 2
42 import HTMLParser as compat_html_parser
45 import http.client as compat_http_client
46 except ImportError: # Python 2
47 import httplib as compat_http_client
try:
    from urllib.parse import parse_qs as compat_parse_qs
except ImportError:  # Python 2
    # HACK: The following is the correct parse_qs implementation from cpython 3's stdlib.
    # Python 2's version is apparently totally broken

    def _unquote(string, encoding='utf-8', errors='replace'):
        """Percent-decode *string* (a Python 2 byte string), interpreting
        contiguous %XX runs as bytes in *encoding*."""
        if string == '':
            return string
        res = string.split('%')
        if len(res) == 1:
            return string
        if encoding is None:
            encoding = 'utf-8'
        if errors is None:
            errors = 'replace'
        # pct_sequence: contiguous sequence of percent-encoded bytes, decoded
        pct_sequence = b''
        string = res[0]
        for item in res[1:]:
            try:
                if not item:
                    raise ValueError
                pct_sequence += item[:2].decode('hex')
                rest = item[2:]
                if not rest:
                    # This segment was just a single percent-encoded character.
                    # May be part of a sequence of code units, so delay decoding.
                    # (Stored in pct_sequence).
                    continue
            except ValueError:
                rest = '%' + item
            # Encountered non-percent-encoded characters. Flush the current
            # pct_sequence.
            string += pct_sequence.decode(encoding, errors) + rest
            pct_sequence = b''
        if pct_sequence:
            # Flush the final pct_sequence
            string += pct_sequence.decode(encoding, errors)
        return string

    def _parse_qsl(qs, keep_blank_values=False, strict_parsing=False,
                   encoding='utf-8', errors='replace'):
        """Parse a query string into a list of (name, value) pairs.

        Splits on both '&' and ';' pair separators, as cpython 3's
        urllib.parse did at the time this backport was taken.
        """
        qs, _coerce_result = qs, unicode
        pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')]
        r = []
        for name_value in pairs:
            if not name_value and not strict_parsing:
                continue
            nv = name_value.split('=', 1)
            if len(nv) != 2:
                if strict_parsing:
                    raise ValueError("bad query field: %r" % (name_value,))
                # Handle case of a control-name with no equal sign
                if keep_blank_values:
                    nv.append('')
                else:
                    continue
            if len(nv[1]) or keep_blank_values:
                name = nv[0].replace('+', ' ')
                name = _unquote(name, encoding=encoding, errors=errors)
                name = _coerce_result(name)
                value = nv[1].replace('+', ' ')
                value = _unquote(value, encoding=encoding, errors=errors)
                value = _coerce_result(value)
                r.append((name, value))
        return r

    def compat_parse_qs(qs, keep_blank_values=False, strict_parsing=False,
                        encoding='utf-8', errors='replace'):
        """Parse a query string into a dict mapping names to lists of values
        (same contract as urllib.parse.parse_qs on Python 3)."""
        parsed_result = {}
        pairs = _parse_qsl(qs, keep_blank_values, strict_parsing,
                           encoding=encoding, errors=errors)
        for name, value in pairs:
            if name in parsed_result:
                parsed_result[name].append(value)
            else:
                parsed_result[name] = [value]
        return parsed_result
# Unified text type and chr() across Python 2/3: on Python 2 the names
# `unicode`/`unichr` exist; on Python 3 they raise NameError and we fall
# back to the builtins.
try:
    compat_str = unicode  # Python 2
except NameError:
    compat_str = str

try:
    compat_chr = unichr  # Python 2
except NameError:
    compat_chr = chr
# Default headers added to every outgoing HTTP request (a Firefox 10 UA and
# matching Accept-* headers); see YoutubeDLHandler.http_request below.
std_headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
}
def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks.
    """
    try:
        pref = locale.getpreferredencoding()
        # Verify the reported encoding actually works; some platforms
        # report encodings that cannot encode anything.
        u'TEST'.encode(pref)
    except Exception:
        pref = 'UTF-8'

    return pref
# compat_print: print a unicode string on both Python 2 and 3.
if sys.version_info < (3, 0):
    def compat_print(s):
        # Python 2: encode to the terminal's preferred encoding, escaping
        # unrepresentable characters as HTML character references.
        print(s.encode(preferredencoding(), 'xmlcharrefreplace'))
else:
    def compat_print(s):
        # Python 3: print() handles unicode natively; only accept text.
        assert type(s) == type(u'')
        print(s)
def htmlentity_transform(matchobj):
    """Transforms an HTML entity to a character.

    This function receives a match object and is intended to be used with
    the re.sub() function (group 1 is the entity body, without '&' and ';').
    """
    entity = matchobj.group(1)

    # Known non-numeric HTML entity
    if entity in compat_html_entities.name2codepoint:
        return compat_chr(compat_html_entities.name2codepoint[entity])

    # Numeric character reference: decimal (&#65;) or hexadecimal (&#x41;).
    # BUGFIX: the previous pattern u'(?u)#(x?\\d+)' could not match the hex
    # digits a-f, so e.g. &#x4F; was truncated to '#x4' and mis-decoded.
    mobj = re.match(u'(?u)#(x[0-9a-fA-F]+|[0-9]+)', entity)
    if mobj is not None:
        numstr = mobj.group(1)
        if numstr.startswith(u'x'):
            base = 16
            numstr = u'0%s' % numstr  # '0x..' so int() accepts the prefix
        else:
            base = 10
        return compat_chr(int(numstr, base))

    # Unknown entity in name, return its literal representation
    return (u'&%s;' % entity)
# Backport bugfix: replace the html parser module's start-tag-end regex with
# the corrected version (handles unquoted/bare attribute values properly).
compat_html_parser.locatestarttagend = re.compile(r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re.VERBOSE) # backport bugfix
class IDParser(compat_html_parser.HTMLParser):
    """Modified HTMLParser that isolates a tag with the specified id.

    Feed a document via loads(); get_result() then returns the raw HTML
    between the opening tag carrying that id and its matching close tag.
    """
    def __init__(self, id):
        self.id = id
        self.result = None          # [tag, startpos, endpos] once found
        self.started = False        # inside the target element?
        self.depth = {}             # per-tag open-tag nesting counters
        self.html = None            # the full document, kept for slicing
        self.watch_startpos = False # record pos of the next event as start
        self.error_count = 0
        compat_html_parser.HTMLParser.__init__(self)

    def error(self, message):
        # Tolerate up to 10 parse errors before the target element is found
        # by skipping the offending line and resuming.
        if self.error_count > 10 or self.started:
            raise compat_html_parser.HTMLParseError(message, self.getpos())
        self.rawdata = '\n'.join(self.html.split('\n')[self.getpos()[0]:])  # skip one line
        self.error_count += 1
        self.goahead(1)

    def loads(self, html):
        """Parse the whole document, remembering it for get_result()."""
        self.html = html
        self.feed(html)
        self.close()

    def handle_starttag(self, tag, attrs):
        attrs = dict(attrs)
        if self.started:
            self.find_startpos(None)
        if 'id' in attrs and attrs['id'] == self.id:
            self.result = [tag]
            self.started = True
            self.watch_startpos = True
        if self.started:
            if not tag in self.depth: self.depth[tag] = 0
            self.depth[tag] += 1

    def handle_endtag(self, tag):
        if self.started:
            if tag in self.depth: self.depth[tag] -= 1
            # The element is closed once its own tag's depth returns to 0.
            if self.depth[self.result[0]] == 0:
                self.started = False
                self.result.append(self.getpos())

    def find_startpos(self, x):
        """Needed to put the start position of the result (self.result[1])
        after the opening tag with the requested id"""
        if self.watch_startpos:
            self.watch_startpos = False
            self.result.append(self.getpos())
    # Any event right after the opening tag pins the content start position.
    handle_entityref = handle_charref = handle_data = handle_comment = \
        handle_decl = handle_pi = unknown_decl = find_startpos

    def get_result(self):
        """Return the inner HTML of the matched element, or None."""
        if self.result is None:
            return None
        if len(self.result) != 3:
            return None
        lines = self.html.split('\n')
        lines = lines[self.result[1][0]-1:self.result[2][0]]
        lines[0] = lines[0][self.result[1][1]:]
        if len(lines) == 1:
            # Start and end on the same line: the end offset must be taken
            # relative to the already-trimmed start offset.
            lines[-1] = lines[-1][:self.result[2][1]-self.result[1][1]]
        else:
            lines[-1] = lines[-1][:self.result[2][1]]
        return '\n'.join(lines).strip()
def get_element_by_id(id, html):
    """Return the content of the tag with the specified id in the passed HTML document"""
    parser = IDParser(id)
    try:
        parser.loads(html)
    except compat_html_parser.HTMLParseError:
        # Best effort: return whatever was isolated before the parse failed.
        pass
    return parser.get_result()
def clean_html(html):
    """Clean an HTML snippet into a readable string"""
    # Newline vs <br />: collapse source newlines, then turn <br> into '\n'.
    html = html.replace('\n', ' ')
    html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
    # Strip html tags
    html = re.sub('<.*?>', '', html)
    # Replace html entities
    html = unescapeHTML(html)
    return html
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    It returns the tuple (stream, definitive_file_name).
    """
    try:
        if filename == u'-':
            # '-' means stdout; on Windows switch it to binary mode first.
            if sys.platform == 'win32':
                import msvcrt
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
            return (sys.stdout, filename)
        stream = open(encodeFilename(filename), open_mode)
        return (stream, filename)
    except (IOError, OSError) as err:
        # In case of error, try to remove win32 forbidden chars
        filename = re.sub(u'[/<>:"\\|\\\\?\\*]', u'#', filename)

        # An exception here should be caught in the caller
        stream = open(encodeFilename(filename), open_mode)
        return (stream, filename)
def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp.

    Returns the POSIX timestamp as an int/float, or None when *timestr*
    cannot be parsed.
    """
    timestamp = None
    timetuple = email.utils.parsedate_tz(timestr)
    if timetuple is not None:
        timestamp = email.utils.mktime_tz(timetuple)
    return timestamp
def sanitize_filename(s, restricted=False):
    """Sanitizes a string so it could be used as part of a filename.
    If restricted is set, use a stricter subset of allowed characters.
    """
    def replace_insane(char):
        # Drop '?' and control characters outright.
        if char == '?' or ord(char) < 32 or ord(char) == 127:
            return ''
        elif char == '"':
            return '' if restricted else '\''
        elif char == ':':
            return '_-' if restricted else ' -'
        elif char in '\\/|*<>':
            return '_'
        if restricted and (char in '!&\'' or char.isspace()):
            return '_'
        if restricted and ord(char) > 127:
            return '_'
        return char

    result = u''.join(map(replace_insane, s))
    # Collapse runs of underscores produced by the replacements above.
    while '__' in result:
        result = result.replace('__', '_')
    result = result.strip('_')
    # Common case of "Foreign band name - English song title"
    if restricted and result.startswith('-_'):
        result = result[2:]
    if not result:
        result = '_'
    return result
def orderedSet(iterable):
    """ Remove all duplicates from the input iterable, preserving the
    order of first occurrence. Returns a list. """
    res = []
    for el in iterable:
        if el not in res:
            res.append(el)
    return res
def unescapeHTML(s):
    """Replace HTML entities in *s* with their characters.

    @param s a unicode string
    """
    assert type(s) == type(u'')

    # htmlentity_transform resolves each &...; occurrence.
    result = re.sub(u'(?u)&(.+?);', htmlentity_transform, s)
    return result
def encodeFilename(s):
    """Encode a unicode filename for the current platform's filesystem API.

    @param s The name of the file
    """

    assert type(s) == type(u'')

    # Python 3 has a Unicode API
    if sys.version_info >= (3, 0):
        return s

    if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
        # Pass u'' directly to use Unicode APIs on Windows 2000 and up
        # (Detecting Windows NT 4 is tricky because 'major >= 4' would
        # match Windows 9x series as well. Besides, NT 4 is obsolete.)
        return s
    else:
        return s.encode(sys.getfilesystemencoding(), 'ignore')
class DownloadError(Exception):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """
    pass
class SameFileError(Exception):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
    pass
class PostProcessingError(Exception):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """
    pass
class MaxDownloadsReached(Exception):
    """ --max-downloads limit has been reached. """
    pass
class UnavailableVideoError(Exception):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """
    pass
class ContentTooShortError(Exception):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """
    # Both in bytes
    downloaded = None
    expected = None

    def __init__(self, downloaded, expected):
        self.downloaded = downloaded
        self.expected = expected
class Trouble(Exception):
    """Trouble helper exception

    This is an exception to be handled with
    FileDownloader.trouble
    """
    pass
445 class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
446 """Handler for HTTP requests and responses.
448 This class, when installed with an OpenerDirector, automatically adds
449 the standard headers to every HTTP request and handles gzipped and
450 deflated responses from web servers. If compression is to be avoided in
451 a particular request, the original request in the program code only has
452 to include the HTTP header "Youtubedl-No-Compression", which will be
453 removed before making the real request.
455 Part of this code was copied from:
457 http://techknack.net/python-urllib2-handlers/
459 Andrew Rowls, the author of that code, agreed to release it to the
466 return zlib.decompress(data, -zlib.MAX_WBITS)
468 return zlib.decompress(data)
471 def addinfourl_wrapper(stream, headers, url, code):
472 if hasattr(compat_urllib_request.addinfourl, 'getcode'):
473 return compat_urllib_request.addinfourl(stream, headers, url, code)
474 ret = compat_urllib_request.addinfourl(stream, headers, url)
478 def http_request(self, req):
479 for h in std_headers:
482 req.add_header(h, std_headers[h])
483 if 'Youtubedl-no-compression' in req.headers:
484 if 'Accept-encoding' in req.headers:
485 del req.headers['Accept-encoding']
486 del req.headers['Youtubedl-no-compression']
489 def http_response(self, req, resp):
492 if resp.headers.get('Content-encoding', '') == 'gzip':
493 gz = gzip.GzipFile(fileobj=io.BytesIO(resp.read()), mode='r')
494 resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
495 resp.msg = old_resp.msg
497 if resp.headers.get('Content-encoding', '') == 'deflate':
498 gz = io.BytesIO(self.deflate(resp.read()))
499 resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
500 resp.msg = old_resp.msg