# -*- coding: utf-8 -*-
16 import urllib.request as compat_urllib_request
17 except ImportError: # Python 2
18 import urllib2 as compat_urllib_request
21 import urllib.error as compat_urllib_error
22 except ImportError: # Python 2
23 import urllib2 as compat_urllib_error
26 import urllib.parse as compat_urllib_parse
27 except ImportError: # Python 2
28 import urllib as compat_urllib_parse
31 from urllib.parse import urlparse as compat_urllib_parse_urlparse
32 except ImportError: # Python 2
33 from urlparse import urlparse as compat_urllib_parse_urlparse
36 import http.cookiejar as compat_cookiejar
37 except ImportError: # Python 2
38 import cookielib as compat_cookiejar
41 import html.entities as compat_html_entities
42 except ImportError: # Python 2
43 import htmlentitydefs as compat_html_entities
46 import html.parser as compat_html_parser
47 except ImportError: # Python 2
48 import HTMLParser as compat_html_parser
51 import http.client as compat_http_client
52 except ImportError: # Python 2
53 import httplib as compat_http_client
# Compatibility shim for subprocess.DEVNULL (added in Python 3.3).
# PEP 8 (E731): use def instead of assigning a lambda to a name, so the
# callable has a useful __name__ in tracebacks.
try:
    from subprocess import DEVNULL

    def compat_subprocess_get_DEVNULL():
        """Return the subprocess.DEVNULL sentinel (Python 3.3+)."""
        return DEVNULL
except ImportError:
    def compat_subprocess_get_DEVNULL():
        """Fallback: open os.devnull for writing (caller owns the handle)."""
        return open(os.path.devnull, 'w')
try:
    from urllib.parse import parse_qs as compat_parse_qs
except ImportError:  # Python 2
    # HACK: The following is the correct parse_qs implementation from cpython 3's stdlib.
    # Python 2's version is apparently totally broken

    def _unquote(string, encoding='utf-8', errors='replace'):
        """Percent-decode *string*, decoding byte sequences with *encoding*."""
        if string == '':
            return string
        res = string.split('%')
        if len(res) == 1:
            return string
        if encoding is None:
            encoding = 'utf-8'
        if errors is None:
            errors = 'replace'
        # pct_sequence: contiguous sequence of percent-encoded bytes, decoded
        pct_sequence = b''
        string = res[0]
        for item in res[1:]:
            try:
                if not item:
                    raise ValueError
                pct_sequence += item[:2].decode('hex')
                rest = item[2:]
                if not rest:
                    # This segment was just a single percent-encoded character.
                    # May be part of a sequence of code units, so delay decoding.
                    # (Stored in pct_sequence).
                    continue
            except ValueError:
                rest = '%' + item
            # Encountered non-percent-encoded characters. Flush the current
            # pct_sequence.
            string += pct_sequence.decode(encoding, errors) + rest
            pct_sequence = b''
        if pct_sequence:
            # Flush the final pct_sequence
            string += pct_sequence.decode(encoding, errors)
        return string

    def _parse_qsl(qs, keep_blank_values=False, strict_parsing=False,
                   encoding='utf-8', errors='replace'):
        """Split a query string into an ordered list of (name, value) pairs."""
        qs, _coerce_result = qs, unicode
        pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')]
        r = []
        for name_value in pairs:
            if not name_value and not strict_parsing:
                continue
            nv = name_value.split('=', 1)
            if len(nv) != 2:
                if strict_parsing:
                    raise ValueError("bad query field: %r" % (name_value,))
                # Handle case of a control-name with no equal sign
                if keep_blank_values:
                    nv.append('')
                else:
                    continue
            if len(nv[1]) or keep_blank_values:
                name = nv[0].replace('+', ' ')
                name = _unquote(name, encoding=encoding, errors=errors)
                name = _coerce_result(name)
                value = nv[1].replace('+', ' ')
                value = _unquote(value, encoding=encoding, errors=errors)
                value = _coerce_result(value)
                r.append((name, value))
        return r

    def compat_parse_qs(qs, keep_blank_values=False, strict_parsing=False,
                        encoding='utf-8', errors='replace'):
        """Parse a query string into a dict mapping names to value lists."""
        parsed_result = {}
        pairs = _parse_qsl(qs, keep_blank_values, strict_parsing,
                           encoding=encoding, errors=errors)
        for name, value in pairs:
            if name in parsed_result:
                parsed_result[name].append(value)
            else:
                parsed_result[name] = [value]
        return parsed_result
# Text-type compatibility aliases: unicode/unichr exist only on Python 2;
# on Python 3 the plain str/chr builtins already handle Unicode.
try:
    compat_str = unicode  # Python 2
except NameError:
    compat_str = str

try:
    compat_chr = unichr  # Python 2
except NameError:
    compat_chr = chr
# Default HTTP headers attached to every outgoing request (see
# YoutubeDLHandler.http_request), mimicking a desktop Firefox browser.
std_headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
}
def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks.
    """
    try:
        pref = locale.getpreferredencoding()
        # Verify the reported codec actually exists and can encode text.
        u'TEST'.encode(pref)
    # Narrowed from a bare `except:`, which would also swallow
    # KeyboardInterrupt/SystemExit; Exception keeps the best-effort
    # UTF-8 fallback without hiding interpreter-exit signals.
    except Exception:
        pref = 'UTF-8'

    return pref
# stdout wants bytes on Python 2 but text on Python 3, so pick the
# appropriate print wrapper at import time.
if sys.version_info < (3, 0):
    def compat_print(s):
        """Print a unicode string, encoded for the current locale."""
        print(s.encode(preferredencoding(), 'xmlcharrefreplace'))
else:
    def compat_print(s):
        """Print a text string (must already be unicode/str)."""
        assert type(s) == type(u'')
        print(s)
# In Python 2.x, json.dump expects a bytestream.
# In Python 3.x, it writes to a character stream
if sys.version_info < (3, 0):
    def write_json_file(obj, fn):
        """Serialize *obj* as JSON into the file named *fn* (bytes mode)."""
        with open(fn, 'wb') as f:
            json.dump(obj, f)
else:
    def write_json_file(obj, fn):
        """Serialize *obj* as JSON into the file named *fn* (UTF-8 text)."""
        with open(fn, 'w', encoding='utf-8') as f:
            json.dump(obj, f)
def htmlentity_transform(matchobj):
    """Transforms an HTML entity to a character.

    This function receives a match object and is intended to be used with
    the re.sub() function.
    """
    entity = matchobj.group(1)

    # Known non-numeric HTML entity
    if entity in compat_html_entities.name2codepoint:
        return compat_chr(compat_html_entities.name2codepoint[entity])

    # Numeric character reference: decimal (&#233;) or hex (&#xe9;).
    # BUGFIX: the previous pattern u'#(x?\\d+)' used \d after the optional
    # 'x', so hex references containing a-f (e.g. &#xe9;) could never match
    # and fell through to the literal-representation branch.
    mobj = re.match(u'(?u)#(x[0-9a-fA-F]+|[0-9]+)', entity)
    if mobj is not None:
        numstr = mobj.group(1)
        if numstr.startswith(u'x'):
            base = 16
            # Prefix with '0' so int() sees a standard '0x...' literal.
            numstr = u'0%s' % numstr
        else:
            base = 10
        return compat_chr(int(numstr, base))

    # Unknown entity in name, return its literal representation
    return (u'&%s;' % entity)
compat_html_parser.locatestarttagend = re.compile(r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re.VERBOSE) # backport bugfix
class IDParser(compat_html_parser.HTMLParser):
    """Modified HTMLParser that isolates a tag with the specified id"""
    def __init__(self, id):
        self.id = id
        self.result = None          # [tag, startpos, endpos] once found
        self.started = False        # inside the target element
        self.depth = {}             # per-tag nesting depth while inside it
        self.html = None
        self.watch_startpos = False
        self.error_count = 0
        compat_html_parser.HTMLParser.__init__(self)

    def error(self, message):
        # Tolerate up to 10 parse errors before the target tag is found:
        # resume parsing at the next line instead of giving up outright.
        if self.error_count > 10 or self.started:
            raise compat_html_parser.HTMLParseError(message, self.getpos())
        self.rawdata = '\n'.join(self.html.split('\n')[self.getpos()[0]:]) # skip one line
        self.error_count += 1
        self.goahead(1)

    def loads(self, html):
        """Feed an entire HTML document to the parser."""
        self.html = html
        self.feed(html)
        self.close()

    def handle_starttag(self, tag, attrs):
        attrs = dict(attrs)
        if self.started:
            self.find_startpos(None)
        if 'id' in attrs and attrs['id'] == self.id:
            self.result = [tag]
            self.started = True
            self.watch_startpos = True
        if self.started:
            if not tag in self.depth: self.depth[tag] = 0
            self.depth[tag] += 1

    def handle_endtag(self, tag):
        if self.started:
            if tag in self.depth: self.depth[tag] -= 1
            if self.depth[self.result[0]] == 0:
                self.started = False
                self.result.append(self.getpos())

    def find_startpos(self, x):
        """Needed to put the start position of the result (self.result[1])
        after the opening tag with the requested id"""
        if self.watch_startpos:
            self.watch_startpos = False
            self.result.append(self.getpos())
    handle_entityref = handle_charref = handle_data = handle_comment = \
    handle_decl = handle_pi = unknown_decl = find_startpos

    def get_result(self):
        """Return the text between the target tag's open and close tags,
        or None if the element was not (completely) found."""
        if self.result is None:
            return None
        if len(self.result) != 3:
            return None
        lines = self.html.split('\n')
        lines = lines[self.result[1][0]-1:self.result[2][0]]
        lines[0] = lines[0][self.result[1][1]:]
        if len(lines) == 1:
            # Start offset was already removed above, so the end offset
            # must be taken relative to the start offset.
            lines[-1] = lines[-1][:self.result[2][1]-self.result[1][1]]
        lines[-1] = lines[-1][:self.result[2][1]]
        return '\n'.join(lines).strip()
def get_element_by_id(id, html):
    """Return the content of the tag with the specified id in the passed HTML document"""
    id_parser = IDParser(id)
    try:
        id_parser.loads(html)
    except compat_html_parser.HTMLParseError:
        # Best effort: return whatever was isolated before the error.
        pass
    return id_parser.get_result()
def clean_html(html):
    """Clean an HTML snippet into a readable string"""
    # Newline vs <br />
    html = html.replace('\n', ' ')
    # Raw strings for the regexes: '\s' in a plain string literal is an
    # invalid escape sequence (DeprecationWarning on modern Python).
    html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
    # Strip html tags
    html = re.sub(r'<.*?>', '', html)
    # Replace html entities
    html = unescapeHTML(html)
    return html
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    It returns the tuple (stream, definitive_file_name).
    """
    try:
        if filename == u'-':
            # '-' means stdout; on Windows, put stdout into binary mode first.
            if sys.platform == 'win32':
                import msvcrt
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
            return (sys.stdout, filename)
        stream = open(encodeFilename(filename), open_mode)
        return (stream, filename)
    except (IOError, OSError) as err:
        # In case of error, try to remove win32 forbidden chars
        filename = re.sub(u'[/<>:"\\|\\\\?\\*]', u'#', filename)

        # An exception here should be caught in the caller
        stream = open(encodeFilename(filename), open_mode)
        return (stream, filename)
def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp"""
    parsed = email.utils.parsedate_tz(timestr)
    # parsedate_tz returns None for unparseable input; propagate that.
    return None if parsed is None else email.utils.mktime_tz(parsed)
def sanitize_filename(s, restricted=False, is_id=False):
    """Sanitizes a string so it could be used as part of a filename.
    If restricted is set, use a stricter subset of allowed characters.
    Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
    """
    def replace_insane(char):
        # Control characters and '?' are never allowed.
        if char == '?' or ord(char) < 32 or ord(char) == 127:
            return ''
        elif char == '"':
            return '' if restricted else '\''
        elif char == ':':
            return '_-' if restricted else ' -'
        elif char in '\\/|*<>':
            return '_'
        if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
            return '_'
        if restricted and ord(char) > 127:
            return '_'
        return char

    result = u''.join(map(replace_insane, s))
    if not is_id:
        # Collapse runs of underscores introduced by the replacements.
        while '__' in result:
            result = result.replace('__', '_')
        result = result.strip('_')
        # Common case of "Foreign band name - English song title"
        if restricted and result.startswith('-_'):
            result = result[2:]
        if not result:
            result = '_'
    return result
def orderedSet(iterable):
    """ Remove all duplicates from the input iterable """
    # Linear membership test on a list (not a set) is deliberate: it keeps
    # support for unhashable elements while preserving first-seen order.
    unique = []
    for item in iterable:
        if item not in unique:
            unique.append(item)
    return unique
def unescapeHTML(s):
    """
    @param s a string
    """
    assert type(s) == type(u'')

    # Replace every &name;/&#NN; entity via htmlentity_transform.
    result = re.sub(u'(?u)&(.+?);', htmlentity_transform, s)
    return result
def encodeFilename(s):
    """
    @param s The name of the file
    """

    assert type(s) == type(u'')

    # Python 3 has a Unicode API
    if sys.version_info >= (3, 0):
        return s

    if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
        # Pass u'' directly to use Unicode APIs on Windows 2000 and up
        # (Detecting Windows NT 4 is tricky because 'major >= 4' would
        # match Windows 9x series as well. Besides, NT 4 is obsolete.)
        return s
    else:
        return s.encode(sys.getfilesystemencoding(), 'ignore')
class DownloadError(Exception):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """
    pass
class SameFileError(Exception):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
    pass
class PostProcessingError(Exception):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """
    pass
class MaxDownloadsReached(Exception):
    """ --max-downloads limit has been reached. """
    pass
class UnavailableVideoError(Exception):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """
    pass
class ContentTooShortError(Exception):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """
    # Both in bytes
    downloaded = None
    expected = None

    def __init__(self, downloaded, expected):
        self.downloaded = downloaded
        self.expected = expected
class Trouble(Exception):
    """Trouble helper exception

    This is an exception to be handled with
    FileDownloader.trouble
    """
class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-No-Compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    @staticmethod
    def deflate(data):
        """Decompress a deflate body, tolerating both the raw form and the
        (non-conforming but common) zlib-wrapped form."""
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def addinfourl_wrapper(stream, headers, url, code):
        """Build an addinfourl; older Pythons lack the code argument."""
        if hasattr(compat_urllib_request.addinfourl, 'getcode'):
            return compat_urllib_request.addinfourl(stream, headers, url, code)
        ret = compat_urllib_request.addinfourl(stream, headers, url)
        ret.code = code
        return ret

    def http_request(self, req):
        # Force our standard headers, replacing any the caller set,
        # then honour the Youtubedl-no-compression opt-out marker.
        for h in std_headers:
            if h in req.headers:
                del req.headers[h]
            req.add_header(h, std_headers[h])
        if 'Youtubedl-no-compression' in req.headers:
            if 'Accept-encoding' in req.headers:
                del req.headers['Accept-encoding']
            del req.headers['Youtubedl-no-compression']
        return req

    def http_response(self, req, resp):
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            gz = gzip.GzipFile(fileobj=io.BytesIO(resp.read()), mode='r')
            resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = io.BytesIO(self.deflate(resp.read()))
            resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        return resp

    https_request = http_request
    https_response = http_response