2 # -*- coding: utf-8 -*-
20 import urllib.request as compat_urllib_request
21 except ImportError: # Python 2
22 import urllib2 as compat_urllib_request
25 import urllib.error as compat_urllib_error
26 except ImportError: # Python 2
27 import urllib2 as compat_urllib_error
30 import urllib.parse as compat_urllib_parse
31 except ImportError: # Python 2
32 import urllib as compat_urllib_parse
35 from urllib.parse import urlparse as compat_urllib_parse_urlparse
36 except ImportError: # Python 2
37 from urlparse import urlparse as compat_urllib_parse_urlparse
40 import urllib.parse as compat_urlparse
41 except ImportError: # Python 2
42 import urlparse as compat_urlparse
45 import http.cookiejar as compat_cookiejar
46 except ImportError: # Python 2
47 import cookielib as compat_cookiejar
50 import html.entities as compat_html_entities
51 except ImportError: # Python 2
52 import htmlentitydefs as compat_html_entities
55 import html.parser as compat_html_parser
56 except ImportError: # Python 2
57 import HTMLParser as compat_html_parser
60 import http.client as compat_http_client
61 except ImportError: # Python 2
62 import httplib as compat_http_client
65 from urllib.error import HTTPError as compat_HTTPError
66 except ImportError: # Python 2
67 from urllib2 import HTTPError as compat_HTTPError
70 from subprocess import DEVNULL
71 compat_subprocess_get_DEVNULL = lambda: DEVNULL
73 compat_subprocess_get_DEVNULL = lambda: open(os.path.devnull, 'w')
76 from urllib.parse import parse_qs as compat_parse_qs
77 except ImportError: # Python 2
78 # HACK: The following is the correct parse_qs implementation from cpython 3's stdlib.
79 # Python 2's version is apparently totally broken
80 def _unquote(string, encoding='utf-8', errors='replace'):
83 res = string.split('%')
90 # pct_sequence: contiguous sequence of percent-encoded bytes, decoded
97 pct_sequence += item[:2].decode('hex')
100 # This segment was just a single percent-encoded character.
101 # May be part of a sequence of code units, so delay decoding.
102 # (Stored in pct_sequence).
106 # Encountered non-percent-encoded characters. Flush the current
108 string += pct_sequence.decode(encoding, errors) + rest
111 # Flush the final pct_sequence
112 string += pct_sequence.decode(encoding, errors)
115 def _parse_qsl(qs, keep_blank_values=False, strict_parsing=False,
116 encoding='utf-8', errors='replace'):
117 qs, _coerce_result = qs, unicode
118 pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')]
120 for name_value in pairs:
121 if not name_value and not strict_parsing:
123 nv = name_value.split('=', 1)
126 raise ValueError("bad query field: %r" % (name_value,))
127 # Handle case of a control-name with no equal sign
128 if keep_blank_values:
132 if len(nv[1]) or keep_blank_values:
133 name = nv[0].replace('+', ' ')
134 name = _unquote(name, encoding=encoding, errors=errors)
135 name = _coerce_result(name)
136 value = nv[1].replace('+', ' ')
137 value = _unquote(value, encoding=encoding, errors=errors)
138 value = _coerce_result(value)
139 r.append((name, value))
142 def compat_parse_qs(qs, keep_blank_values=False, strict_parsing=False,
143 encoding='utf-8', errors='replace'):
145 pairs = _parse_qsl(qs, keep_blank_values, strict_parsing,
146 encoding=encoding, errors=errors)
147 for name, value in pairs:
148 if name in parsed_result:
149 parsed_result[name].append(value)
151 parsed_result[name] = [value]
155 compat_str = unicode # Python 2
160 compat_chr = unichr # Python 2
165 if type(c) is int: return c
168 # This is not clearly defined otherwise
169 compiled_regex_type = type(re.compile(''))
172 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0',
173 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
174 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
175 'Accept-Encoding': 'gzip, deflate',
176 'Accept-Language': 'en-us,en;q=0.5',
179 def preferredencoding():
180 """Get preferred encoding.
182 Returns the best encoding scheme for the system, based on
183 locale.getpreferredencoding() and some further tweaks.
186 pref = locale.getpreferredencoding()
193 if sys.version_info < (3,0):
195 print(s.encode(preferredencoding(), 'xmlcharrefreplace'))
198 assert type(s) == type(u'')
201 # In Python 2.x, json.dump expects a bytestream.
202 # In Python 3.x, it writes to a character stream
203 if sys.version_info < (3,0):
204 def write_json_file(obj, fn):
205 with open(fn, 'wb') as f:
208 def write_json_file(obj, fn):
209 with open(fn, 'w', encoding='utf-8') as f:
212 if sys.version_info >= (2,7):
def find_xpath_attr(node, xpath, key, val):
    """Locate the first element matching xpath[@key='val'] under node.

    Builds the attribute predicate by hand, so only simple alphabetic
    attribute names and alphanumeric values are accepted — the asserts
    guard against input that would break the handcrafted expression.
    """
    assert re.match(r'^[a-zA-Z]+$', key)
    assert re.match(r'^[a-zA-Z0-9@\s]*$', val)
    predicate = u"[@%s='%s']" % (key, val)
    return node.find(xpath + predicate)
220 def find_xpath_attr(node, xpath, key, val):
221 for f in node.findall(xpath):
222 if f.attrib.get(key) == val:
226 def htmlentity_transform(matchobj):
227 """Transforms an HTML entity to a character.
229 This function receives a match object and is intended to be used with
230 the re.sub() function.
232 entity = matchobj.group(1)
234 # Known non-numeric HTML entity
235 if entity in compat_html_entities.name2codepoint:
236 return compat_chr(compat_html_entities.name2codepoint[entity])
# NOTE(review): \d matches only decimal digits, so hexadecimal character
# references that contain the letters a-f (e.g. &#xe9;) can never match this
# pattern and fall through to the literal-representation return below.
# Likely bug — verify against a pattern such as #(x[0-9a-fA-F]+|[0-9]+).
238 mobj = re.match(u'(?u)#(x?\\d+)', entity)
240 numstr = mobj.group(1)
241 if numstr.startswith(u'x'):
# Prepending '0' turns 'x...' into Python's 0x hex-literal form for int().
243 numstr = u'0%s' % numstr
246 return compat_chr(int(numstr, base))
248 # Unknown entity in name, return its literal representation
249 return (u'&%s;' % entity)
251 compat_html_parser.locatestarttagend = re.compile(r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re.VERBOSE) # backport bugfix
252 class BaseHTMLParser(compat_html_parser.HTMLParser):
254 compat_html_parser.HTMLParser.__init__(self)
257 def loads(self, html):
262 class AttrParser(BaseHTMLParser):
263 """Modified HTMLParser that isolates a tag with the specified attribute"""
264 def __init__(self, attribute, value):
265 self.attribute = attribute
270 self.watch_startpos = False
272 BaseHTMLParser.__init__(self)
274 def error(self, message):
275 if self.error_count > 10 or self.started:
276 raise compat_html_parser.HTMLParseError(message, self.getpos())
277 self.rawdata = '\n'.join(self.html.split('\n')[self.getpos()[0]:]) # skip one line
278 self.error_count += 1
281 def handle_starttag(self, tag, attrs):
284 self.find_startpos(None)
285 if self.attribute in attrs and attrs[self.attribute] == self.value:
288 self.watch_startpos = True
290 if not tag in self.depth: self.depth[tag] = 0
293 def handle_endtag(self, tag):
295 if tag in self.depth: self.depth[tag] -= 1
296 if self.depth[self.result[0]] == 0:
298 self.result.append(self.getpos())
def find_startpos(self, x):
    """Record the parser position right after the matched element's opening
    tag; the position is appended to self.result (becoming result[1])."""
    if not self.watch_startpos:
        return
    self.watch_startpos = False
    self.result.append(self.getpos())
# Whatever token follows the start tag marks where its content begins, so
# every handler is aliased to the position-capturing callback above.
handle_entityref = handle_charref = handle_data = handle_comment = \
    handle_decl = handle_pi = unknown_decl = find_startpos
309 def get_result(self):
310 if self.result is None:
312 if len(self.result) != 3:
314 lines = self.html.split('\n')
315 lines = lines[self.result[1][0]-1:self.result[2][0]]
316 lines[0] = lines[0][self.result[1][1]:]
318 lines[-1] = lines[-1][:self.result[2][1]-self.result[1][1]]
319 lines[-1] = lines[-1][:self.result[2][1]]
320 return '\n'.join(lines).strip()
321 # Hack for https://github.com/rg3/youtube-dl/issues/662
# Old HTMLParser releases (< 2.7.3) mis-handle the "</scr'+'ipt>" token that
# some pages embed inside scripts; override parse_endtag so that exact token
# is stepped over verbatim while every other end tag is deferred to the
# stock implementation.
322 if sys.version_info < (2, 7, 3):
323 AttrParser.parse_endtag = (lambda self, i:
324 i + len("</scr'+'ipt>")
325 if self.rawdata[i:].startswith("</scr'+'ipt>")
326 else compat_html_parser.HTMLParser.parse_endtag(self, i))
def get_element_by_id(id, html):
    """Return the inner content of the element carrying the given ID in the
    supplied HTML document."""
    # An ID lookup is just the attribute-based lookup specialised to "id".
    return get_element_by_attribute("id", id, html)
332 def get_element_by_attribute(attribute, value, html):
333 """Return the content of the tag with the specified attribute in the passed HTML document"""
334 parser = AttrParser(attribute, value)
337 except compat_html_parser.HTMLParseError:
339 return parser.get_result()
341 class MetaParser(BaseHTMLParser):
343 Modified HTMLParser that isolates a meta tag with the specified name
346 def __init__(self, name):
347 BaseHTMLParser.__init__(self)
352 def handle_starttag(self, tag, attrs):
356 if attrs.get('name') == self.name:
357 self.result = attrs.get('content')
359 def get_result(self):
362 def get_meta_content(name, html):
364 Return the content attribute from the meta tag with the given name attribute.
366 parser = MetaParser(name)
369 except compat_html_parser.HTMLParseError:
371 return parser.get_result()
374 def clean_html(html):
375 """Clean an HTML snippet into a readable string"""
377 html = html.replace('\n', ' ')
378 html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
379 html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
381 html = re.sub('<.*?>', '', html)
382 # Replace html entities
383 html = unescapeHTML(html)
387 def sanitize_open(filename, open_mode):
388 """Try to open the given filename, and slightly tweak it if this fails.
390 Attempts to open the given filename. If this fails, it tries to change
391 the filename slightly, step by step, until it's either able to open it
392 or it fails and raises a final exception, like the standard open()
395 It returns the tuple (stream, definitive_file_name).
399 if sys.platform == 'win32':
401 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
402 return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
403 stream = open(encodeFilename(filename), open_mode)
404 return (stream, filename)
405 except (IOError, OSError) as err:
406 if err.errno in (errno.EACCES,):
409 # In case of error, try to remove win32 forbidden chars
410 alt_filename = os.path.join(
411 re.sub(u'[/<>:"\\|\\\\?\\*]', u'#', path_part)
412 for path_part in os.path.split(filename)
414 if alt_filename == filename:
417 # An exception here should be caught in the caller
418 stream = open(encodeFilename(filename), open_mode)
419 return (stream, alt_filename)
422 def timeconvert(timestr):
423 """Convert RFC 2822 defined time string into system timestamp"""
425 timetuple = email.utils.parsedate_tz(timestr)
426 if timetuple is not None:
427 timestamp = email.utils.mktime_tz(timetuple)
430 def sanitize_filename(s, restricted=False, is_id=False):
431 """Sanitizes a string so it could be used as part of a filename.
432 If restricted is set, use a stricter subset of allowed characters.
433 Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
435 def replace_insane(char):
436 if char == '?' or ord(char) < 32 or ord(char) == 127:
439 return '' if restricted else '\''
441 return '_-' if restricted else ' -'
442 elif char in '\\/|*<>':
444 if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
446 if restricted and ord(char) > 127:
450 result = u''.join(map(replace_insane, s))
452 while '__' in result:
453 result = result.replace('__', '_')
454 result = result.strip('_')
455 # Common case of "Foreign band name - English song title"
456 if restricted and result.startswith('-_'):
462 def orderedSet(iterable):
463 """ Remove all duplicates from the input iterable """
474 assert type(s) == type(u'')
476 result = re.sub(u'(?u)&(.+?);', htmlentity_transform, s)
479 def encodeFilename(s):
481 @param s The name of the file
484 assert type(s) == type(u'')
486 # Python 3 has a Unicode API
487 if sys.version_info >= (3, 0):
490 if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
491 # Pass u'' directly to use Unicode APIs on Windows 2000 and up
492 # (Detecting Windows NT 4 is tricky because 'major >= 4' would
493 # match Windows 9x series as well. Besides, NT 4 is obsolete.)
496 encoding = sys.getfilesystemencoding()
499 return s.encode(encoding, 'ignore')
501 def decodeOption(optval):
504 if isinstance(optval, bytes):
505 optval = optval.decode(preferredencoding())
507 assert isinstance(optval, compat_str)
510 def formatSeconds(secs):
512 return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
514 return '%d:%02d' % (secs // 60, secs % 60)
518 def make_HTTPS_handler(opts):
519 if sys.version_info < (3,2):
520 # Python's 2.x handler is very simplistic
521 return compat_urllib_request.HTTPSHandler()
524 context = ssl.SSLContext(ssl.PROTOCOL_SSLv23)
525 context.set_default_verify_paths()
527 context.verify_mode = (ssl.CERT_NONE
528 if opts.no_check_certificate
529 else ssl.CERT_REQUIRED)
530 return compat_urllib_request.HTTPSHandler(context=context)
532 class ExtractorError(Exception):
533 """Error during info extraction."""
534 def __init__(self, msg, tb=None, expected=False, cause=None):
535 """ tb, if given, is the original traceback (so that it can be printed out).
536 If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
539 if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
542 msg = msg + u'; please report this issue on https://yt-dl.org/bug . Be sure to call youtube-dl with the --verbose flag and include its complete output. Make sure you are using the latest version; type youtube-dl -U to update.'
543 super(ExtractorError, self).__init__(msg)
546 self.exc_info = sys.exc_info() # preserve original exception
549 def format_traceback(self):
550 if self.traceback is None:
552 return u''.join(traceback.format_tb(self.traceback))
555 class DownloadError(Exception):
556 """Download Error exception.
558 This exception may be thrown by FileDownloader objects if they are not
559 configured to continue on errors. They will contain the appropriate
562 def __init__(self, msg, exc_info=None):
563 """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
564 super(DownloadError, self).__init__(msg)
565 self.exc_info = exc_info
568 class SameFileError(Exception):
569 """Same File exception.
571 This exception will be thrown by FileDownloader objects if they detect
572 multiple files would have to be downloaded to the same file on disk.
577 class PostProcessingError(Exception):
578 """Post Processing exception.
580 This exception may be raised by PostProcessor's .run() method to
581 indicate an error in the postprocessing task.
583 def __init__(self, msg):
586 class MaxDownloadsReached(Exception):
587 """ --max-downloads limit has been reached. """
591 class UnavailableVideoError(Exception):
592 """Unavailable Format exception.
594 This exception will be thrown when a video is requested
595 in a format that is not available for that video.
600 class ContentTooShortError(Exception):
601 """Content Too Short exception.
603 This exception may be raised by FileDownloader objects when a file they
604 download is too small for what the server announced first, indicating
605 the connection was probably interrupted.
611 def __init__(self, downloaded, expected):
612 self.downloaded = downloaded
613 self.expected = expected
615 class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
616 """Handler for HTTP requests and responses.
618 This class, when installed with an OpenerDirector, automatically adds
619 the standard headers to every HTTP request and handles gzipped and
620 deflated responses from web servers. If compression is to be avoided in
621 a particular request, the original request in the program code only has
622 to include the HTTP header "Youtubedl-No-Compression", which will be
623 removed before making the real request.
625 Part of this code was copied from:
627 http://techknack.net/python-urllib2-handlers/
629 Andrew Rowls, the author of that code, agreed to release it to the
636 return zlib.decompress(data, -zlib.MAX_WBITS)
638 return zlib.decompress(data)
641 def addinfourl_wrapper(stream, headers, url, code):
642 if hasattr(compat_urllib_request.addinfourl, 'getcode'):
643 return compat_urllib_request.addinfourl(stream, headers, url, code)
644 ret = compat_urllib_request.addinfourl(stream, headers, url)
648 def http_request(self, req):
649 for h,v in std_headers.items():
653 if 'Youtubedl-no-compression' in req.headers:
654 if 'Accept-encoding' in req.headers:
655 del req.headers['Accept-encoding']
656 del req.headers['Youtubedl-no-compression']
657 if 'Youtubedl-user-agent' in req.headers:
658 if 'User-agent' in req.headers:
659 del req.headers['User-agent']
660 req.headers['User-agent'] = req.headers['Youtubedl-user-agent']
661 del req.headers['Youtubedl-user-agent']
664 def http_response(self, req, resp):
667 if resp.headers.get('Content-encoding', '') == 'gzip':
668 content = resp.read()
669 gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
671 uncompressed = io.BytesIO(gz.read())
672 except IOError as original_ioerror:
673 # There may be junk add the end of the file
674 # See http://stackoverflow.com/q/4928560/35070 for details
675 for i in range(1, 1024):
677 gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
678 uncompressed = io.BytesIO(gz.read())
683 raise original_ioerror
684 resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
685 resp.msg = old_resp.msg
687 if resp.headers.get('Content-encoding', '') == 'deflate':
688 gz = io.BytesIO(self.deflate(resp.read()))
689 resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
690 resp.msg = old_resp.msg
693 https_request = http_request
694 https_response = http_response
696 def unified_strdate(date_str):
697 """Return a string with the date in the format YYYYMMDD"""
700 date_str = date_str.replace(',',' ')
701 # %z (UTC offset) is only supported in python>=3.2
702 date_str = re.sub(r' (\+|-)[\d]*$', '', date_str)
703 format_expressions = [
711 '%Y-%m-%dT%H:%M:%SZ',
713 for expression in format_expressions:
715 upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
720 def determine_ext(url, default_ext=u'unknown_video'):
721 guess = url.partition(u'?')[0].rpartition(u'.')[2]
722 if re.match(r'^[A-Za-z0-9]+$', guess):
def subtitles_filename(filename, sub_lang, sub_format):
    """Derive the subtitle file name for *filename*: the media extension is
    dropped and '<lang>.<format>' is appended (e.g. video.mp4 -> video.en.srt)."""
    base = filename.rsplit('.', 1)[0]
    return u'.'.join((base, sub_lang, sub_format))
730 def date_from_str(date_str):
732 Return a datetime object from a string in the format YYYYMMDD or
733 (now|today)[+-][0-9](day|week|month|year)(s)?"""
734 today = datetime.date.today()
735 if date_str == 'now'or date_str == 'today':
737 match = re.match('(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
738 if match is not None:
739 sign = match.group('sign')
740 time = int(match.group('time'))
743 unit = match.group('unit')
752 delta = datetime.timedelta(**{unit: time})
754 return datetime.datetime.strptime(date_str, "%Y%m%d").date()
756 class DateRange(object):
757 """Represents a time interval between two dates"""
758 def __init__(self, start=None, end=None):
759 """start and end must be strings in the format accepted by date"""
760 if start is not None:
761 self.start = date_from_str(start)
763 self.start = datetime.datetime.min.date()
765 self.end = date_from_str(end)
767 self.end = datetime.datetime.max.date()
768 if self.start > self.end:
769 raise ValueError('Date range: "%s" , the start date must be before the end date' % self)
772 """Returns a range that only contains the given day"""
def __contains__(self, date):
    """Report whether *date* (a datetime.date, or a string accepted by
    date_from_str) falls within [self.start, self.end]."""
    when = date if isinstance(date, datetime.date) else date_from_str(date)
    return self.start <= when <= self.end
780 return '%s - %s' % ( self.start.isoformat(), self.end.isoformat())
784 """ Returns the platform name as a compat_str """
785 res = platform.platform()
786 if isinstance(res, bytes):
787 res = res.decode(preferredencoding())
789 assert isinstance(res, compat_str)
793 def write_string(s, out=None):
796 assert type(s) == type(u'')
798 if ('b' in getattr(out, 'mode', '') or
799 sys.version_info[0] < 3): # Python 2 lies about mode of sys.stderr
800 s = s.encode(preferredencoding(), 'ignore')
805 def bytes_to_intlist(bs):
808 if isinstance(bs[0], int): # Python 3
811 return [ord(c) for c in bs]
814 def intlist_to_bytes(xs):
817 if isinstance(chr(0), bytes): # Python 2
818 return ''.join([chr(x) for x in xs])