2 # -*- coding: utf-8 -*-
19 import urllib.request as compat_urllib_request
20 except ImportError: # Python 2
21 import urllib2 as compat_urllib_request
24 import urllib.error as compat_urllib_error
25 except ImportError: # Python 2
26 import urllib2 as compat_urllib_error
29 import urllib.parse as compat_urllib_parse
30 except ImportError: # Python 2
31 import urllib as compat_urllib_parse
34 from urllib.parse import urlparse as compat_urllib_parse_urlparse
35 except ImportError: # Python 2
36 from urlparse import urlparse as compat_urllib_parse_urlparse
39 import urllib.parse as compat_urlparse
40 except ImportError: # Python 2
41 import urlparse as compat_urlparse
44 import http.cookiejar as compat_cookiejar
45 except ImportError: # Python 2
46 import cookielib as compat_cookiejar
49 import html.entities as compat_html_entities
50 except ImportError: # Python 2
51 import htmlentitydefs as compat_html_entities
54 import html.parser as compat_html_parser
55 except ImportError: # Python 2
56 import HTMLParser as compat_html_parser
59 import http.client as compat_http_client
60 except ImportError: # Python 2
61 import httplib as compat_http_client
64 from urllib.error import HTTPError as compat_HTTPError
65 except ImportError: # Python 2
66 from urllib2 import HTTPError as compat_HTTPError
69 from subprocess import DEVNULL
70 compat_subprocess_get_DEVNULL = lambda: DEVNULL
72 compat_subprocess_get_DEVNULL = lambda: open(os.path.devnull, 'w')
75 from urllib.parse import parse_qs as compat_parse_qs
76 except ImportError: # Python 2
77 # HACK: The following is the correct parse_qs implementation from cpython 3's stdlib.
78 # Python 2's version is apparently totally broken
79 def _unquote(string, encoding='utf-8', errors='replace'):
82 res = string.split('%')
89 # pct_sequence: contiguous sequence of percent-encoded bytes, decoded
96 pct_sequence += item[:2].decode('hex')
99 # This segment was just a single percent-encoded character.
100 # May be part of a sequence of code units, so delay decoding.
101 # (Stored in pct_sequence).
105 # Encountered non-percent-encoded characters. Flush the current
107 string += pct_sequence.decode(encoding, errors) + rest
110 # Flush the final pct_sequence
111 string += pct_sequence.decode(encoding, errors)
114 def _parse_qsl(qs, keep_blank_values=False, strict_parsing=False,
115 encoding='utf-8', errors='replace'):
116 qs, _coerce_result = qs, unicode
117 pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')]
119 for name_value in pairs:
120 if not name_value and not strict_parsing:
122 nv = name_value.split('=', 1)
125 raise ValueError("bad query field: %r" % (name_value,))
126 # Handle case of a control-name with no equal sign
127 if keep_blank_values:
131 if len(nv[1]) or keep_blank_values:
132 name = nv[0].replace('+', ' ')
133 name = _unquote(name, encoding=encoding, errors=errors)
134 name = _coerce_result(name)
135 value = nv[1].replace('+', ' ')
136 value = _unquote(value, encoding=encoding, errors=errors)
137 value = _coerce_result(value)
138 r.append((name, value))
141 def compat_parse_qs(qs, keep_blank_values=False, strict_parsing=False,
142 encoding='utf-8', errors='replace'):
144 pairs = _parse_qsl(qs, keep_blank_values, strict_parsing,
145 encoding=encoding, errors=errors)
146 for name, value in pairs:
147 if name in parsed_result:
148 parsed_result[name].append(value)
150 parsed_result[name] = [value]
154 compat_str = unicode # Python 2
159 compat_chr = unichr # Python 2
164 if type(c) is int: return c
# Compiled pattern objects have no portable public type name, so derive the
# type from an actual compiled pattern (usable with isinstance checks).
compiled_regex_type = type(re.compile(''))
171 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0',
172 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
173 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
174 'Accept-Encoding': 'gzip, deflate',
175 'Accept-Language': 'en-us,en;q=0.5',
178 def preferredencoding():
179 """Get preferred encoding.
181 Returns the best encoding scheme for the system, based on
182 locale.getpreferredencoding() and some further tweaks.
185 pref = locale.getpreferredencoding()
192 if sys.version_info < (3,0):
194 print(s.encode(preferredencoding(), 'xmlcharrefreplace'))
197 assert type(s) == type(u'')
# In Python 2.x, json.dump expects a bytestream.
# In Python 3.x, it writes to a character stream
if sys.version_info < (3, 0):
    def write_json_file(obj, fn):
        """Serialize obj as JSON into the file named fn (opened in binary mode)."""
        with open(fn, 'wb') as f:
            json.dump(obj, f)
else:
    def write_json_file(obj, fn):
        """Serialize obj as JSON into the file named fn (UTF-8 encoded text)."""
        with open(fn, 'w', encoding='utf-8') as f:
            json.dump(obj, f)
def find_xpath_attr(node, xpath, key, val):
    """Find the first element matching xpath whose attribute key equals val.

    This is the Python-side equivalent of the expression xpath[@key='val'].
    Filtering on the attribute here, rather than formatting key and val into
    an XPath string, means:

    * a single implementation serves every Python version,
    * any attribute value is accepted (digits, quotes, punctuation), and
    * nothing depends on ``assert`` statements, which are removed when
      Python runs with -O.

    node  -- element to search from (anything providing findall())
    xpath -- path expression selecting the candidate elements
    key   -- attribute name to compare
    val   -- required attribute value

    Returns the first matching element, or None if there is none.
    """
    for candidate in node.findall(xpath):
        if candidate.attrib.get(key) == val:
            return candidate
    return None
def htmlentity_transform(matchobj):
    """Transforms an HTML entity to a character.

    This function receives a match object and is intended to be used with
    the re.sub() function. Group 1 of the match must hold the entity body,
    i.e. the text between '&' and ';'.

    Returns the decoded character, or the entity's literal representation
    when it is unknown or does not denote a valid code point.
    """
    entity = matchobj.group(1)

    # Known non-numeric HTML entity
    if entity in compat_html_entities.name2codepoint:
        return compat_chr(compat_html_entities.name2codepoint[entity])

    # Numeric character reference: decimal (#233) or hexadecimal (#xe9).
    # The pattern is anchored at the end so that a reference is only decoded
    # when ALL of it is numeric; a partial match would silently decode a
    # prefix (e.g. '#x4e' as code point 4).
    mobj = re.match(u'(?u)#([xX][0-9a-fA-F]+|\\d+)$', entity)
    if mobj is not None:
        numstr = mobj.group(1)
        if numstr[0] in u'xX':
            codepoint = int(numstr[1:], 16)
        else:
            codepoint = int(numstr, 10)
        try:
            return compat_chr(codepoint)
        except (ValueError, OverflowError):
            # Not a representable code point: fall through to the literal
            pass

    # Unknown entity in name, return its literal representation
    return (u'&%s;' % entity)
250 compat_html_parser.locatestarttagend = re.compile(r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re.VERBOSE) # backport bugfix
251 class AttrParser(compat_html_parser.HTMLParser):
252 """Modified HTMLParser that isolates a tag with the specified attribute"""
253 def __init__(self, attribute, value):
254 self.attribute = attribute
260 self.watch_startpos = False
262 compat_html_parser.HTMLParser.__init__(self)
264 def error(self, message):
265 if self.error_count > 10 or self.started:
266 raise compat_html_parser.HTMLParseError(message, self.getpos())
267 self.rawdata = '\n'.join(self.html.split('\n')[self.getpos()[0]:]) # skip one line
268 self.error_count += 1
271 def loads(self, html):
276 def handle_starttag(self, tag, attrs):
279 self.find_startpos(None)
280 if self.attribute in attrs and attrs[self.attribute] == self.value:
283 self.watch_startpos = True
285 if not tag in self.depth: self.depth[tag] = 0
288 def handle_endtag(self, tag):
290 if tag in self.depth: self.depth[tag] -= 1
291 if self.depth[self.result[0]] == 0:
293 self.result.append(self.getpos())
295 def find_startpos(self, x):
296 """Needed to put the start position of the result (self.result[1])
297 after the opening tag with the requested id"""
298 if self.watch_startpos:
299 self.watch_startpos = False
300 self.result.append(self.getpos())
301 handle_entityref = handle_charref = handle_data = handle_comment = \
302 handle_decl = handle_pi = unknown_decl = find_startpos
304 def get_result(self):
305 if self.result is None:
307 if len(self.result) != 3:
309 lines = self.html.split('\n')
310 lines = lines[self.result[1][0]-1:self.result[2][0]]
311 lines[0] = lines[0][self.result[1][1]:]
313 lines[-1] = lines[-1][:self.result[2][1]-self.result[1][1]]
314 lines[-1] = lines[-1][:self.result[2][1]]
315 return '\n'.join(lines).strip()
# Hack for https://github.com/rg3/youtube-dl/issues/662
if sys.version_info < (2, 7, 3):
    def _attrparser_parse_endtag(self, i):
        """Skip over the literal text </scr'+'ipt> as if it were an end tag.

        When the raw data at offset i starts with that text, return the
        offset just past it; otherwise defer to HTMLParser.parse_endtag.
        """
        marker = "</scr'+'ipt>"
        if self.rawdata[i:].startswith(marker):
            return i + len(marker)
        return compat_html_parser.HTMLParser.parse_endtag(self, i)

    AttrParser.parse_endtag = _attrparser_parse_endtag
def get_element_by_id(id, html):
    """Return the content of the tag with the specified ID in the passed HTML document

    Thin convenience wrapper: delegates to get_element_by_attribute() with
    the attribute name fixed to "id" and returns whatever it returns.
    """
    return get_element_by_attribute("id", id, html)
327 def get_element_by_attribute(attribute, value, html):
328 """Return the content of the tag with the specified attribute in the passed HTML document"""
329 parser = AttrParser(attribute, value)
332 except compat_html_parser.HTMLParseError:
334 return parser.get_result()
337 def clean_html(html):
338 """Clean an HTML snippet into a readable string"""
340 html = html.replace('\n', ' ')
341 html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
342 html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
344 html = re.sub('<.*?>', '', html)
345 # Replace html entities
346 html = unescapeHTML(html)
350 def sanitize_open(filename, open_mode):
351 """Try to open the given filename, and slightly tweak it if this fails.
353 Attempts to open the given filename. If this fails, it tries to change
354 the filename slightly, step by step, until it's either able to open it
355 or it fails and raises a final exception, like the standard open()
358 It returns the tuple (stream, definitive_file_name).
362 if sys.platform == 'win32':
364 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
365 return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
366 stream = open(encodeFilename(filename), open_mode)
367 return (stream, filename)
368 except (IOError, OSError) as err:
369 if err.errno in (errno.EACCES,):
372 # In case of error, try to remove win32 forbidden chars
373 alt_filename = os.path.join(
374 re.sub(u'[/<>:"\\|\\\\?\\*]', u'#', path_part)
375 for path_part in os.path.split(filename)
377 if alt_filename == filename:
380 # An exception here should be caught in the caller
381 stream = open(encodeFilename(filename), open_mode)
382 return (stream, alt_filename)
def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp

    Returns the timestamp computed by email.utils.mktime_tz(), or None when
    email.utils.parsedate_tz() cannot parse timestr.
    """
    timetuple = email.utils.parsedate_tz(timestr)
    if timetuple is None:
        return None
    return email.utils.mktime_tz(timetuple)
393 def sanitize_filename(s, restricted=False, is_id=False):
394 """Sanitizes a string so it could be used as part of a filename.
395 If restricted is set, use a stricter subset of allowed characters.
396 Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
398 def replace_insane(char):
399 if char == '?' or ord(char) < 32 or ord(char) == 127:
402 return '' if restricted else '\''
404 return '_-' if restricted else ' -'
405 elif char in '\\/|*<>':
407 if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
409 if restricted and ord(char) > 127:
413 result = u''.join(map(replace_insane, s))
415 while '__' in result:
416 result = result.replace('__', '_')
417 result = result.strip('_')
418 # Common case of "Foreign band name - English song title"
419 if restricted and result.startswith('-_'):
425 def orderedSet(iterable):
426 """ Remove all duplicates from the input iterable """
437 assert type(s) == type(u'')
439 result = re.sub(u'(?u)&(.+?);', htmlentity_transform, s)
442 def encodeFilename(s):
444 @param s The name of the file
447 assert type(s) == type(u'')
449 # Python 3 has a Unicode API
450 if sys.version_info >= (3, 0):
453 if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
454 # Pass u'' directly to use Unicode APIs on Windows 2000 and up
455 # (Detecting Windows NT 4 is tricky because 'major >= 4' would
456 # match Windows 9x series as well. Besides, NT 4 is obsolete.)
459 encoding = sys.getfilesystemencoding()
462 return s.encode(encoding, 'ignore')
464 def decodeOption(optval):
467 if isinstance(optval, bytes):
468 optval = optval.decode(preferredencoding())
470 assert isinstance(optval, compat_str)
473 def formatSeconds(secs):
475 return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
477 return '%d:%02d' % (secs // 60, secs % 60)
def make_HTTPS_handler(opts):
    """Create the HTTPSHandler used for https:// requests.

    opts -- options object; its no_check_certificate attribute is read (on
            Python >= 3.2 only) to decide whether server certificates are
            verified.

    Returns a compat_urllib_request.HTTPSHandler instance.
    """
    if sys.version_info < (3, 2):
        # Python's 2.x handler is very simplistic
        return compat_urllib_request.HTTPSHandler()

    # Imported lazily: only this branch needs SSLContext
    import ssl
    context = ssl.SSLContext(ssl.PROTOCOL_SSLv23)
    context.set_default_verify_paths()
    if opts.no_check_certificate:
        context.verify_mode = ssl.CERT_NONE
    else:
        context.verify_mode = ssl.CERT_REQUIRED
    return compat_urllib_request.HTTPSHandler(context=context)
495 class ExtractorError(Exception):
496 """Error during info extraction."""
497 def __init__(self, msg, tb=None, expected=False, cause=None):
498 """ tb, if given, is the original traceback (so that it can be printed out).
499 If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
502 if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
505 msg = msg + u'; please report this issue on https://yt-dl.org/bug . Be sure to call youtube-dl with the --verbose flag and include its complete output. Make sure you are using the latest version; type youtube-dl -U to update.'
506 super(ExtractorError, self).__init__(msg)
509 self.exc_info = sys.exc_info() # preserve original exception
512 def format_traceback(self):
513 if self.traceback is None:
515 return u''.join(traceback.format_tb(self.traceback))
class DownloadError(Exception):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """

    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        super(DownloadError, self).__init__(msg)
        # Kept on the instance for callers that inspect the original failure
        self.exc_info = exc_info
531 class SameFileError(Exception):
532 """Same File exception.
534 This exception will be thrown by FileDownloader objects if they detect
535 multiple files would have to be downloaded to the same file on disk.
540 class PostProcessingError(Exception):
541 """Post Processing exception.
543 This exception may be raised by PostProcessor's .run() method to
544 indicate an error in the postprocessing task.
546 def __init__(self, msg):
549 class MaxDownloadsReached(Exception):
550 """ --max-downloads limit has been reached. """
554 class UnavailableVideoError(Exception):
555 """Unavailable Format exception.
557 This exception will be thrown when a video is requested
558 in a format that is not available for that video.
563 class ContentTooShortError(Exception):
564 """Content Too Short exception.
566 This exception may be raised by FileDownloader objects when a file they
567 download is too small for what the server announced first, indicating
568 the connection was probably interrupted.
574 def __init__(self, downloaded, expected):
575 self.downloaded = downloaded
576 self.expected = expected
578 class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
579 """Handler for HTTP requests and responses.
581 This class, when installed with an OpenerDirector, automatically adds
582 the standard headers to every HTTP request and handles gzipped and
583 deflated responses from web servers. If compression is to be avoided in
584 a particular request, the original request in the program code only has
585 to include the HTTP header "Youtubedl-No-Compression", which will be
586 removed before making the real request.
588 Part of this code was copied from:
590 http://techknack.net/python-urllib2-handlers/
592 Andrew Rowls, the author of that code, agreed to release it to the
599 return zlib.decompress(data, -zlib.MAX_WBITS)
601 return zlib.decompress(data)
604 def addinfourl_wrapper(stream, headers, url, code):
605 if hasattr(compat_urllib_request.addinfourl, 'getcode'):
606 return compat_urllib_request.addinfourl(stream, headers, url, code)
607 ret = compat_urllib_request.addinfourl(stream, headers, url)
611 def http_request(self, req):
612 for h,v in std_headers.items():
616 if 'Youtubedl-no-compression' in req.headers:
617 if 'Accept-encoding' in req.headers:
618 del req.headers['Accept-encoding']
619 del req.headers['Youtubedl-no-compression']
620 if 'Youtubedl-user-agent' in req.headers:
621 if 'User-agent' in req.headers:
622 del req.headers['User-agent']
623 req.headers['User-agent'] = req.headers['Youtubedl-user-agent']
624 del req.headers['Youtubedl-user-agent']
627 def http_response(self, req, resp):
630 if resp.headers.get('Content-encoding', '') == 'gzip':
631 gz = gzip.GzipFile(fileobj=io.BytesIO(resp.read()), mode='r')
632 resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
633 resp.msg = old_resp.msg
635 if resp.headers.get('Content-encoding', '') == 'deflate':
636 gz = io.BytesIO(self.deflate(resp.read()))
637 resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
638 resp.msg = old_resp.msg
641 https_request = http_request
642 https_response = http_response
def unified_strdate(date_str):
    """Return a string with the date in the format YYYYMMDD

    date_str is tried against a fixed list of known layouts; the result is
    None when none of them matches.
    """
    # Commas only separate fields; turn them into whitespace
    date_str = date_str.replace(',', ' ')
    # %z (UTC offset) is only supported in python>=3.2
    date_str = re.sub(r' (\+|-)[\d]*$', '', date_str)
    format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y', '%Y-%m-%d', '%d/%m/%Y', '%Y/%m/%d %H:%M:%S', '%d.%m.%Y %H:%M']
    for expression in format_expressions:
        try:
            parsed = datetime.datetime.strptime(date_str, expression)
        except ValueError:
            # Layout does not fit this string; try the next one
            continue
        return parsed.strftime('%Y%m%d')
    return None
def determine_ext(url, default_ext=u'unknown_video'):
    """Guess a file extension from url.

    The guess is the text after the last '.' in the part of url before the
    first '?'. It is returned only if it is purely alphanumeric; otherwise
    default_ext is returned.
    """
    guess = url.partition(u'?')[0].rpartition(u'.')[2]
    if re.match(r'^[A-Za-z0-9]+$', guess):
        return guess
    return default_ext
def subtitles_filename(filename, sub_lang, sub_format):
    """Build the subtitles file name that belongs to filename.

    The extension of filename (if any) is replaced by
    '.<sub_lang>.<sub_format>'.

    os.path.splitext is used instead of splitting on the last '.', so that a
    dot inside a directory component (e.g. 'dir.v2/video') is not mistaken
    for the start of the extension.
    """
    stem = os.path.splitext(filename)[0]
    return stem + u'.' + sub_lang + u'.' + sub_format
669 def date_from_str(date_str):
671 Return a datetime object from a string in the format YYYYMMDD or
672 (now|today)[+-][0-9](day|week|month|year)(s)?"""
673 today = datetime.date.today()
674 if date_str == 'now'or date_str == 'today':
676 match = re.match('(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
677 if match is not None:
678 sign = match.group('sign')
679 time = int(match.group('time'))
682 unit = match.group('unit')
691 delta = datetime.timedelta(**{unit: time})
693 return datetime.datetime.strptime(date_str, "%Y%m%d").date()
695 class DateRange(object):
696 """Represents a time interval between two dates"""
697 def __init__(self, start=None, end=None):
698 """start and end must be strings in the format accepted by date"""
699 if start is not None:
700 self.start = date_from_str(start)
702 self.start = datetime.datetime.min.date()
704 self.end = date_from_str(end)
706 self.end = datetime.datetime.max.date()
707 if self.start > self.end:
708 raise ValueError('Date range: "%s" , the start date must be before the end date' % self)
711 """Returns a range that only contains the given day"""
713 def __contains__(self, date):
714 """Check if the date is in the range"""
715 if not isinstance(date, datetime.date):
716 date = date_from_str(date)
717 return self.start <= date <= self.end
719 return '%s - %s' % ( self.start.isoformat(), self.end.isoformat())