2 # -*- coding: utf-8 -*-
19 import urllib.request as compat_urllib_request
20 except ImportError: # Python 2
21 import urllib2 as compat_urllib_request
24 import urllib.error as compat_urllib_error
25 except ImportError: # Python 2
26 import urllib2 as compat_urllib_error
29 import urllib.parse as compat_urllib_parse
30 except ImportError: # Python 2
31 import urllib as compat_urllib_parse
34 from urllib.parse import urlparse as compat_urllib_parse_urlparse
35 except ImportError: # Python 2
36 from urlparse import urlparse as compat_urllib_parse_urlparse
39 import http.cookiejar as compat_cookiejar
40 except ImportError: # Python 2
41 import cookielib as compat_cookiejar
44 import html.entities as compat_html_entities
45 except ImportError: # Python 2
46 import htmlentitydefs as compat_html_entities
49 import html.parser as compat_html_parser
50 except ImportError: # Python 2
51 import HTMLParser as compat_html_parser
54 import http.client as compat_http_client
55 except ImportError: # Python 2
56 import httplib as compat_http_client
# compat_subprocess_get_DEVNULL: returns a null sink for subprocess output.
# On Python >= 3.3 this is subprocess.DEVNULL; on older versions it falls back
# to opening os.devnull for writing (the except branch below).
# NOTE(review): this excerpt is line-sampled — the leading numbers are the
# original file's line numbers and the try/except scaffolding lines are missing.
59 from subprocess import DEVNULL
60 compat_subprocess_get_DEVNULL = lambda: DEVNULL
62 compat_subprocess_get_DEVNULL = lambda: open(os.path.devnull, 'w')
# On Python 3, urllib.parse.parse_qs is used directly; the Python 2 fallback
# below re-implements cpython 3's parse_qs because Python 2's is broken.
65 from urllib.parse import parse_qs as compat_parse_qs
66 except ImportError: # Python 2
67 # HACK: The following is the correct parse_qs implementation from cpython 3's stdlib.
68 # Python 2's version is apparently totally broken
# _unquote: percent-decode one query-string token. Splits on '%', accumulates
# contiguous %XX byte pairs into pct_sequence so multi-byte (e.g. UTF-8)
# sequences are decoded as a single unit rather than byte-by-byte.
# NOTE(review): excerpt is line-sampled; most interior lines (the loop header,
# early-returns, rest handling) are missing — do not treat as runnable.
69 def _unquote(string, encoding='utf-8', errors='replace'):
72 res = string.split('%')
79 # pct_sequence: contiguous sequence of percent-encoded bytes, decoded
# 'hex' codec decode of the two chars after '%' — Python 2 bytes idiom.
86 pct_sequence += item[:2].decode('hex')
89 # This segment was just a single percent-encoded character.
90 # May be part of a sequence of code units, so delay decoding.
91 # (Stored in pct_sequence).
95 # Encountered non-percent-encoded characters. Flush the current
97 string += pct_sequence.decode(encoding, errors) + rest
100 # Flush the final pct_sequence
101 string += pct_sequence.decode(encoding, errors)
# _parse_qsl: backport of cpython 3's parse_qsl for Python 2. Splits the query
# on '&' and ';', then unquotes each name=value pair. Returns a list of
# (name, value) tuples (the `r` list appended to below; its initialization is
# among the sampled-out lines).
104 def _parse_qsl(qs, keep_blank_values=False, strict_parsing=False,
105 encoding='utf-8', errors='replace'):
# _coerce_result is `unicode` on Python 2 — results are coerced to text.
106 qs, _coerce_result = qs, unicode
107 pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')]
109 for name_value in pairs:
110 if not name_value and not strict_parsing:
112 nv = name_value.split('=', 1)
# strict_parsing path: a field without '=' is an error.
115 raise ValueError("bad query field: %r" % (name_value,))
116 # Handle case of a control-name with no equal sign
117 if keep_blank_values:
121 if len(nv[1]) or keep_blank_values:
# '+' means space in application/x-www-form-urlencoded data.
122 name = nv[0].replace('+', ' ')
123 name = _unquote(name, encoding=encoding, errors=errors)
124 name = _coerce_result(name)
125 value = nv[1].replace('+', ' ')
126 value = _unquote(value, encoding=encoding, errors=errors)
127 value = _coerce_result(value)
128 r.append((name, value))
# compat_parse_qs: Python 2 replacement for urllib.parse.parse_qs built on
# _parse_qsl above. Groups repeated names into lists, mirroring the stdlib
# contract: {name: [value, ...]}.
# NOTE(review): the dict initialization, else-branch and return statement fall
# on sampled-out lines.
131 def compat_parse_qs(qs, keep_blank_values=False, strict_parsing=False,
132 encoding='utf-8', errors='replace'):
134 pairs = _parse_qsl(qs, keep_blank_values, strict_parsing,
135 encoding=encoding, errors=errors)
136 for name, value in pairs:
137 if name in parsed_result:
138 parsed_result[name].append(value)
140 parsed_result[name] = [value]
# Py2/Py3 text-type aliases and the default HTTP headers sent with every
# request. Fragments of compat_ord (line 154) appear between them.
144 compat_str = unicode # Python 2
149 compat_chr = unichr # Python 2
# Fragment of compat_ord: an int is already an ordinal, pass it through.
154 if type(c) is int: return c
157 # This is not clearly defined otherwise
# Used to test "is this a compiled regex?" portably.
158 compiled_regex_type = type(re.compile(''))
# std_headers: browser-like defaults so servers don't reject the client.
161 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0',
162 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
163 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
164 'Accept-Encoding': 'gzip, deflate',
165 'Accept-Language': 'en-us,en;q=0.5',
# preferredencoding / compat_print / write_json_file — console- and
# file-encoding helpers that differ between Python 2 and 3.
168 def preferredencoding():
169 """Get preferred encoding.
171 Returns the best encoding scheme for the system, based on
172 locale.getpreferredencoding() and some further tweaks.
175 pref = locale.getpreferredencoding()
# compat_print: on Py2, encode explicitly before printing (line 184);
# the assert on line 187 belongs to the Py3 branch — TODO confirm, the
# branch structure falls on sampled-out lines.
182 if sys.version_info < (3,0):
184 print(s.encode(preferredencoding(), 'xmlcharrefreplace'))
187 assert type(s) == type(u'')
190 # In Python 2.x, json.dump expects a bytestream.
191 # In Python 3.x, it writes to a character stream
192 if sys.version_info < (3,0):
193 def write_json_file(obj, fn):
194 with open(fn, 'wb') as f:
197 def write_json_file(obj, fn):
198 with open(fn, 'w', encoding='utf-8') as f:
# find_xpath_attr: locate a child node matching xpath[@key=val].
# Python >= 2.7 can use ElementTree's attribute-predicate syntax directly;
# older versions (line 209 onward) scan findall() results manually.
201 if sys.version_info >= (2,7):
202 def find_xpath_attr(node, xpath, key, val):
203 """ Find the xpath xpath[@key=val] """
# Restrict to lowercase ASCII so the interpolated expression stays well-formed.
204 assert re.match(r'^[a-z]+$', key)
205 assert re.match(r'^[a-z]*$', val)
206 expr = xpath + u"[@%s='%s']" % (key, val)
207 return node.find(expr)
# Pre-2.7 fallback: linear scan over matching elements.
209 def find_xpath_attr(node, xpath, key, val):
210 for f in node.findall(xpath):
211 if f.attrib.get(key) == val:
# htmlentity_transform: re.sub callback mapping one HTML entity (named or
# numeric, decimal or hex) to its character; unknown entities are returned
# literally as "&name;".
215 def htmlentity_transform(matchobj):
216 """Transforms an HTML entity to a character.
218 This function receives a match object and is intended to be used with
219 the re.sub() function.
221 entity = matchobj.group(1)
223 # Known non-numeric HTML entity
224 if entity in compat_html_entities.name2codepoint:
225 return compat_chr(compat_html_entities.name2codepoint[entity])
# Numeric entity: "#123" or "#x1F" — note \\d also matches the hex digits'
# decimal subset; base selection happens on sampled-out lines.
227 mobj = re.match(u'(?u)#(x?\\d+)', entity)
229 numstr = mobj.group(1)
230 if numstr.startswith(u'x'):
# "x1F" -> "0x1F" so int(numstr, 16) parses it.
232 numstr = u'0%s' % numstr
235 return compat_chr(int(numstr, base))
237 # Unknown entity in name, return its literal representation
238 return (u'&%s;' % entity)
# AttrParser: an HTMLParser subclass that extracts the full text of the first
# tag whose `attribute` equals `value`, tracking nesting depth so the matching
# close tag is found. locatestarttagend is monkeypatched first to backport an
# upstream regex bugfix for attribute parsing.
# NOTE(review): heavily line-sampled — loads(), depth bookkeeping and several
# result-assembly lines are missing; comments below are best-effort reading.
240 compat_html_parser.locatestarttagend = re.compile(r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re.VERBOSE) # backport bugfix
241 class AttrParser(compat_html_parser.HTMLParser):
242 """Modified HTMLParser that isolates a tag with the specified attribute"""
243 def __init__(self, attribute, value):
244 self.attribute = attribute
250 self.watch_startpos = False
252 compat_html_parser.HTMLParser.__init__(self)
# error(): tolerate up to 10 parse errors by skipping the offending line,
# then re-raise; once the target tag has started, errors are fatal.
254 def error(self, message):
255 if self.error_count > 10 or self.started:
256 raise compat_html_parser.HTMLParseError(message, self.getpos())
257 self.rawdata = '\n'.join(self.html.split('\n')[self.getpos()[0]:]) # skip one line
258 self.error_count += 1
261 def loads(self, html):
266 def handle_starttag(self, tag, attrs):
269 self.find_startpos(None)
# Found the tag we are looking for — start watching for its position.
270 if self.attribute in attrs and attrs[self.attribute] == self.value:
273 self.watch_startpos = True
275 if not tag in self.depth: self.depth[tag] = 0
278 def handle_endtag(self, tag):
280 if tag in self.depth: self.depth[tag] -= 1
# Depth back to zero => the matched element just closed; record end pos.
281 if self.depth[self.result[0]] == 0:
283 self.result.append(self.getpos())
285 def find_startpos(self, x):
286 """Needed to put the start position of the result (self.result[1])
287 after the opening tag with the requested id"""
288 if self.watch_startpos:
289 self.watch_startpos = False
290 self.result.append(self.getpos())
# Any event right after the opening tag fixes the content start position.
291 handle_entityref = handle_charref = handle_data = handle_comment = \
292 handle_decl = handle_pi = unknown_decl = find_startpos
# get_result(): slice self.html between recorded start/end positions.
294 def get_result(self):
295 if self.result is None:
297 if len(self.result) != 3:
299 lines = self.html.split('\n')
300 lines = lines[self.result[1][0]-1:self.result[2][0]]
301 lines[0] = lines[0][self.result[1][1]:]
# Single-line result trims by both offsets; multi-line only by the end offset
# (branch structure falls on sampled-out lines — TODO confirm).
303 lines[-1] = lines[-1][:self.result[2][1]-self.result[1][1]]
304 lines[-1] = lines[-1][:self.result[2][1]]
305 return '\n'.join(lines).strip()
# Hack for https://github.com/rg3/youtube-dl/issues/662
if sys.version_info < (2, 7, 3):
    # Pre-2.7.3 HTMLParser trips over the literal "</scr'+'ipt>" token some
    # pages embed inside scripts; treat it as opaque text and step over it.
    def _parse_endtag_backport(self, i):
        """Replacement for HTMLParser.parse_endtag on old interpreters."""
        token = "</scr'+'ipt>"
        if self.rawdata[i:].startswith(token):
            return i + len(token)
        return compat_html_parser.HTMLParser.parse_endtag(self, i)
    AttrParser.parse_endtag = _parse_endtag_backport
def get_element_by_id(id, html):
    """Return the content of the tag carrying the given ID in *html*.

    Thin convenience wrapper around get_element_by_attribute() with the
    attribute name fixed to "id".
    """
    attribute_name = "id"
    return get_element_by_attribute(attribute_name, id, html)
# get_element_by_attribute: drive an AttrParser over the document and return
# the isolated tag content. The try/loads() call sits on sampled-out lines;
# parse errors are apparently tolerated (the except body is missing —
# presumably best-effort `pass`; TODO confirm).
317 def get_element_by_attribute(attribute, value, html):
318 """Return the content of the tag with the specified attribute in the passed HTML document"""
319 parser = AttrParser(attribute, value)
322 except compat_html_parser.HTMLParseError:
324 return parser.get_result()
# clean_html: flatten an HTML snippet to readable plain text — newlines
# collapsed, <br> and </p><p> turned back into newlines, remaining tags
# stripped, entities unescaped. The final strip/return falls on sampled-out
# lines.
327 def clean_html(html):
328 """Clean an HTML snippet into a readable string"""
330 html = html.replace('\n', ' ')
331 html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
332 html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
# Non-greedy match strips tags but would break on '>' inside attributes.
334 html = re.sub('<.*?>', '', html)
335 # Replace html entities
336 html = unescapeHTML(html)
# sanitize_open: open a file, retrying with a windows-safe name on failure.
340 def sanitize_open(filename, open_mode):
341 """Try to open the given filename, and slightly tweak it if this fails.
343 Attempts to open the given filename. If this fails, it tries to change
344 the filename slightly, step by step, until it's either able to open it
345 or it fails and raises a final exception, like the standard open()
348 It returns the tuple (stream, definitive_file_name).
# '-' means stdout; on win32, switch stdout to binary mode first.
352 if sys.platform == 'win32':
354 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
355 return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
356 stream = open(encodeFilename(filename), open_mode)
357 return (stream, filename)
358 except (IOError, OSError) as err:
# EACCES is not a name problem — re-raised (on a sampled-out line, presumably).
359 if err.errno in (errno.EACCES,):
362 # In case of error, try to remove win32 forbidden chars
363 alt_filename = os.path.join(
364 re.sub(u'[/<>:"\\|\\\\?\\*]', u'#', path_part)
365 for path_part in os.path.split(filename)
367 if alt_filename == filename:
370 # An exception here should be caught in the caller
# NOTE(review): this opens `filename` although `alt_filename` is returned on
# the next line — looks like it should open alt_filename; confirm upstream.
371 stream = open(encodeFilename(filename), open_mode)
372 return (stream, alt_filename)
# timeconvert: RFC 2822 date string -> unix timestamp via email.utils.
# The `timestamp = None` default and the return fall on sampled-out lines.
375 def timeconvert(timestr):
376 """Convert RFC 2822 defined time string into system timestamp"""
378 timetuple = email.utils.parsedate_tz(timestr)
379 if timetuple is not None:
380 timestamp = email.utils.mktime_tz(timetuple)
# sanitize_filename: map each character through replace_insane(), then tidy
# up runs of underscores and leading "-_" artifacts.
383 def sanitize_filename(s, restricted=False, is_id=False):
384 """Sanitizes a string so it could be used as part of a filename.
385 If restricted is set, use a stricter subset of allowed characters.
386 Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
388 def replace_insane(char):
# Control chars, DEL and '?' are never allowed in a filename.
389 if char == '?' or ord(char) < 32 or ord(char) == 127:
392 return '' if restricted else '\''
394 return '_-' if restricted else ' -'
395 elif char in '\\/|*<>':
397 if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
# restricted mode also rejects all non-ASCII.
399 if restricted and ord(char) > 127:
403 result = u''.join(map(replace_insane, s))
405 while '__' in result:
406 result = result.replace('__', '_')
407 result = result.strip('_')
408 # Common case of "Foreign band name - English song title"
409 if restricted and result.startswith('-_'):
# orderedSet: de-duplicate while preserving order (body sampled out).
415 def orderedSet(iterable):
416 """ Remove all duplicates from the input iterable """
# Fragment of unescapeHTML: requires text input, then expands entities via
# htmlentity_transform as a re.sub callback.
427 assert type(s) == type(u'')
429 result = re.sub(u'(?u)&(.+?);', htmlentity_transform, s)
# encodeFilename: text filename -> what open()/OS APIs want on this platform.
# Py3 and NT Unicode APIs take str unchanged; elsewhere encode to the
# filesystem encoding.
432 def encodeFilename(s):
434 @param s The name of the file
437 assert type(s) == type(u'')
439 # Python 3 has a Unicode API
440 if sys.version_info >= (3, 0):
443 if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
444 # Pass u'' directly to use Unicode APIs on Windows 2000 and up
445 # (Detecting Windows NT 4 is tricky because 'major >= 4' would
446 # match Windows 9x series as well. Besides, NT 4 is obsolete.)
449 encoding = sys.getfilesystemencoding()
# 'ignore' silently drops unencodable characters rather than raising.
452 return s.encode(encoding, 'ignore')
# decodeOption: normalize a CLI option value to text using the locale's
# preferred encoding.
454 def decodeOption(optval):
457 if isinstance(optval, bytes):
458 optval = optval.decode(preferredencoding())
460 assert isinstance(optval, compat_str)
# formatSeconds: H:MM:SS above one hour, M:SS below (branch conditions fall
# on sampled-out lines).
463 def formatSeconds(secs):
465 return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
467 return '%d:%02d' % (secs // 60, secs % 60)
# make_HTTPS_handler: build an HTTPSHandler honoring --no-check-certificate.
# Only Python >= 3.2 supports passing an SSLContext to HTTPSHandler.
471 def make_HTTPS_handler(opts):
472 if sys.version_info < (3,2):
473 # Python's 2.x handler is very simplistic
474 return compat_urllib_request.HTTPSHandler()
# PROTOCOL_SSLv23 = "negotiate best available protocol" (legacy name).
477 context = ssl.SSLContext(ssl.PROTOCOL_SSLv23)
478 context.set_default_verify_paths()
480 context.verify_mode = (ssl.CERT_NONE
481 if opts.no_check_certificate
482 else ssl.CERT_REQUIRED)
483 return compat_urllib_request.HTTPSHandler(context=context)
# ExtractorError: raised by extractors; network-ish causes are treated as
# "expected" and everything else gets a bug-report plea appended to the
# message.
485 class ExtractorError(Exception):
486 """Error during info extraction."""
487 def __init__(self, msg, tb=None, expected=False):
488 """ tb, if given, is the original traceback (so that it can be printed out).
489 If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
# A URLError/timeout/unavailable-video in flight implies an expected failure.
492 if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
495 msg = msg + u'; please report this issue on https://yt-dl.org/bug . Be sure to call youtube-dl with the --verbose flag and include its complete output.'
496 super(ExtractorError, self).__init__(msg)
499 self.exc_info = sys.exc_info() # preserve original exception
501 def format_traceback(self):
502 if self.traceback is None:
504 return u''.join(traceback.format_tb(self.traceback))
# DownloadError: fatal download failure surfaced by FileDownloader when not
# configured to continue on errors.
507 class DownloadError(Exception):
508 """Download Error exception.
510 This exception may be thrown by FileDownloader objects if they are not
511 configured to continue on errors. They will contain the appropriate
def __init__(self, msg, exc_info=None):
    """Create the error.

    exc_info, if given, is the original exception that caused the trouble
    (as returned by sys.exc_info()).
    """
    # Keep the original exc_info triple around for later reporting,
    # and let the Exception base class carry the human-readable message.
    self.exc_info = exc_info
    super(DownloadError, self).__init__(msg)
# Small exception taxonomy used by the downloader/postprocessor layers.
520 class SameFileError(Exception):
521 """Same File exception.
523 This exception will be thrown by FileDownloader objects if they detect
524 multiple files would have to be downloaded to the same file on disk.
529 class PostProcessingError(Exception):
530 """Post Processing exception.
532 This exception may be raised by PostProcessor's .run() method to
533 indicate an error in the postprocessing task.
# __init__ body (storing msg) falls on sampled-out lines.
535 def __init__(self, msg):
538 class MaxDownloadsReached(Exception):
539 """ --max-downloads limit has been reached. """
543 class UnavailableVideoError(Exception):
544 """Unavailable Format exception.
546 This exception will be thrown when a video is requested
547 in a format that is not available for that video.
552 class ContentTooShortError(Exception):
553 """Content Too Short exception.
555 This exception may be raised by FileDownloader objects when a file they
556 download is too small for what the server announced first, indicating
557 the connection was probably interrupted.
def __init__(self, downloaded, expected):
    """Record how many bytes actually arrived vs. how many the server
    announced, so callers can report the shortfall."""
    self.downloaded, self.expected = downloaded, expected
# YoutubeDLHandler: urllib HTTP(S) handler adding std_headers on the way out
# and transparently un-gzipping / inflating responses on the way in.
567 class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
568 """Handler for HTTP requests and responses.
570 This class, when installed with an OpenerDirector, automatically adds
571 the standard headers to every HTTP request and handles gzipped and
572 deflated responses from web servers. If compression is to be avoided in
573 a particular request, the original request in the program code only has
574 to include the HTTP header "Youtubedl-No-Compression", which will be
575 removed before making the real request.
577 Part of this code was copied from:
579 http://techknack.net/python-urllib2-handlers/
581 Andrew Rowls, the author of that code, agreed to release it to the
# deflate(): some servers send raw deflate streams (no zlib header); try the
# raw form first, fall back to the wrapped form.
588 return zlib.decompress(data, -zlib.MAX_WBITS)
590 return zlib.decompress(data)
# addinfourl_wrapper: construct an addinfourl that reports a status code even
# on Python versions whose addinfourl lacks getcode (the ret.code assignment
# and return fall on sampled-out lines).
593 def addinfourl_wrapper(stream, headers, url, code):
594 if hasattr(compat_urllib_request.addinfourl, 'getcode'):
595 return compat_urllib_request.addinfourl(stream, headers, url, code)
596 ret = compat_urllib_request.addinfourl(stream, headers, url)
# http_request: inject std_headers, then honor the two internal pseudo-headers
# (note urllib capitalizes them to 'Youtubedl-no-compression' etc.).
600 def http_request(self, req):
601 for h,v in std_headers.items():
605 if 'Youtubedl-no-compression' in req.headers:
606 if 'Accept-encoding' in req.headers:
607 del req.headers['Accept-encoding']
608 del req.headers['Youtubedl-no-compression']
609 if 'Youtubedl-user-agent' in req.headers:
610 if 'User-agent' in req.headers:
611 del req.headers['User-agent']
612 req.headers['User-agent'] = req.headers['Youtubedl-user-agent']
613 del req.headers['Youtubedl-user-agent']
# http_response: re-wrap the body if the server compressed it.
616 def http_response(self, req, resp):
619 if resp.headers.get('Content-encoding', '') == 'gzip':
620 gz = gzip.GzipFile(fileobj=io.BytesIO(resp.read()), mode='r')
621 resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
622 resp.msg = old_resp.msg
624 if resp.headers.get('Content-encoding', '') == 'deflate':
625 gz = io.BytesIO(self.deflate(resp.read()))
626 resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
627 resp.msg = old_resp.msg
# HTTPS requests/responses get identical treatment.
630 https_request = http_request
631 https_response = http_response
# unified_strdate: best-effort normalization of assorted site date formats to
# YYYYMMDD by trying each pattern in turn (the try/except around strptime and
# the return fall on sampled-out lines).
633 def unified_strdate(date_str):
634 """Return a string with the date in the format YYYYMMDD"""
637 date_str = date_str.replace(',',' ')
638 # %z (UTC offset) is only supported in python>=3.2
639 date_str = re.sub(r' (\+|-)[\d]*$', '', date_str)
640 format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y', '%Y-%m-%d', '%d/%m/%Y', '%Y/%m/%d %H:%M:%S', '%d.%m.%Y %H:%M']
641 for expression in format_expressions:
643 upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
# determine_ext: take everything after the last '.' of the URL path (query
# string stripped); only alphanumeric guesses are trusted, anything else
# yields the u'unknown_video' sentinel.
648 def determine_ext(url):
649 guess = url.partition(u'?')[0].rpartition(u'.')[2]
650 if re.match(r'^[A-Za-z0-9]+$', guess):
653 return u'unknown_video'
# date_from_str: parse "now"/"today", relative forms like "now-3weeks", or a
# literal YYYYMMDD into a datetime.date. The month/year-to-days conversion
# falls on sampled-out lines (666-676).
655 def date_from_str(date_str):
657 Return a datetime object from a string in the format YYYYMMDD or
658 (now|today)[+-][0-9](day|week|month|year)(s)?"""
659 today = datetime.date.today()
# NOTE(review): missing space in `'now'or` — valid Python, but a style nit.
660 if date_str == 'now'or date_str == 'today':
662 match = re.match('(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
663 if match is not None:
664 sign = match.group('sign')
665 time = int(match.group('time'))
668 unit = match.group('unit')
# timedelta wants plural keyword args, e.g. days=, weeks= (the unit is
# pluralized on a sampled-out line).
677 delta = datetime.timedelta(**{unit: time})
# Fallback: absolute YYYYMMDD date.
679 return datetime.datetime.strptime(date_str, "%Y%m%d").date()
# DateRange: inclusive [start, end] interval of dates; missing bounds default
# to the representable extremes so a one-sided range still works.
681 class DateRange(object):
682 """Represents a time interval between two dates"""
683 def __init__(self, start=None, end=None):
684 """start and end must be strings in the format accepted by date"""
685 if start is not None:
686 self.start = date_from_str(start)
688 self.start = datetime.datetime.min.date()
690 self.end = date_from_str(end)
692 self.end = datetime.datetime.max.date()
693 if self.start > self.end:
694 raise ValueError('Date range: "%s" , the start date must be before the end date' % self)
# day(): classmethod factory for a single-day range (body sampled out).
697 """Returns a range that only contains the given day"""
def __contains__(self, date):
    """Return True if *date* — a datetime.date, or any string accepted by
    date_from_str — lies inside this range, bounds included."""
    # Coerce string input ("YYYYMMDD", "now-3days", ...) to a date first.
    if isinstance(date, datetime.date):
        candidate = date
    else:
        candidate = date_from_str(date)
    return self.start <= candidate <= self.end
# Fragment of DateRange.__str__: "start - end" in ISO format.
705 return '%s - %s' % ( self.start.isoformat(), self.end.isoformat())