2 # -*- coding: utf-8 -*-
29 import xml.etree.ElementTree
33 import urllib.request as compat_urllib_request
34 except ImportError: # Python 2
35 import urllib2 as compat_urllib_request
38 import urllib.error as compat_urllib_error
39 except ImportError: # Python 2
40 import urllib2 as compat_urllib_error
43 import urllib.parse as compat_urllib_parse
44 except ImportError: # Python 2
45 import urllib as compat_urllib_parse
48 from urllib.parse import urlparse as compat_urllib_parse_urlparse
49 except ImportError: # Python 2
50 from urlparse import urlparse as compat_urllib_parse_urlparse
53 import urllib.parse as compat_urlparse
54 except ImportError: # Python 2
55 import urlparse as compat_urlparse
58 import http.cookiejar as compat_cookiejar
59 except ImportError: # Python 2
60 import cookielib as compat_cookiejar
63 import html.entities as compat_html_entities
64 except ImportError: # Python 2
65 import htmlentitydefs as compat_html_entities
68 import html.parser as compat_html_parser
69 except ImportError: # Python 2
70 import HTMLParser as compat_html_parser
73 import http.client as compat_http_client
74 except ImportError: # Python 2
75 import httplib as compat_http_client
78 from urllib.error import HTTPError as compat_HTTPError
79 except ImportError: # Python 2
80 from urllib2 import HTTPError as compat_HTTPError
83 from urllib.request import urlretrieve as compat_urlretrieve
84 except ImportError: # Python 2
85 from urllib import urlretrieve as compat_urlretrieve
89 from subprocess import DEVNULL
90 compat_subprocess_get_DEVNULL = lambda: DEVNULL
92 compat_subprocess_get_DEVNULL = lambda: open(os.path.devnull, 'w')
95 from urllib.parse import unquote as compat_urllib_parse_unquote
97 def compat_urllib_parse_unquote(string, encoding='utf-8', errors='replace'):
100 res = string.split('%')
107 # pct_sequence: contiguous sequence of percent-encoded bytes, decoded
114 pct_sequence += item[:2].decode('hex')
117 # This segment was just a single percent-encoded character.
118 # May be part of a sequence of code units, so delay decoding.
119 # (Stored in pct_sequence).
123 # Encountered non-percent-encoded characters. Flush the current
125 string += pct_sequence.decode(encoding, errors) + rest
128 # Flush the final pct_sequence
129 string += pct_sequence.decode(encoding, errors)
134 from urllib.parse import parse_qs as compat_parse_qs
135 except ImportError: # Python 2
136 # HACK: The following is the correct parse_qs implementation from cpython 3's stdlib.
137 # Python 2's version is apparently totally broken
139 def _parse_qsl(qs, keep_blank_values=False, strict_parsing=False,
140 encoding='utf-8', errors='replace'):
141 qs, _coerce_result = qs, unicode
142 pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')]
144 for name_value in pairs:
145 if not name_value and not strict_parsing:
147 nv = name_value.split('=', 1)
150 raise ValueError("bad query field: %r" % (name_value,))
151 # Handle case of a control-name with no equal sign
152 if keep_blank_values:
156 if len(nv[1]) or keep_blank_values:
157 name = nv[0].replace('+', ' ')
158 name = compat_urllib_parse_unquote(
159 name, encoding=encoding, errors=errors)
160 name = _coerce_result(name)
161 value = nv[1].replace('+', ' ')
162 value = compat_urllib_parse_unquote(
163 value, encoding=encoding, errors=errors)
164 value = _coerce_result(value)
165 r.append((name, value))
168 def compat_parse_qs(qs, keep_blank_values=False, strict_parsing=False,
169 encoding='utf-8', errors='replace'):
171 pairs = _parse_qsl(qs, keep_blank_values, strict_parsing,
172 encoding=encoding, errors=errors)
173 for name, value in pairs:
174 if name in parsed_result:
175 parsed_result[name].append(value)
177 parsed_result[name] = [value]
181 compat_str = unicode # Python 2
186 compat_chr = unichr # Python 2
191 from xml.etree.ElementTree import ParseError as compat_xml_parse_error
192 except ImportError: # Python 2.6
193 from xml.parsers.expat import ExpatError as compat_xml_parse_error
196 from shlex import quote as shlex_quote
197 except ImportError: # Python < 3.3
199 return "'" + s.replace("'", "'\"'\"'") + "'"
203 if type(c) is int: return c
207 if sys.version_info >= (3, 0):
208 compat_getenv = os.getenv
209 compat_expanduser = os.path.expanduser
211 # Environment variables should be decoded with filesystem encoding.
212 # Otherwise it will fail if any non-ASCII characters present (see #3854 #3217 #2918)
214 def compat_getenv(key, default=None):
215 env = os.getenv(key, default)
217 env = env.decode(get_filesystem_encoding())
220 # HACK: The default implementations of os.path.expanduser from cpython do not decode
221 # environment variables with filesystem encoding. We will work around this by
222 # providing adjusted implementations.
223 # The following are os.path.expanduser implementations from cpython 2.7.8 stdlib
224 # for different platforms with correct environment variables decoding.
226 if os.name == 'posix':
227 def compat_expanduser(path):
228 """Expand ~ and ~user constructions. If user or $HOME is unknown,
230 if not path.startswith('~'):
232 i = path.find('/', 1)
236 if 'HOME' not in os.environ:
238 userhome = pwd.getpwuid(os.getuid()).pw_dir
240 userhome = compat_getenv('HOME')
244 pwent = pwd.getpwnam(path[1:i])
247 userhome = pwent.pw_dir
248 userhome = userhome.rstrip('/')
249 return (userhome + path[i:]) or '/'
250 elif os.name == 'nt' or os.name == 'ce':
251 def compat_expanduser(path):
252 """Expand ~ and ~user constructs.
254 If user or $HOME is unknown, do nothing."""
258 while i < n and path[i] not in '/\\':
261 if 'HOME' in os.environ:
262 userhome = compat_getenv('HOME')
263 elif 'USERPROFILE' in os.environ:
264 userhome = compat_getenv('USERPROFILE')
265 elif not 'HOMEPATH' in os.environ:
269 drive = compat_getenv('HOMEDRIVE')
272 userhome = os.path.join(drive, compat_getenv('HOMEPATH'))
275 userhome = os.path.join(os.path.dirname(userhome), path[1:i])
277 return userhome + path[i:]
279 compat_expanduser = os.path.expanduser
282 # This is not clearly defined otherwise
283 compiled_regex_type = type(re.compile(''))
286 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0 (Chrome)',
287 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
288 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
289 'Accept-Encoding': 'gzip, deflate',
290 'Accept-Language': 'en-us,en;q=0.5',
293 def preferredencoding():
294 """Get preferred encoding.
296 Returns the best encoding scheme for the system, based on
297 locale.getpreferredencoding() and some further tweaks.
300 pref = locale.getpreferredencoding()
307 if sys.version_info < (3,0):
309 print(s.encode(preferredencoding(), 'xmlcharrefreplace'))
312 assert type(s) == type(u'')
316 def write_json_file(obj, fn):
317 """ Encode obj as JSON and write it to fn, atomically """
321 'prefix': os.path.basename(fn) + '.',
322 'dir': os.path.dirname(fn),
326 # In Python 2.x, json.dump expects a bytestream.
327 # In Python 3.x, it writes to a character stream
328 if sys.version_info < (3, 0):
336 tf = tempfile.NamedTemporaryFile(**args)
341 os.rename(tf.name, fn)
350 if sys.version_info >= (2, 7):
def find_xpath_attr(node, xpath, key, val):
    """Find the first element matching xpath[@key=val]."""
    # Sanity-check both pieces so the hand-assembled XPath predicate
    # cannot be broken by unexpected characters.
    assert re.match(r'^[a-zA-Z-]+$', key)
    assert re.match(r'^[a-zA-Z0-9@\s:._-]*$', val)
    predicate = u"[@%s='%s']" % (key, val)
    return node.find(xpath + predicate)
358 def find_xpath_attr(node, xpath, key, val):
359 # Here comes the crazy part: In 2.6, if the xpath is a unicode,
360 # .//node does not match if a node is a direct child of . !
361 if isinstance(xpath, unicode):
362 xpath = xpath.encode('ascii')
364 for f in node.findall(xpath):
365 if f.attrib.get(key) == val:
369 # On python2.6 the xml.etree.ElementTree.Element methods don't support
370 # the namespace parameter
371 def xpath_with_ns(path, ns_map):
372 components = [c.split(':') for c in path.split('/')]
376 replaced.append(c[0])
379 replaced.append('{%s}%s' % (ns_map[ns], tag))
380 return '/'.join(replaced)
383 def xpath_text(node, xpath, name=None, fatal=False):
384 if sys.version_info < (2, 7): # Crazy 2.6
385 xpath = xpath.encode('ascii')
390 name = xpath if name is None else name
391 raise ExtractorError('Could not find XML element %s' % name)
397 compat_html_parser.locatestarttagend = re.compile(r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re.VERBOSE) # backport bugfix
398 class BaseHTMLParser(compat_html_parser.HTMLParser):
400 compat_html_parser.HTMLParser.__init__(self)
403 def loads(self, html):
408 class AttrParser(BaseHTMLParser):
409 """Modified HTMLParser that isolates a tag with the specified attribute"""
410 def __init__(self, attribute, value):
411 self.attribute = attribute
416 self.watch_startpos = False
418 BaseHTMLParser.__init__(self)
420 def error(self, message):
421 if self.error_count > 10 or self.started:
422 raise compat_html_parser.HTMLParseError(message, self.getpos())
423 self.rawdata = '\n'.join(self.html.split('\n')[self.getpos()[0]:]) # skip one line
424 self.error_count += 1
427 def handle_starttag(self, tag, attrs):
430 self.find_startpos(None)
431 if self.attribute in attrs and attrs[self.attribute] == self.value:
434 self.watch_startpos = True
436 if not tag in self.depth: self.depth[tag] = 0
439 def handle_endtag(self, tag):
441 if tag in self.depth: self.depth[tag] -= 1
442 if self.depth[self.result[0]] == 0:
444 self.result.append(self.getpos())
def find_startpos(self, x):
    """Record the position just past the opening tag of the match.

    Needed so that self.result[1] ends up pointing after the tag with
    the requested attribute; all the parser callbacks below funnel
    into this method.
    """
    if not self.watch_startpos:
        return
    self.watch_startpos = False
    self.result.append(self.getpos())
handle_entityref = handle_charref = handle_data = handle_comment = \
    handle_decl = handle_pi = unknown_decl = find_startpos
455 def get_result(self):
456 if self.result is None:
458 if len(self.result) != 3:
460 lines = self.html.split('\n')
461 lines = lines[self.result[1][0]-1:self.result[2][0]]
462 lines[0] = lines[0][self.result[1][1]:]
464 lines[-1] = lines[-1][:self.result[2][1]-self.result[1][1]]
465 lines[-1] = lines[-1][:self.result[2][1]]
466 return '\n'.join(lines).strip()
467 # Hack for https://github.com/rg3/youtube-dl/issues/662
468 if sys.version_info < (2, 7, 3):
469 AttrParser.parse_endtag = (lambda self, i:
470 i + len("</scr'+'ipt>")
471 if self.rawdata[i:].startswith("</scr'+'ipt>")
472 else compat_html_parser.HTMLParser.parse_endtag(self, i))
def get_element_by_id(id, html):
    """Return the content of the tag with the specified ID in the passed HTML document."""
    # An ID lookup is simply an attribute lookup on "id".
    return get_element_by_attribute("id", id, html)
478 def get_element_by_attribute(attribute, value, html):
479 """Return the content of the tag with the specified attribute in the passed HTML document"""
480 parser = AttrParser(attribute, value)
483 except compat_html_parser.HTMLParseError:
485 return parser.get_result()
487 class MetaParser(BaseHTMLParser):
489 Modified HTMLParser that isolates a meta tag with the specified name
492 def __init__(self, name):
493 BaseHTMLParser.__init__(self)
498 def handle_starttag(self, tag, attrs):
502 if attrs.get('name') == self.name:
503 self.result = attrs.get('content')
505 def get_result(self):
508 def get_meta_content(name, html):
510 Return the content attribute from the meta tag with the given name attribute.
512 parser = MetaParser(name)
515 except compat_html_parser.HTMLParseError:
517 return parser.get_result()
520 def clean_html(html):
521 """Clean an HTML snippet into a readable string"""
523 html = html.replace('\n', ' ')
524 html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
525 html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
527 html = re.sub('<.*?>', '', html)
528 # Replace html entities
529 html = unescapeHTML(html)
533 def sanitize_open(filename, open_mode):
534 """Try to open the given filename, and slightly tweak it if this fails.
536 Attempts to open the given filename. If this fails, it tries to change
537 the filename slightly, step by step, until it's either able to open it
538 or it fails and raises a final exception, like the standard open()
541 It returns the tuple (stream, definitive_file_name).
545 if sys.platform == 'win32':
547 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
548 return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
549 stream = open(encodeFilename(filename), open_mode)
550 return (stream, filename)
551 except (IOError, OSError) as err:
552 if err.errno in (errno.EACCES,):
555 # In case of error, try to remove win32 forbidden chars
556 alt_filename = os.path.join(
557 re.sub(u'[/<>:"\\|\\\\?\\*]', u'#', path_part)
558 for path_part in os.path.split(filename)
560 if alt_filename == filename:
563 # An exception here should be caught in the caller
564 stream = open(encodeFilename(filename), open_mode)
565 return (stream, alt_filename)
568 def timeconvert(timestr):
569 """Convert RFC 2822 defined time string into system timestamp"""
571 timetuple = email.utils.parsedate_tz(timestr)
572 if timetuple is not None:
573 timestamp = email.utils.mktime_tz(timetuple)
576 def sanitize_filename(s, restricted=False, is_id=False):
577 """Sanitizes a string so it could be used as part of a filename.
578 If restricted is set, use a stricter subset of allowed characters.
579 Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
581 def replace_insane(char):
582 if char == '?' or ord(char) < 32 or ord(char) == 127:
585 return '' if restricted else '\''
587 return '_-' if restricted else ' -'
588 elif char in '\\/|*<>':
590 if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
592 if restricted and ord(char) > 127:
596 result = u''.join(map(replace_insane, s))
598 while '__' in result:
599 result = result.replace('__', '_')
600 result = result.strip('_')
601 # Common case of "Foreign band name - English song title"
602 if restricted and result.startswith('-_'):
608 def orderedSet(iterable):
609 """ Remove all duplicates from the input iterable """
617 def _htmlentity_transform(entity):
618 """Transforms an HTML entity to a character."""
619 # Known non-numeric HTML entity
620 if entity in compat_html_entities.name2codepoint:
621 return compat_chr(compat_html_entities.name2codepoint[entity])
623 mobj = re.match(r'#(x?[0-9]+)', entity)
625 numstr = mobj.group(1)
626 if numstr.startswith(u'x'):
628 numstr = u'0%s' % numstr
631 return compat_chr(int(numstr, base))
633 # Unknown entity in name, return its literal representation
634 return (u'&%s;' % entity)
640 assert type(s) == compat_str
643 r'&([^;]+);', lambda m: _htmlentity_transform(m.group(1)), s)
646 def encodeFilename(s, for_subprocess=False):
648 @param s The name of the file
651 assert type(s) == compat_str
653 # Python 3 has a Unicode API
654 if sys.version_info >= (3, 0):
657 if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
658 # Pass u'' directly to use Unicode APIs on Windows 2000 and up
659 # (Detecting Windows NT 4 is tricky because 'major >= 4' would
660 # match Windows 9x series as well. Besides, NT 4 is obsolete.)
661 if not for_subprocess:
664 # For subprocess calls, encode with locale encoding
665 # Refer to http://stackoverflow.com/a/9951851/35070
666 encoding = preferredencoding()
668 encoding = sys.getfilesystemencoding()
671 return s.encode(encoding, 'ignore')
674 def encodeArgument(s):
675 if not isinstance(s, compat_str):
676 # Legacy code that uses byte strings
677 # Uncomment the following line after fixing all post processors
678 #assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
679 s = s.decode('ascii')
680 return encodeFilename(s, True)
683 def decodeOption(optval):
686 if isinstance(optval, bytes):
687 optval = optval.decode(preferredencoding())
689 assert isinstance(optval, compat_str)
692 def formatSeconds(secs):
694 return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
696 return '%d:%02d' % (secs // 60, secs % 60)
701 def make_HTTPS_handler(opts_no_check_certificate, **kwargs):
702 if sys.version_info < (3, 2):
705 class HTTPSConnectionV3(httplib.HTTPSConnection):
706 def __init__(self, *args, **kwargs):
707 httplib.HTTPSConnection.__init__(self, *args, **kwargs)
710 sock = socket.create_connection((self.host, self.port), self.timeout)
711 if getattr(self, '_tunnel_host', False):
715 self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_TLSv1)
717 self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_SSLv23)
719 class HTTPSHandlerV3(compat_urllib_request.HTTPSHandler):
720 def https_open(self, req):
721 return self.do_open(HTTPSConnectionV3, req)
722 return HTTPSHandlerV3(**kwargs)
723 elif hasattr(ssl, 'create_default_context'): # Python >= 3.4
724 context = ssl.create_default_context(ssl.Purpose.CLIENT_AUTH)
725 context.options &= ~ssl.OP_NO_SSLv3 # Allow older, not-as-secure SSLv3
726 if opts_no_check_certificate:
727 context.verify_mode = ssl.CERT_NONE
728 return compat_urllib_request.HTTPSHandler(context=context, **kwargs)
730 context = ssl.SSLContext(ssl.PROTOCOL_SSLv23)
731 context.verify_mode = (ssl.CERT_NONE
732 if opts_no_check_certificate
733 else ssl.CERT_REQUIRED)
734 context.set_default_verify_paths()
736 context.load_default_certs()
737 except AttributeError:
739 return compat_urllib_request.HTTPSHandler(context=context, **kwargs)
741 class ExtractorError(Exception):
742 """Error during info extraction."""
743 def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
744 """ tb, if given, is the original traceback (so that it can be printed out).
745 If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
748 if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
750 if video_id is not None:
751 msg = video_id + ': ' + msg
753 msg = msg + u'; please report this issue on https://yt-dl.org/bug . Be sure to call youtube-dl with the --verbose flag and include its complete output. Make sure you are using the latest version; type youtube-dl -U to update.'
754 super(ExtractorError, self).__init__(msg)
757 self.exc_info = sys.exc_info() # preserve original exception
759 self.video_id = video_id
761 def format_traceback(self):
762 if self.traceback is None:
764 return u''.join(traceback.format_tb(self.traceback))
767 class RegexNotFoundError(ExtractorError):
768 """Error when a regex didn't match"""
772 class DownloadError(Exception):
773 """Download Error exception.
775 This exception may be thrown by FileDownloader objects if they are not
776 configured to continue on errors. They will contain the appropriate
def __init__(self, msg, exc_info=None):
    """Create the download error.

    exc_info, if given, is the original exception that caused the
    trouble (as returned by sys.exc_info()).
    """
    super(DownloadError, self).__init__(msg)
    self.exc_info = exc_info
785 class SameFileError(Exception):
786 """Same File exception.
788 This exception will be thrown by FileDownloader objects if they detect
789 multiple files would have to be downloaded to the same file on disk.
794 class PostProcessingError(Exception):
795 """Post Processing exception.
797 This exception may be raised by PostProcessor's .run() method to
798 indicate an error in the postprocessing task.
800 def __init__(self, msg):
803 class MaxDownloadsReached(Exception):
804 """ --max-downloads limit has been reached. """
808 class UnavailableVideoError(Exception):
809 """Unavailable Format exception.
811 This exception will be thrown when a video is requested
812 in a format that is not available for that video.
817 class ContentTooShortError(Exception):
818 """Content Too Short exception.
820 This exception may be raised by FileDownloader objects when a file they
821 download is too small for what the server announced first, indicating
822 the connection was probably interrupted.
def __init__(self, downloaded, expected):
    """Remember how many bytes arrived versus how many were announced."""
    # Both values are kept so the caller can report the exact mismatch.
    self.downloaded = downloaded
    self.expected = expected
832 class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
833 """Handler for HTTP requests and responses.
835 This class, when installed with an OpenerDirector, automatically adds
836 the standard headers to every HTTP request and handles gzipped and
837 deflated responses from web servers. If compression is to be avoided in
838 a particular request, the original request in the program code only has
839 to include the HTTP header "Youtubedl-No-Compression", which will be
840 removed before making the real request.
842 Part of this code was copied from:
844 http://techknack.net/python-urllib2-handlers/
846 Andrew Rowls, the author of that code, agreed to release it to the
853 return zlib.decompress(data, -zlib.MAX_WBITS)
855 return zlib.decompress(data)
858 def addinfourl_wrapper(stream, headers, url, code):
859 if hasattr(compat_urllib_request.addinfourl, 'getcode'):
860 return compat_urllib_request.addinfourl(stream, headers, url, code)
861 ret = compat_urllib_request.addinfourl(stream, headers, url)
865 def http_request(self, req):
866 for h, v in std_headers.items():
867 if h not in req.headers:
869 if 'Youtubedl-no-compression' in req.headers:
870 if 'Accept-encoding' in req.headers:
871 del req.headers['Accept-encoding']
872 del req.headers['Youtubedl-no-compression']
873 if 'Youtubedl-user-agent' in req.headers:
874 if 'User-agent' in req.headers:
875 del req.headers['User-agent']
876 req.headers['User-agent'] = req.headers['Youtubedl-user-agent']
877 del req.headers['Youtubedl-user-agent']
879 if sys.version_info < (2, 7) and '#' in req.get_full_url():
880 # Python 2.6 is brain-dead when it comes to fragments
881 req._Request__original = req._Request__original.partition('#')[0]
882 req._Request__r_type = req._Request__r_type.partition('#')[0]
886 def http_response(self, req, resp):
889 if resp.headers.get('Content-encoding', '') == 'gzip':
890 content = resp.read()
891 gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
893 uncompressed = io.BytesIO(gz.read())
894 except IOError as original_ioerror:
895 # There may be junk at the end of the file
896 # See http://stackoverflow.com/q/4928560/35070 for details
897 for i in range(1, 1024):
899 gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
900 uncompressed = io.BytesIO(gz.read())
905 raise original_ioerror
906 resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
907 resp.msg = old_resp.msg
909 if resp.headers.get('Content-encoding', '') == 'deflate':
910 gz = io.BytesIO(self.deflate(resp.read()))
911 resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
912 resp.msg = old_resp.msg
915 https_request = http_request
916 https_response = http_response
919 def parse_iso8601(date_str, delimiter='T'):
920 """ Return a UNIX timestamp from the given date """
926 r'Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$',
929 timezone = datetime.timedelta()
931 date_str = date_str[:-len(m.group(0))]
932 if not m.group('sign'):
933 timezone = datetime.timedelta()
935 sign = 1 if m.group('sign') == '+' else -1
936 timezone = datetime.timedelta(
937 hours=sign * int(m.group('hours')),
938 minutes=sign * int(m.group('minutes')))
939 date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
940 dt = datetime.datetime.strptime(date_str, date_format) - timezone
941 return calendar.timegm(dt.timetuple())
944 def unified_strdate(date_str):
945 """Return a string with the date in the format YYYYMMDD"""
952 date_str = date_str.replace(',', ' ')
953 # %z (UTC offset) is only supported in python>=3.2
954 date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str)
955 format_expressions = [
960 '%b %dst %Y %I:%M%p',
961 '%b %dnd %Y %I:%M%p',
962 '%b %dth %Y %I:%M%p',
973 '%Y-%m-%dT%H:%M:%SZ',
974 '%Y-%m-%dT%H:%M:%S.%fZ',
975 '%Y-%m-%dT%H:%M:%S.%f0Z',
977 '%Y-%m-%dT%H:%M:%S.%f',
980 for expression in format_expressions:
982 upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
985 if upload_date is None:
986 timetuple = email.utils.parsedate_tz(date_str)
988 upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
991 def determine_ext(url, default_ext=u'unknown_video'):
994 guess = url.partition(u'?')[0].rpartition(u'.')[2]
995 if re.match(r'^[A-Za-z0-9]+$', guess):
def subtitles_filename(filename, sub_lang, sub_format):
    """Build the subtitle file name for a media file.

    The media extension (everything after the last dot) is replaced by
    '<language>.<format>'.
    """
    base = filename.rsplit('.', 1)[0]
    return u'.'.join((base, sub_lang, sub_format))
1003 def date_from_str(date_str):
1005 Return a datetime object from a string in the format YYYYMMDD or
1006 (now|today)[+-][0-9](day|week|month|year)(s)?"""
1007 today = datetime.date.today()
1008 if date_str == 'now'or date_str == 'today':
1010 match = re.match('(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
1011 if match is not None:
1012 sign = match.group('sign')
1013 time = int(match.group('time'))
1016 unit = match.group('unit')
1017 # A bad approximation?
1021 elif unit == 'year':
1025 delta = datetime.timedelta(**{unit: time})
1026 return today + delta
1027 return datetime.datetime.strptime(date_str, "%Y%m%d").date()
1029 def hyphenate_date(date_str):
1031 Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
1032 match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
1033 if match is not None:
1034 return '-'.join(match.groups())
1038 class DateRange(object):
1039 """Represents a time interval between two dates"""
1040 def __init__(self, start=None, end=None):
1041 """start and end must be strings in the format accepted by date"""
1042 if start is not None:
1043 self.start = date_from_str(start)
1045 self.start = datetime.datetime.min.date()
1047 self.end = date_from_str(end)
1049 self.end = datetime.datetime.max.date()
1050 if self.start > self.end:
1051 raise ValueError('Date range: "%s" , the start date must be before the end date' % self)
1054 """Returns a range that only contains the given day"""
def __contains__(self, date):
    """Return True if the given date (or date string) lies in the range."""
    # Strings are accepted too and coerced via date_from_str.
    when = date if isinstance(date, datetime.date) else date_from_str(date)
    return self.start <= when <= self.end
1062 return '%s - %s' % ( self.start.isoformat(), self.end.isoformat())
1065 def platform_name():
1066 """ Returns the platform name as a compat_str """
1067 res = platform.platform()
1068 if isinstance(res, bytes):
1069 res = res.decode(preferredencoding())
1071 assert isinstance(res, compat_str)
1075 def _windows_write_string(s, out):
1076 """ Returns True if the string was written using special methods,
1077 False if it has yet to be written out."""
1078 # Adapted from http://stackoverflow.com/a/3259271/35070
1081 import ctypes.wintypes
1089 fileno = out.fileno()
1090 except AttributeError:
1091 # If the output stream doesn't have a fileno, it's virtual
1093 if fileno not in WIN_OUTPUT_IDS:
1096 GetStdHandle = ctypes.WINFUNCTYPE(
1097 ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
1098 ("GetStdHandle", ctypes.windll.kernel32))
1099 h = GetStdHandle(WIN_OUTPUT_IDS[fileno])
1101 WriteConsoleW = ctypes.WINFUNCTYPE(
1102 ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
1103 ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
1104 ctypes.wintypes.LPVOID)(("WriteConsoleW", ctypes.windll.kernel32))
1105 written = ctypes.wintypes.DWORD(0)
1107 GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)(("GetFileType", ctypes.windll.kernel32))
1108 FILE_TYPE_CHAR = 0x0002
1109 FILE_TYPE_REMOTE = 0x8000
1110 GetConsoleMode = ctypes.WINFUNCTYPE(
1111 ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
1112 ctypes.POINTER(ctypes.wintypes.DWORD))(
1113 ("GetConsoleMode", ctypes.windll.kernel32))
1114 INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value
1116 def not_a_console(handle):
1117 if handle == INVALID_HANDLE_VALUE or handle is None:
1119 return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR
1120 or GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)
1122 if not_a_console(h):
1125 def next_nonbmp_pos(s):
1127 return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
1128 except StopIteration:
1132 count = min(next_nonbmp_pos(s), 1024)
1134 ret = WriteConsoleW(
1135 h, s, count if count else 2, ctypes.byref(written), None)
1137 raise OSError('Failed to write string')
1138 if not count: # We just wrote a non-BMP character
1139 assert written.value == 2
1142 assert written.value > 0
1143 s = s[written.value:]
1147 def write_string(s, out=None, encoding=None):
1150 assert type(s) == compat_str
1152 if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
1153 if _windows_write_string(s, out):
1156 if ('b' in getattr(out, 'mode', '') or
1157 sys.version_info[0] < 3): # Python 2 lies about mode of sys.stderr
1158 byt = s.encode(encoding or preferredencoding(), 'ignore')
1160 elif hasattr(out, 'buffer'):
1161 enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
1162 byt = s.encode(enc, 'ignore')
1163 out.buffer.write(byt)
1169 def bytes_to_intlist(bs):
1172 if isinstance(bs[0], int): # Python 3
1175 return [ord(c) for c in bs]
1178 def intlist_to_bytes(xs):
1181 if isinstance(chr(0), bytes): # Python 2
1182 return ''.join([chr(x) for x in xs])
1187 # Cross-platform file locking
1188 if sys.platform == 'win32':
1189 import ctypes.wintypes
1192 class OVERLAPPED(ctypes.Structure):
1194 ('Internal', ctypes.wintypes.LPVOID),
1195 ('InternalHigh', ctypes.wintypes.LPVOID),
1196 ('Offset', ctypes.wintypes.DWORD),
1197 ('OffsetHigh', ctypes.wintypes.DWORD),
1198 ('hEvent', ctypes.wintypes.HANDLE),
1201 kernel32 = ctypes.windll.kernel32
1202 LockFileEx = kernel32.LockFileEx
1203 LockFileEx.argtypes = [
1204 ctypes.wintypes.HANDLE, # hFile
1205 ctypes.wintypes.DWORD, # dwFlags
1206 ctypes.wintypes.DWORD, # dwReserved
1207 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
1208 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
1209 ctypes.POINTER(OVERLAPPED) # Overlapped
1211 LockFileEx.restype = ctypes.wintypes.BOOL
1212 UnlockFileEx = kernel32.UnlockFileEx
1213 UnlockFileEx.argtypes = [
1214 ctypes.wintypes.HANDLE, # hFile
1215 ctypes.wintypes.DWORD, # dwReserved
1216 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
1217 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
1218 ctypes.POINTER(OVERLAPPED) # Overlapped
1220 UnlockFileEx.restype = ctypes.wintypes.BOOL
1221 whole_low = 0xffffffff
1222 whole_high = 0x7fffffff
def _lock_file(f, exclusive):
    """Acquire a Windows lock over the whole of file object f."""
    # Lock starting at offset 0; whole_low/whole_high span the maximum
    # lockable range, so the entire file is covered.
    overlapped = OVERLAPPED()
    overlapped.Offset = 0
    overlapped.OffsetHigh = 0
    overlapped.hEvent = 0
    # Keep the pointer alive on the file object; _unlock_file reuses it.
    f._lock_file_overlapped_p = ctypes.pointer(overlapped)
    handle = msvcrt.get_osfhandle(f.fileno())
    flags = 0x2 if exclusive else 0x0  # 0x2 selects an exclusive lock (LOCKFILE_EXCLUSIVE_LOCK)
    if not LockFileEx(handle, flags, 0,
                      whole_low, whole_high, f._lock_file_overlapped_p):
        raise OSError('Locking file failed: %r' % ctypes.FormatError())
def _unlock_file(f):
    """Release the Windows lock previously taken by _lock_file."""
    assert f._lock_file_overlapped_p
    handle = msvcrt.get_osfhandle(f.fileno())
    ok = UnlockFileEx(handle, 0,
                      whole_low, whole_high, f._lock_file_overlapped_p)
    if not ok:
        raise OSError('Unlocking file failed: %r' % ctypes.FormatError())
def _lock_file(f, exclusive):
    """Take an advisory flock on f: exclusive if requested, else shared."""
    mode = fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH
    fcntl.flock(f, mode)
def _unlock_file(f):
    """Drop any advisory flock currently held on f."""
    fcntl.flock(f, fcntl.LOCK_UN)
1252 class locked_file(object):
1253 def __init__(self, filename, mode, encoding=None):
1254 assert mode in ['r', 'a', 'w']
1255 self.f = io.open(filename, mode, encoding=encoding)
1258 def __enter__(self):
1259 exclusive = self.mode != 'r'
1261 _lock_file(self.f, exclusive)
1267 def __exit__(self, etype, value, traceback):
1269 _unlock_file(self.f)
def write(self, *args):
    """Proxy writes straight through to the underlying file object."""
    return self.f.write(*args)
def read(self, *args):
    """Proxy reads straight through to the underlying file object."""
    return self.f.read(*args)
def get_filesystem_encoding():
    """Return the file system encoding, falling back to UTF-8 when unset."""
    encoding = sys.getfilesystemencoding()
    if encoding is None:
        encoding = 'utf-8'
    return encoding
def shell_quote(args):
    """Return *args* joined into a single shell-safe command line string.

    Byte-string arguments (e.g. filenames from encodeFilename) are decoded
    with the filesystem encoding before quoting.
    """
    quoted_args = []
    encoding = get_filesystem_encoding()
    for a in args:
        if isinstance(a, bytes):
            # We may get a filename encoded with 'encodeFilename'
            a = a.decode(encoding)
        quoted_args.append(pipes.quote(a))
    return u' '.join(quoted_args)
def takewhile_inclusive(pred, seq):
    """ Like itertools.takewhile, but include the latest evaluated element
    (the first element so that Not pred(e)) """
    for e in seq:
        yield e
        if not pred(e):
            # Yield the failing element too, then stop.
            return
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use. """
    payload = {u'__youtubedl_smuggle': json.dumps(data)}
    return u'#'.join((url, compat_urllib_parse.urlencode(payload)))
def unsmuggle_url(smug_url, default=None):
    """Extract data previously embedded by smuggle_url().

    Returns (url, data); (smug_url, default) when nothing was smuggled.
    """
    if '#__youtubedl_smuggle' not in smug_url:
        return smug_url, default
    url, _, sdata = smug_url.rpartition(u'#')
    jsond = compat_parse_qs(sdata)[u'__youtubedl_smuggle'][0]
    data = json.loads(jsond)
    return url, data
def format_bytes(bytes):
    """Render a byte count as a human readable string, e.g. '1.00KiB'.

    Returns u'N/A' for None.  (Parameter shadows the builtin name, kept
    for interface compatibility.)
    """
    if bytes is None:
        return u'N/A'
    if type(bytes) is str:
        bytes = float(bytes)
    if bytes == 0.0:
        # math.log(0) would raise; zero bytes is plain 'B'.
        exponent = 0
    else:
        exponent = int(math.log(bytes, 1024.0))
    suffix = [u'B', u'KiB', u'MiB', u'GiB', u'TiB', u'PiB', u'EiB', u'ZiB', u'YiB'][exponent]
    converted = float(bytes) / float(1024 ** exponent)
    return u'%.2f%s' % (converted, suffix)
def get_term_width():
    """Best-effort terminal width: $COLUMNS first, then `stty size`.

    Returns None when neither source is usable.
    """
    columns = compat_getenv('COLUMNS', None)
    if columns:
        return int(columns)

    try:
        sp = subprocess.Popen(
            ['stty', 'size'],
            stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        out, err = sp.communicate()
        # `stty size` prints "rows cols"; we want the second field.
        return int(out.split()[1])
    except Exception:
        # stty missing / no tty / unparsable output: width is unknown.
        return None
def month_by_name(name):
    """ Return the number of a month by (locale-independently) English name """
    ENGLISH_NAMES = [
        u'January', u'February', u'March', u'April', u'May', u'June',
        u'July', u'August', u'September', u'October', u'November', u'December']
    try:
        return ENGLISH_NAMES.index(name) + 1
    except ValueError:
        # Unknown month name.
        return None
def fix_xml_ampersands(xml_str):
    """Replace all the '&' by '&amp;' in XML, leaving existing entities intact."""
    # Negative lookahead skips ampersands that already start an entity
    # (named, or numeric up to 4 digits / hex digits).
    return re.sub(
        r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
        u'&amp;',
        xml_str)
def setproctitle(title):
    """Best-effort: set the process title via glibc prctl(PR_SET_NAME).

    Silently does nothing when libc.so.6 cannot be loaded or lacks prctl
    (non-Linux platforms).
    """
    assert isinstance(title, compat_str)
    try:
        libc = ctypes.cdll.LoadLibrary("libc.so.6")
    except OSError:
        return
    title_bytes = title.encode('utf-8')
    buf = ctypes.create_string_buffer(len(title_bytes))
    buf.value = title_bytes
    try:
        # 15 == PR_SET_NAME
        libc.prctl(15, buf, 0, 0, 0)
    except AttributeError:
        return  # Strange libc, just skip this
def remove_start(s, start):
    """Return *s* with a leading *start* removed (unchanged if absent)."""
    if s.startswith(start):
        return s[len(start):]
    return s
def remove_end(s, end):
    """Return *s* with a trailing *end* removed (unchanged if absent)."""
    # The `end` truthiness guard matters: s[:-len('')] == s[:0] == ''
    # would wrongly empty the string for an empty suffix.
    if end and s.endswith(end):
        return s[:-len(end)]
    return s
def url_basename(url):
    """Return the last path component of *url* ('' when there is none)."""
    parsed_path = compat_urlparse.urlparse(url).path
    components = parsed_path.strip(u'/').split(u'/')
    return components[-1]
class HEADRequest(compat_urllib_request.Request):
    """A urllib Request that issues HEAD instead of GET/POST."""
    def get_method(self):
        return "HEAD"
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    """Convert *v* to int scaled by invscale/scale; *default* when None.

    With get_attr, first dereference that attribute off *v*.
    """
    if get_attr:
        if v is not None:
            v = getattr(v, get_attr, None)
    return default if v is None else (int(v) * invscale // scale)
def str_or_none(v, default=None):
    """Coerce *v* to compat_str, returning *default* when it is None."""
    if v is None:
        return default
    return compat_str(v)
def str_to_int(int_str):
    """ A more relaxed version of int_or_none """
    if int_str is None:
        return None
    # Strip thousands separators and an explicit plus sign.
    int_str = re.sub(r'[,\.\+]', u'', int_str)
    return int(int_str)
def float_or_none(v, scale=1, invscale=1, default=None):
    """Convert *v* to float scaled by invscale/scale; *default* when None."""
    if v is None:
        return default
    return float(v) * invscale / scale
def parse_duration(s):
    """Parse durations like '1:02:03', '90', '3min4sec' into seconds.

    Returns None for None or unrecognised input; float when a fractional
    seconds part is present, otherwise int.
    """
    if s is None:
        return None

    m = re.match(
        r'(?i)(?:(?:(?P<hours>[0-9]+)\s*(?:[:h]|hours?)\s*)?(?P<mins>[0-9]+)\s*(?:[:m]|mins?|minutes?)\s*)?(?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*(?:s|secs?|seconds?)?$', s)
    if m is None:
        return None
    res = int(m.group('secs'))
    if m.group('mins'):
        res += int(m.group('mins')) * 60
        if m.group('hours'):
            res += int(m.group('hours')) * 60 * 60
    if m.group('ms'):
        res += float(m.group('ms'))
    return res
def prepend_extension(filename, ext):
    """Insert *ext* before the real extension: 'a.mp4' + 'f' -> 'a.f.mp4'."""
    base, real_ext = os.path.splitext(filename)
    return u'%s.%s%s' % (base, ext, real_ext)
def check_executable(exe, args=[]):
    """ Checks if the given binary is installed somewhere in PATH, and returns its name.
    args can be a list of arguments for a short output (like -version) """
    try:
        subprocess.Popen([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
    except OSError:
        # Not found / not executable.
        return False
    return exe
class PagedList(object):
    """Base class for lazily paged result lists; subclasses implement
    getslice(start, end)."""
    def __len__(self):
        # This is only useful for tests
        return len(self.getslice())
class OnDemandPagedList(PagedList):
    """PagedList that fetches pages on demand via pagefunc(pagenum)."""

    def __init__(self, pagefunc, pagesize):
        self._pagefunc = pagefunc
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        """Return results [start:end) by querying only the needed pages."""
        res = []
        for pagenum in itertools.count(start // self._pagesize):
            firstid = pagenum * self._pagesize
            nextfirstid = pagenum * self._pagesize + self._pagesize
            if start >= nextfirstid:
                continue

            page_results = list(self._pagefunc(pagenum))

            startv = (
                start % self._pagesize
                if firstid <= start < nextfirstid
                else 0)
            endv = (
                ((end - 1) % self._pagesize) + 1
                if (end is not None and firstid <= end <= nextfirstid)
                else None)

            if startv != 0 or endv is not None:
                page_results = page_results[startv:endv]
            res.extend(page_results)

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            if len(page_results) + startv < self._pagesize:
                break

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == nextfirstid:
                break
        return res
class InAdvancePagedList(PagedList):
    """PagedList whose total page count is known up front."""

    def __init__(self, pagefunc, pagecount, pagesize):
        self._pagefunc = pagefunc
        self._pagecount = pagecount
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        """Return results [start:end) by iterating the covered page range."""
        res = []
        start_page = start // self._pagesize
        end_page = (
            self._pagecount if end is None else (end // self._pagesize + 1))
        skip_elems = start - start_page * self._pagesize
        only_more = None if end is None else end - start
        for pagenum in range(start_page, end_page):
            page = list(self._pagefunc(pagenum))
            if skip_elems:
                # Only the first fetched page needs its head trimmed.
                page = page[skip_elems:]
                skip_elems = None
            if only_more is not None:
                if len(page) < only_more:
                    only_more -= len(page)
                else:
                    # This page satisfies the remaining demand; trim and stop.
                    page = page[:only_more]
                    res.extend(page)
                    break
            res.extend(page)
        return res
def uppercase_escape(s):
    """Decode \\UXXXXXXXX escape sequences embedded in *s*."""
    unicode_escape = codecs.getdecoder('unicode_escape')
    return re.sub(
        r'\\U[0-9a-fA-F]{8}',
        lambda m: unicode_escape(m.group(0))[0],
        s)
def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986"""
    # Python 2 needs a byte string for percent-quoting; on Python 3 the
    # version check short-circuits so `unicode` is never evaluated.
    if sys.version_info < (3, 0) and isinstance(s, unicode):
        s = s.encode('utf-8')
    safe_chars = "%/;:@&=+$,!~*'()?#[]"
    return compat_urllib_parse.quote(s, safe_chars)
def escape_url(url):
    """Escape URL as suggested by RFC 3986"""
    url_parsed = compat_urllib_parse_urlparse(url)
    # Escape each component separately so the URL structure survives,
    # then reassemble.
    return url_parsed._replace(
        path=escape_rfc3986(url_parsed.path),
        params=escape_rfc3986(url_parsed.params),
        query=escape_rfc3986(url_parsed.query),
        fragment=escape_rfc3986(url_parsed.fragment)
    ).geturl()
try:
    struct.pack(u'!I', 0)
except TypeError:
    # In Python 2.6 (and some 2.7 versions), struct requires a bytes argument
    def struct_pack(spec, *args):
        if isinstance(spec, compat_str):
            spec = spec.encode('ascii')
        return struct.pack(spec, *args)

    def struct_unpack(spec, *args):
        if isinstance(spec, compat_str):
            spec = spec.encode('ascii')
        return struct.unpack(spec, *args)
else:
    # struct accepts text format strings natively.
    struct_pack = struct.pack
    struct_unpack = struct.unpack
def read_batch_urls(batch_fd):
    """Read a batch file object and return its list of cleaned-up URLs.

    Skips blank lines and comment lines starting with '#', ';' or ']';
    strips a leading mojibake UTF-8 BOM. Closes batch_fd when done.
    """
    def fixup(url):
        if not isinstance(url, compat_str):
            url = url.decode('utf-8', 'replace')
        BOM_UTF8 = u'\xef\xbb\xbf'
        if url.startswith(BOM_UTF8):
            url = url[len(BOM_UTF8):]
        url = url.strip()
        if url.startswith(('#', ';', ']')):
            return False
        return url

    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]
def urlencode_postdata(*args, **kargs):
    """URL-encode POST data and return it as ASCII bytes."""
    encoded = compat_urllib_parse.urlencode(*args, **kargs)
    return encoded.encode('ascii')
try:
    etree_iter = xml.etree.ElementTree.Element.iter
except AttributeError:  # Python <=2.6
    # Old ElementTree lacks Element.iter; findall('.//*') walks all
    # descendants (but, unlike iter, not the element itself).
    etree_iter = lambda n: n.findall('.//*')
def parse_xml(s):
    """Parse the XML document in string *s*, ignoring any DOCTYPE."""
    class TreeBuilder(xml.etree.ElementTree.TreeBuilder):
        def doctype(self, name, pubid, system):
            pass  # Ignore doctypes

    parser = xml.etree.ElementTree.XMLParser(target=TreeBuilder())
    kwargs = {'parser': parser} if sys.version_info >= (2, 7) else {}
    tree = xml.etree.ElementTree.XML(s.encode('utf-8'), **kwargs)
    # Fix up XML parser in Python 2.x: force text nodes to unicode.
    if sys.version_info < (3, 0):
        for n in etree_iter(tree):
            if n.text is not None:
                if not isinstance(n.text, compat_str):
                    n.text = n.text.decode('utf-8')
    return tree
if sys.version_info < (3, 0) and sys.platform == 'win32':
    def compat_getpass(prompt, *args, **kwargs):
        # Python 2 on Windows chokes on unicode prompts: encode first.
        if isinstance(prompt, compat_str):
            prompt = prompt.encode(preferredencoding())
        return getpass.getpass(prompt, *args, **kwargs)
else:
    compat_getpass = getpass.getpass
def strip_jsonp(code):
    """Strip a JSONP wrapper (callback name and parentheses) from *code*."""
    pattern = r'(?s)^[a-zA-Z0-9_]+\s*\(\s*(.*)\);?\s*?\s*$'
    return re.sub(pattern, r'\1', code)
def js_to_json(code):
    """Convert simple JavaScript object literals to valid JSON:
    single-quoted / bare keys and values become double-quoted, and
    trailing commas before ']' are removed."""
    def fix_kv(m):
        key = m.group(2)
        if key.startswith("'"):
            assert key.endswith("'")
            assert '"' not in key
            key = '"%s"' % key[1:-1]
        elif not key.startswith('"'):
            key = '"%s"' % key

        value = m.group(4)
        if value.startswith("'"):
            assert value.endswith("'")
            assert '"' not in value
            value = '"%s"' % value[1:-1]

        return m.group(1) + key + m.group(3) + value

    res = re.sub(r'''(?x)
            ([{,]\s*)
            ("[^"]*"|\'[^\']*\'|[a-z0-9A-Z]+)
            (:\s*)
            ([0-9.]+|true|false|"[^"]*"|\'[^\']*\'|\[|\{)
        ''', fix_kv, code)
    # Drop trailing commas before a closing bracket.
    res = re.sub(r',(\s*\])', lambda m: m.group(1), res)
    return res
def qualities(quality_ids):
    """ Get a numeric quality value out of a list of possible values """
    def q(qid):
        try:
            return quality_ids.index(qid)
        except ValueError:
            # Unknown quality ranks below everything known.
            return -1
    return q
# Default output filename template: "<title>-<id>.<ext>".
DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
try:
    subprocess_check_output = subprocess.check_output
except AttributeError:
    # Python < 2.7 lacks check_output; emulate it (without `input` support).
    def subprocess_check_output(*args, **kwargs):
        assert 'input' not in kwargs
        p = subprocess.Popen(*args, stdout=subprocess.PIPE, **kwargs)
        output, _ = p.communicate()
        ret = p.returncode
        if ret:
            raise subprocess.CalledProcessError(ret, p.args, output=output)
        return output
def limit_length(s, length):
    """ Add ellipses to overly long strings """
    if s is None:
        return None
    ELLIPSES = u'...'
    if len(s) > length:
        # Truncate so the result, ellipses included, fits in `length`.
        return s[:length - len(ELLIPSES)] + ELLIPSES
    return s