2 # -*- coding: utf-8 -*-
29 import xml.etree.ElementTree
33 import urllib.request as compat_urllib_request
34 except ImportError: # Python 2
35 import urllib2 as compat_urllib_request
38 import urllib.error as compat_urllib_error
39 except ImportError: # Python 2
40 import urllib2 as compat_urllib_error
43 import urllib.parse as compat_urllib_parse
44 except ImportError: # Python 2
45 import urllib as compat_urllib_parse
48 from urllib.parse import urlparse as compat_urllib_parse_urlparse
49 except ImportError: # Python 2
50 from urlparse import urlparse as compat_urllib_parse_urlparse
53 import urllib.parse as compat_urlparse
54 except ImportError: # Python 2
55 import urlparse as compat_urlparse
58 import http.cookiejar as compat_cookiejar
59 except ImportError: # Python 2
60 import cookielib as compat_cookiejar
63 import html.entities as compat_html_entities
64 except ImportError: # Python 2
65 import htmlentitydefs as compat_html_entities
68 import html.parser as compat_html_parser
69 except ImportError: # Python 2
70 import HTMLParser as compat_html_parser
73 import http.client as compat_http_client
74 except ImportError: # Python 2
75 import httplib as compat_http_client
78 from urllib.error import HTTPError as compat_HTTPError
79 except ImportError: # Python 2
80 from urllib2 import HTTPError as compat_HTTPError
83 from urllib.request import urlretrieve as compat_urlretrieve
84 except ImportError: # Python 2
85 from urllib import urlretrieve as compat_urlretrieve
89 from subprocess import DEVNULL
90 compat_subprocess_get_DEVNULL = lambda: DEVNULL
92 compat_subprocess_get_DEVNULL = lambda: open(os.path.devnull, 'w')
95 from urllib.parse import unquote as compat_urllib_parse_unquote
97 def compat_urllib_parse_unquote(string, encoding='utf-8', errors='replace'):
100 res = string.split('%')
107 # pct_sequence: contiguous sequence of percent-encoded bytes, decoded
114 pct_sequence += item[:2].decode('hex')
117 # This segment was just a single percent-encoded character.
118 # May be part of a sequence of code units, so delay decoding.
119 # (Stored in pct_sequence).
123 # Encountered non-percent-encoded characters. Flush the current
125 string += pct_sequence.decode(encoding, errors) + rest
128 # Flush the final pct_sequence
129 string += pct_sequence.decode(encoding, errors)
134 from urllib.parse import parse_qs as compat_parse_qs
135 except ImportError: # Python 2
136 # HACK: The following is the correct parse_qs implementation from cpython 3's stdlib.
137 # Python 2's version is apparently totally broken
139 def _parse_qsl(qs, keep_blank_values=False, strict_parsing=False,
140 encoding='utf-8', errors='replace'):
141 qs, _coerce_result = qs, unicode
142 pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')]
144 for name_value in pairs:
145 if not name_value and not strict_parsing:
147 nv = name_value.split('=', 1)
150 raise ValueError("bad query field: %r" % (name_value,))
151 # Handle case of a control-name with no equal sign
152 if keep_blank_values:
156 if len(nv[1]) or keep_blank_values:
157 name = nv[0].replace('+', ' ')
158 name = compat_urllib_parse_unquote(
159 name, encoding=encoding, errors=errors)
160 name = _coerce_result(name)
161 value = nv[1].replace('+', ' ')
162 value = compat_urllib_parse_unquote(
163 value, encoding=encoding, errors=errors)
164 value = _coerce_result(value)
165 r.append((name, value))
168 def compat_parse_qs(qs, keep_blank_values=False, strict_parsing=False,
169 encoding='utf-8', errors='replace'):
171 pairs = _parse_qsl(qs, keep_blank_values, strict_parsing,
172 encoding=encoding, errors=errors)
173 for name, value in pairs:
174 if name in parsed_result:
175 parsed_result[name].append(value)
177 parsed_result[name] = [value]
181 compat_str = unicode # Python 2
186 compat_chr = unichr # Python 2
191 from xml.etree.ElementTree import ParseError as compat_xml_parse_error
192 except ImportError: # Python 2.6
193 from xml.parsers.expat import ExpatError as compat_xml_parse_error
196 from shlex import quote as shlex_quote
197 except ImportError: # Python < 3.3
199 return "'" + s.replace("'", "'\"'\"'") + "'"
203 if type(c) is int: return c
206 # This is not clearly defined otherwise
207 compiled_regex_type = type(re.compile(''))
210 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0 (Chrome)',
211 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
212 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
213 'Accept-Encoding': 'gzip, deflate',
214 'Accept-Language': 'en-us,en;q=0.5',
217 def preferredencoding():
218 """Get preferred encoding.
220 Returns the best encoding scheme for the system, based on
221 locale.getpreferredencoding() and some further tweaks.
224 pref = locale.getpreferredencoding()
231 if sys.version_info < (3,0):
233 print(s.encode(preferredencoding(), 'xmlcharrefreplace'))
236 assert type(s) == type(u'')
240 def write_json_file(obj, fn):
241 """ Encode obj as JSON and write it to fn, atomically """
245 'prefix': os.path.basename(fn) + '.',
246 'dir': os.path.dirname(fn),
250 # In Python 2.x, json.dump expects a bytestream.
251 # In Python 3.x, it writes to a character stream
252 if sys.version_info < (3, 0):
260 tf = tempfile.NamedTemporaryFile(**args)
265 os.rename(tf.name, fn)
274 if sys.version_info >= (2, 7):
def find_xpath_attr(node, xpath, key, val):
    """Return the first element matching xpath[@key=val], or None."""
    # key and val are interpolated verbatim into the XPath expression,
    # so restrict them to characters that cannot break its syntax.
    assert re.match(r'^[a-zA-Z-]+$', key)
    assert re.match(r'^[a-zA-Z0-9@\s:._-]*$', val)
    query = xpath + u"[@%s='%s']" % (key, val)
    return node.find(query)
282 def find_xpath_attr(node, xpath, key, val):
283 # Here comes the crazy part: In 2.6, if the xpath is a unicode,
284 # .//node does not match if a node is a direct child of . !
285 if isinstance(xpath, unicode):
286 xpath = xpath.encode('ascii')
288 for f in node.findall(xpath):
289 if f.attrib.get(key) == val:
293 # On python2.6 the xml.etree.ElementTree.Element methods don't support
294 # the namespace parameter
295 def xpath_with_ns(path, ns_map):
296 components = [c.split(':') for c in path.split('/')]
300 replaced.append(c[0])
303 replaced.append('{%s}%s' % (ns_map[ns], tag))
304 return '/'.join(replaced)
307 def xpath_text(node, xpath, name=None, fatal=False):
308 if sys.version_info < (2, 7): # Crazy 2.6
309 xpath = xpath.encode('ascii')
314 name = xpath if name is None else name
315 raise ExtractorError('Could not find XML element %s' % name)
321 compat_html_parser.locatestarttagend = re.compile(r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re.VERBOSE) # backport bugfix
322 class BaseHTMLParser(compat_html_parser.HTMLParser):
324 compat_html_parser.HTMLParser.__init__(self)
327 def loads(self, html):
332 class AttrParser(BaseHTMLParser):
333 """Modified HTMLParser that isolates a tag with the specified attribute"""
334 def __init__(self, attribute, value):
335 self.attribute = attribute
340 self.watch_startpos = False
342 BaseHTMLParser.__init__(self)
344 def error(self, message):
345 if self.error_count > 10 or self.started:
346 raise compat_html_parser.HTMLParseError(message, self.getpos())
347 self.rawdata = '\n'.join(self.html.split('\n')[self.getpos()[0]:]) # skip one line
348 self.error_count += 1
351 def handle_starttag(self, tag, attrs):
354 self.find_startpos(None)
355 if self.attribute in attrs and attrs[self.attribute] == self.value:
358 self.watch_startpos = True
360 if not tag in self.depth: self.depth[tag] = 0
363 def handle_endtag(self, tag):
365 if tag in self.depth: self.depth[tag] -= 1
366 if self.depth[self.result[0]] == 0:
368 self.result.append(self.getpos())
def find_startpos(self, x):
    """Needed to put the start position of the result (self.result[1])
    after the opening tag with the requested id"""
    # watch_startpos is set when the wanted start tag was seen; the first
    # event after it means getpos() is now just past that opening tag,
    # i.e. where the captured content begins. The x argument is ignored.
    if self.watch_startpos:
        self.watch_startpos = False
        self.result.append(self.getpos())
# Any parser event following the opening tag marks where its content
# starts, so every handler funnels into find_startpos.
handle_entityref = handle_charref = handle_data = handle_comment = \
    handle_decl = handle_pi = unknown_decl = find_startpos
379 def get_result(self):
380 if self.result is None:
382 if len(self.result) != 3:
384 lines = self.html.split('\n')
385 lines = lines[self.result[1][0]-1:self.result[2][0]]
386 lines[0] = lines[0][self.result[1][1]:]
388 lines[-1] = lines[-1][:self.result[2][1]-self.result[1][1]]
389 lines[-1] = lines[-1][:self.result[2][1]]
390 return '\n'.join(lines).strip()
391 # Hack for https://github.com/rg3/youtube-dl/issues/662
392 if sys.version_info < (2, 7, 3):
393 AttrParser.parse_endtag = (lambda self, i:
394 i + len("</scr'+'ipt>")
395 if self.rawdata[i:].startswith("</scr'+'ipt>")
396 else compat_html_parser.HTMLParser.parse_endtag(self, i))
def get_element_by_id(id, html):
    """Extract the content of the element whose id attribute equals *id*."""
    # An id lookup is just an attribute lookup on the "id" attribute.
    return get_element_by_attribute("id", id, html)
402 def get_element_by_attribute(attribute, value, html):
403 """Return the content of the tag with the specified attribute in the passed HTML document"""
404 parser = AttrParser(attribute, value)
407 except compat_html_parser.HTMLParseError:
409 return parser.get_result()
411 class MetaParser(BaseHTMLParser):
413 Modified HTMLParser that isolates a meta tag with the specified name
416 def __init__(self, name):
417 BaseHTMLParser.__init__(self)
422 def handle_starttag(self, tag, attrs):
426 if attrs.get('name') == self.name:
427 self.result = attrs.get('content')
429 def get_result(self):
432 def get_meta_content(name, html):
434 Return the content attribute from the meta tag with the given name attribute.
436 parser = MetaParser(name)
439 except compat_html_parser.HTMLParseError:
441 return parser.get_result()
444 def clean_html(html):
445 """Clean an HTML snippet into a readable string"""
447 html = html.replace('\n', ' ')
448 html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
449 html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
451 html = re.sub('<.*?>', '', html)
452 # Replace html entities
453 html = unescapeHTML(html)
457 def sanitize_open(filename, open_mode):
458 """Try to open the given filename, and slightly tweak it if this fails.
460 Attempts to open the given filename. If this fails, it tries to change
461 the filename slightly, step by step, until it's either able to open it
462 or it fails and raises a final exception, like the standard open()
465 It returns the tuple (stream, definitive_file_name).
469 if sys.platform == 'win32':
471 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
472 return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
473 stream = open(encodeFilename(filename), open_mode)
474 return (stream, filename)
475 except (IOError, OSError) as err:
476 if err.errno in (errno.EACCES,):
479 # In case of error, try to remove win32 forbidden chars
480 alt_filename = os.path.join(
481 re.sub(u'[/<>:"\\|\\\\?\\*]', u'#', path_part)
482 for path_part in os.path.split(filename)
484 if alt_filename == filename:
487 # An exception here should be caught in the caller
488 stream = open(encodeFilename(filename), open_mode)
489 return (stream, alt_filename)
492 def timeconvert(timestr):
493 """Convert RFC 2822 defined time string into system timestamp"""
495 timetuple = email.utils.parsedate_tz(timestr)
496 if timetuple is not None:
497 timestamp = email.utils.mktime_tz(timetuple)
500 def sanitize_filename(s, restricted=False, is_id=False):
501 """Sanitizes a string so it could be used as part of a filename.
502 If restricted is set, use a stricter subset of allowed characters.
503 Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
505 def replace_insane(char):
506 if char == '?' or ord(char) < 32 or ord(char) == 127:
509 return '' if restricted else '\''
511 return '_-' if restricted else ' -'
512 elif char in '\\/|*<>':
514 if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
516 if restricted and ord(char) > 127:
520 result = u''.join(map(replace_insane, s))
522 while '__' in result:
523 result = result.replace('__', '_')
524 result = result.strip('_')
525 # Common case of "Foreign band name - English song title"
526 if restricted and result.startswith('-_'):
532 def orderedSet(iterable):
533 """ Remove all duplicates from the input iterable """
541 def _htmlentity_transform(entity):
542 """Transforms an HTML entity to a character."""
543 # Known non-numeric HTML entity
544 if entity in compat_html_entities.name2codepoint:
545 return compat_chr(compat_html_entities.name2codepoint[entity])
547 mobj = re.match(r'#(x?[0-9]+)', entity)
549 numstr = mobj.group(1)
550 if numstr.startswith(u'x'):
552 numstr = u'0%s' % numstr
555 return compat_chr(int(numstr, base))
557 # Unknown entity in name, return its literal representation
558 return (u'&%s;' % entity)
564 assert type(s) == compat_str
567 r'&([^;]+);', lambda m: _htmlentity_transform(m.group(1)), s)
570 def encodeFilename(s, for_subprocess=False):
572 @param s The name of the file
575 assert type(s) == compat_str
577 # Python 3 has a Unicode API
578 if sys.version_info >= (3, 0):
581 if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
582 # Pass u'' directly to use Unicode APIs on Windows 2000 and up
583 # (Detecting Windows NT 4 is tricky because 'major >= 4' would
584 # match Windows 9x series as well. Besides, NT 4 is obsolete.)
585 if not for_subprocess:
588 # For subprocess calls, encode with locale encoding
589 # Refer to http://stackoverflow.com/a/9951851/35070
590 encoding = preferredencoding()
592 encoding = sys.getfilesystemencoding()
595 return s.encode(encoding, 'ignore')
598 def encodeArgument(s):
599 if not isinstance(s, compat_str):
600 # Legacy code that uses byte strings
601 # Uncomment the following line after fixing all post processors
602 #assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
603 s = s.decode('ascii')
604 return encodeFilename(s, True)
607 def decodeOption(optval):
610 if isinstance(optval, bytes):
611 optval = optval.decode(preferredencoding())
613 assert isinstance(optval, compat_str)
616 def formatSeconds(secs):
618 return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
620 return '%d:%02d' % (secs // 60, secs % 60)
625 def make_HTTPS_handler(opts_no_check_certificate, **kwargs):
626 if sys.version_info < (3, 2):
629 class HTTPSConnectionV3(httplib.HTTPSConnection):
630 def __init__(self, *args, **kwargs):
631 httplib.HTTPSConnection.__init__(self, *args, **kwargs)
634 sock = socket.create_connection((self.host, self.port), self.timeout)
635 if getattr(self, '_tunnel_host', False):
639 self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_TLSv1)
641 self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_SSLv23)
643 class HTTPSHandlerV3(compat_urllib_request.HTTPSHandler):
644 def https_open(self, req):
645 return self.do_open(HTTPSConnectionV3, req)
646 return HTTPSHandlerV3(**kwargs)
647 elif hasattr(ssl, 'create_default_context'): # Python >= 3.4
648 context = ssl.create_default_context(ssl.Purpose.CLIENT_AUTH)
649 context.options &= ~ssl.OP_NO_SSLv3 # Allow older, not-as-secure SSLv3
650 if opts_no_check_certificate:
651 context.verify_mode = ssl.CERT_NONE
652 return compat_urllib_request.HTTPSHandler(context=context, **kwargs)
654 context = ssl.SSLContext(ssl.PROTOCOL_SSLv23)
655 context.verify_mode = (ssl.CERT_NONE
656 if opts_no_check_certificate
657 else ssl.CERT_REQUIRED)
658 context.set_default_verify_paths()
660 context.load_default_certs()
661 except AttributeError:
663 return compat_urllib_request.HTTPSHandler(context=context, **kwargs)
665 class ExtractorError(Exception):
666 """Error during info extraction."""
667 def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
668 """ tb, if given, is the original traceback (so that it can be printed out).
669 If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
672 if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
674 if video_id is not None:
675 msg = video_id + ': ' + msg
677 msg = msg + u'; please report this issue on https://yt-dl.org/bug . Be sure to call youtube-dl with the --verbose flag and include its complete output. Make sure you are using the latest version; type youtube-dl -U to update.'
678 super(ExtractorError, self).__init__(msg)
681 self.exc_info = sys.exc_info() # preserve original exception
683 self.video_id = video_id
685 def format_traceback(self):
686 if self.traceback is None:
688 return u''.join(traceback.format_tb(self.traceback))
691 class RegexNotFoundError(ExtractorError):
692 """Error when a regex didn't match"""
696 class DownloadError(Exception):
697 """Download Error exception.
699 This exception may be thrown by FileDownloader objects if they are not
700 configured to continue on errors. They will contain the appropriate
703 def __init__(self, msg, exc_info=None):
704 """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
705 super(DownloadError, self).__init__(msg)
706 self.exc_info = exc_info
709 class SameFileError(Exception):
710 """Same File exception.
712 This exception will be thrown by FileDownloader objects if they detect
713 multiple files would have to be downloaded to the same file on disk.
718 class PostProcessingError(Exception):
719 """Post Processing exception.
721 This exception may be raised by PostProcessor's .run() method to
722 indicate an error in the postprocessing task.
724 def __init__(self, msg):
727 class MaxDownloadsReached(Exception):
728 """ --max-downloads limit has been reached. """
732 class UnavailableVideoError(Exception):
733 """Unavailable Format exception.
735 This exception will be thrown when a video is requested
736 in a format that is not available for that video.
741 class ContentTooShortError(Exception):
742 """Content Too Short exception.
744 This exception may be raised by FileDownloader objects when a file they
745 download is too small for what the server announced first, indicating
746 the connection was probably interrupted.
def __init__(self, downloaded, expected):
    # downloaded: amount actually received; expected: amount the server
    # announced up front (presumably byte counts — TODO confirm units).
    self.downloaded = downloaded
    self.expected = expected
756 class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
757 """Handler for HTTP requests and responses.
759 This class, when installed with an OpenerDirector, automatically adds
760 the standard headers to every HTTP request and handles gzipped and
761 deflated responses from web servers. If compression is to be avoided in
762 a particular request, the original request in the program code only has
763 to include the HTTP header "Youtubedl-No-Compression", which will be
764 removed before making the real request.
766 Part of this code was copied from:
768 http://techknack.net/python-urllib2-handlers/
770 Andrew Rowls, the author of that code, agreed to release it to the
777 return zlib.decompress(data, -zlib.MAX_WBITS)
779 return zlib.decompress(data)
782 def addinfourl_wrapper(stream, headers, url, code):
783 if hasattr(compat_urllib_request.addinfourl, 'getcode'):
784 return compat_urllib_request.addinfourl(stream, headers, url, code)
785 ret = compat_urllib_request.addinfourl(stream, headers, url)
789 def http_request(self, req):
790 for h, v in std_headers.items():
791 if h not in req.headers:
793 if 'Youtubedl-no-compression' in req.headers:
794 if 'Accept-encoding' in req.headers:
795 del req.headers['Accept-encoding']
796 del req.headers['Youtubedl-no-compression']
797 if 'Youtubedl-user-agent' in req.headers:
798 if 'User-agent' in req.headers:
799 del req.headers['User-agent']
800 req.headers['User-agent'] = req.headers['Youtubedl-user-agent']
801 del req.headers['Youtubedl-user-agent']
803 if sys.version_info < (2, 7) and '#' in req.get_full_url():
804 # Python 2.6 is brain-dead when it comes to fragments
805 req._Request__original = req._Request__original.partition('#')[0]
806 req._Request__r_type = req._Request__r_type.partition('#')[0]
810 def http_response(self, req, resp):
813 if resp.headers.get('Content-encoding', '') == 'gzip':
814 content = resp.read()
815 gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
817 uncompressed = io.BytesIO(gz.read())
818 except IOError as original_ioerror:
819 # There may be junk add the end of the file
820 # See http://stackoverflow.com/q/4928560/35070 for details
821 for i in range(1, 1024):
823 gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
824 uncompressed = io.BytesIO(gz.read())
829 raise original_ioerror
830 resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
831 resp.msg = old_resp.msg
833 if resp.headers.get('Content-encoding', '') == 'deflate':
834 gz = io.BytesIO(self.deflate(resp.read()))
835 resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
836 resp.msg = old_resp.msg
839 https_request = http_request
840 https_response = http_response
843 def parse_iso8601(date_str, delimiter='T'):
844 """ Return a UNIX timestamp from the given date """
850 r'Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$',
853 timezone = datetime.timedelta()
855 date_str = date_str[:-len(m.group(0))]
856 if not m.group('sign'):
857 timezone = datetime.timedelta()
859 sign = 1 if m.group('sign') == '+' else -1
860 timezone = datetime.timedelta(
861 hours=sign * int(m.group('hours')),
862 minutes=sign * int(m.group('minutes')))
863 date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
864 dt = datetime.datetime.strptime(date_str, date_format) - timezone
865 return calendar.timegm(dt.timetuple())
868 def unified_strdate(date_str):
869 """Return a string with the date in the format YYYYMMDD"""
876 date_str = date_str.replace(',', ' ')
877 # %z (UTC offset) is only supported in python>=3.2
878 date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str)
879 format_expressions = [
884 '%b %dst %Y %I:%M%p',
885 '%b %dnd %Y %I:%M%p',
886 '%b %dth %Y %I:%M%p',
897 '%Y-%m-%dT%H:%M:%SZ',
898 '%Y-%m-%dT%H:%M:%S.%fZ',
899 '%Y-%m-%dT%H:%M:%S.%f0Z',
901 '%Y-%m-%dT%H:%M:%S.%f',
904 for expression in format_expressions:
906 upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
909 if upload_date is None:
910 timetuple = email.utils.parsedate_tz(date_str)
912 upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
915 def determine_ext(url, default_ext=u'unknown_video'):
918 guess = url.partition(u'?')[0].rpartition(u'.')[2]
919 if re.match(r'^[A-Za-z0-9]+$', guess):
def subtitles_filename(filename, sub_lang, sub_format):
    """Build a subtitle filename: strip the media extension, then append
    the language code and the subtitle format."""
    base = filename.rsplit('.', 1)[0]
    return u'.'.join([base, sub_lang, sub_format])
927 def date_from_str(date_str):
929 Return a datetime object from a string in the format YYYYMMDD or
930 (now|today)[+-][0-9](day|week|month|year)(s)?"""
931 today = datetime.date.today()
932 if date_str == 'now'or date_str == 'today':
934 match = re.match('(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
935 if match is not None:
936 sign = match.group('sign')
937 time = int(match.group('time'))
940 unit = match.group('unit')
949 delta = datetime.timedelta(**{unit: time})
951 return datetime.datetime.strptime(date_str, "%Y%m%d").date()
953 def hyphenate_date(date_str):
955 Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
956 match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
957 if match is not None:
958 return '-'.join(match.groups())
962 class DateRange(object):
963 """Represents a time interval between two dates"""
964 def __init__(self, start=None, end=None):
965 """start and end must be strings in the format accepted by date"""
966 if start is not None:
967 self.start = date_from_str(start)
969 self.start = datetime.datetime.min.date()
971 self.end = date_from_str(end)
973 self.end = datetime.datetime.max.date()
974 if self.start > self.end:
975 raise ValueError('Date range: "%s" , the start date must be before the end date' % self)
978 """Returns a range that only contains the given day"""
980 def __contains__(self, date):
981 """Check if the date is in the range"""
982 if not isinstance(date, datetime.date):
983 date = date_from_str(date)
984 return self.start <= date <= self.end
986 return '%s - %s' % ( self.start.isoformat(), self.end.isoformat())
990 """ Returns the platform name as a compat_str """
991 res = platform.platform()
992 if isinstance(res, bytes):
993 res = res.decode(preferredencoding())
995 assert isinstance(res, compat_str)
999 def _windows_write_string(s, out):
1000 """ Returns True if the string was written using special methods,
1001 False if it has yet to be written out."""
1002 # Adapted from http://stackoverflow.com/a/3259271/35070
1005 import ctypes.wintypes
1013 fileno = out.fileno()
1014 except AttributeError:
1015 # If the output stream doesn't have a fileno, it's virtual
1017 if fileno not in WIN_OUTPUT_IDS:
1020 GetStdHandle = ctypes.WINFUNCTYPE(
1021 ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
1022 ("GetStdHandle", ctypes.windll.kernel32))
1023 h = GetStdHandle(WIN_OUTPUT_IDS[fileno])
1025 WriteConsoleW = ctypes.WINFUNCTYPE(
1026 ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
1027 ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
1028 ctypes.wintypes.LPVOID)(("WriteConsoleW", ctypes.windll.kernel32))
1029 written = ctypes.wintypes.DWORD(0)
1031 GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)(("GetFileType", ctypes.windll.kernel32))
1032 FILE_TYPE_CHAR = 0x0002
1033 FILE_TYPE_REMOTE = 0x8000
1034 GetConsoleMode = ctypes.WINFUNCTYPE(
1035 ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
1036 ctypes.POINTER(ctypes.wintypes.DWORD))(
1037 ("GetConsoleMode", ctypes.windll.kernel32))
1038 INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value
1040 def not_a_console(handle):
1041 if handle == INVALID_HANDLE_VALUE or handle is None:
1043 return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR
1044 or GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)
1046 if not_a_console(h):
1049 def next_nonbmp_pos(s):
1051 return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
1052 except StopIteration:
1056 count = min(next_nonbmp_pos(s), 1024)
1058 ret = WriteConsoleW(
1059 h, s, count if count else 2, ctypes.byref(written), None)
1061 raise OSError('Failed to write string')
1062 if not count: # We just wrote a non-BMP character
1063 assert written.value == 2
1066 assert written.value > 0
1067 s = s[written.value:]
1071 def write_string(s, out=None, encoding=None):
1074 assert type(s) == compat_str
1076 if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
1077 if _windows_write_string(s, out):
1080 if ('b' in getattr(out, 'mode', '') or
1081 sys.version_info[0] < 3): # Python 2 lies about mode of sys.stderr
1082 byt = s.encode(encoding or preferredencoding(), 'ignore')
1084 elif hasattr(out, 'buffer'):
1085 enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
1086 byt = s.encode(enc, 'ignore')
1087 out.buffer.write(byt)
1093 def bytes_to_intlist(bs):
1096 if isinstance(bs[0], int): # Python 3
1099 return [ord(c) for c in bs]
1102 def intlist_to_bytes(xs):
1105 if isinstance(chr(0), bytes): # Python 2
1106 return ''.join([chr(x) for x in xs])
1111 # Cross-platform file locking
1112 if sys.platform == 'win32':
1113 import ctypes.wintypes
1116 class OVERLAPPED(ctypes.Structure):
1118 ('Internal', ctypes.wintypes.LPVOID),
1119 ('InternalHigh', ctypes.wintypes.LPVOID),
1120 ('Offset', ctypes.wintypes.DWORD),
1121 ('OffsetHigh', ctypes.wintypes.DWORD),
1122 ('hEvent', ctypes.wintypes.HANDLE),
1125 kernel32 = ctypes.windll.kernel32
1126 LockFileEx = kernel32.LockFileEx
1127 LockFileEx.argtypes = [
1128 ctypes.wintypes.HANDLE, # hFile
1129 ctypes.wintypes.DWORD, # dwFlags
1130 ctypes.wintypes.DWORD, # dwReserved
1131 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
1132 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
1133 ctypes.POINTER(OVERLAPPED) # Overlapped
1135 LockFileEx.restype = ctypes.wintypes.BOOL
1136 UnlockFileEx = kernel32.UnlockFileEx
1137 UnlockFileEx.argtypes = [
1138 ctypes.wintypes.HANDLE, # hFile
1139 ctypes.wintypes.DWORD, # dwReserved
1140 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
1141 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
1142 ctypes.POINTER(OVERLAPPED) # Overlapped
1144 UnlockFileEx.restype = ctypes.wintypes.BOOL
1145 whole_low = 0xffffffff
1146 whole_high = 0x7fffffff
def _lock_file(f, exclusive):
    """Lock the whole of file *f* via the Windows LockFileEx API."""
    overlapped = OVERLAPPED()
    overlapped.Offset = 0
    overlapped.OffsetHigh = 0
    overlapped.hEvent = 0
    # Keep the OVERLAPPED struct alive on the file object: the matching
    # UnlockFileEx call must pass the same range/offset information.
    f._lock_file_overlapped_p = ctypes.pointer(overlapped)
    handle = msvcrt.get_osfhandle(f.fileno())
    # dwFlags: 0x2 = LOCKFILE_EXCLUSIVE_LOCK, 0x0 requests a shared lock.
    if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
                      whole_low, whole_high, f._lock_file_overlapped_p):
        raise OSError('Locking file failed: %r' % ctypes.FormatError())

def _unlock_file(f):
    """Release the range locked by _lock_file via UnlockFileEx."""
    assert f._lock_file_overlapped_p
    handle = msvcrt.get_osfhandle(f.fileno())
    if not UnlockFileEx(handle, 0,
                        whole_low, whole_high, f._lock_file_overlapped_p):
        raise OSError('Unlocking file failed: %r' % ctypes.FormatError())
def _lock_file(f, exclusive):
    """Take an advisory flock on *f*: exclusive when writing, shared otherwise."""
    if exclusive:
        fcntl.flock(f, fcntl.LOCK_EX)
    else:
        fcntl.flock(f, fcntl.LOCK_SH)

def _unlock_file(f):
    """Release the advisory flock held on *f*."""
    fcntl.flock(f, fcntl.LOCK_UN)
1176 class locked_file(object):
1177 def __init__(self, filename, mode, encoding=None):
1178 assert mode in ['r', 'a', 'w']
1179 self.f = io.open(filename, mode, encoding=encoding)
1182 def __enter__(self):
1183 exclusive = self.mode != 'r'
1185 _lock_file(self.f, exclusive)
1191 def __exit__(self, etype, value, traceback):
1193 _unlock_file(self.f)
1200 def write(self, *args):
1201 return self.f.write(*args)
1203 def read(self, *args):
1204 return self.f.read(*args)
1207 def shell_quote(args):
1209 encoding = sys.getfilesystemencoding()
1210 if encoding is None:
1213 if isinstance(a, bytes):
1214 # We may get a filename encoded with 'encodeFilename'
1215 a = a.decode(encoding)
1216 quoted_args.append(pipes.quote(a))
1217 return u' '.join(quoted_args)
1220 def takewhile_inclusive(pred, seq):
1221 """ Like itertools.takewhile, but include the latest evaluated element
1222 (the first element so that Not pred(e)) """
def smuggle_url(url, data):
    """Embed *data* (JSON-encoded) in the fragment of *url* for internal use."""
    payload = {u'__youtubedl_smuggle': json.dumps(data)}
    return u'#'.join([url, compat_urllib_parse.urlencode(payload)])
1237 def unsmuggle_url(smug_url, default=None):
1238 if not '#__youtubedl_smuggle' in smug_url:
1239 return smug_url, default
1240 url, _, sdata = smug_url.rpartition(u'#')
1241 jsond = compat_parse_qs(sdata)[u'__youtubedl_smuggle'][0]
1242 data = json.loads(jsond)
1246 def format_bytes(bytes):
1249 if type(bytes) is str:
1250 bytes = float(bytes)
1254 exponent = int(math.log(bytes, 1024.0))
1255 suffix = [u'B', u'KiB', u'MiB', u'GiB', u'TiB', u'PiB', u'EiB', u'ZiB', u'YiB'][exponent]
1256 converted = float(bytes) / float(1024 ** exponent)
1257 return u'%.2f%s' % (converted, suffix)
1260 def get_term_width():
1261 columns = os.environ.get('COLUMNS', None)
1266 sp = subprocess.Popen(
1268 stdout=subprocess.PIPE, stderr=subprocess.PIPE)
1269 out, err = sp.communicate()
1270 return int(out.split()[1])
1276 def month_by_name(name):
1277 """ Return the number of a month by (locale-independently) English name """
1280 u'January', u'February', u'March', u'April', u'May', u'June',
1281 u'July', u'August', u'September', u'October', u'November', u'December']
1283 return ENGLISH_NAMES.index(name) + 1
1288 def fix_xml_ampersands(xml_str):
1289 """Replace all the '&' by '&' in XML"""
1291 r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
# Set the process title via libc prctl() on glibc systems; silently a no-op
# when libc lacks prctl.  The try: around LoadLibrary/prctl and its OSError
# handling are partly elided in this excerpt.
1296 def setproctitle(title):
1297 assert isinstance(title, compat_str)
1299 libc = ctypes.cdll.LoadLibrary("libc.so.6")
1302 title_bytes = title.encode('utf-8')
1303 buf = ctypes.create_string_buffer(len(title_bytes))
1304 buf.value = title_bytes
# 15 == PR_SET_NAME
1306 libc.prctl(15, buf, 0, 0, 0)
1307 except AttributeError:
1308 return # Strange libc, just skip this
def remove_start(s, start):
    """Return *s* with the prefix *start* removed.

    Fix: the visible code only returned on a match, so a string NOT
    beginning with *start* fell through and yielded None; now the original
    string is returned unchanged in that case.
    """
    if s.startswith(start):
        return s[len(start):]
    return s
def remove_end(s, end):
    """Return *s* with the suffix *end* removed.

    Fixes two defects in the visible code: it sliced unconditionally (so a
    non-matching suffix was still chopped off), and an empty *end* produced
    s[:-0] == '' instead of leaving the string intact.
    """
    if end and s.endswith(end):
        return s[:-len(end)]
    return s
def url_basename(url):
    """Return the last path segment of *url* ('' when the path is empty)."""
    parsed_path = compat_urlparse.urlparse(url).path
    segments = parsed_path.strip(u'/').split(u'/')
    return segments[-1]
# Request subclass whose get_method() forces the HTTP HEAD verb; the
# 'return "HEAD"' body line is elided in this excerpt.
1328 class HEADRequest(compat_urllib_request.Request):
1329 def get_method(self):
# Lenient int conversion: optionally dereference get_attr on v first (the
# guard lines around getattr are elided in this excerpt), then scale the
# result by invscale/scale; returns default when v ends up None.
1333 def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
1336 v = getattr(v, get_attr, None)
# Floor division keeps the result an int.
1339 return default if v is None else (int(v) * invscale // scale)
def str_or_none(v, default=None):
    """Stringify *v* via compat_str, or return *default* when it is None."""
    if v is None:
        return default
    return compat_str(v)
1346 def str_to_int(int_str):
1347 """ A more relaxed version of int_or_none """
# Strip thousands separators (commas, dots) and '+' signs before conversion;
# the None-guard and the final int() return are elided in this excerpt.
1350 int_str = re.sub(r'[,\.\+]', u'', int_str)
def float_or_none(v, scale=1, invscale=1, default=None):
    """Convert *v* to float scaled by invscale/scale; *default* if v is None."""
    if v is None:
        return default
    return float(v) * invscale / scale
# Parse durations like '1:23:45', '12 mins 3 s' or '90s' into seconds; the
# None/no-match guards and the final return are elided in this excerpt.
1358 def parse_duration(s):
1365 r'(?i)(?:(?:(?P<hours>[0-9]+)\s*(?:[:h]|hours?)\s*)?(?P<mins>[0-9]+)\s*(?:[:m]|mins?|minutes?)\s*)?(?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*(?:s|secs?|seconds?)?$', s)
# Accumulate: seconds first, then minutes and hours scaled into seconds.
1368 res = int(m.group('secs'))
1370 res += int(m.group('mins')) * 60
1371 if m.group('hours'):
1372 res += int(m.group('hours')) * 60 * 60
# Fractional seconds (the '.xxx' group) make the result a float.
1374 res += float(m.group('ms'))
def prepend_extension(filename, ext):
    """Insert *ext* before the real extension: 'a.mp4' + 'temp' -> 'a.temp.mp4'."""
    base, real_ext = os.path.splitext(filename)
    return base + u'.' + ext + real_ext
1383 def check_executable(exe, args=[]):
1384 """ Checks if the given binary is installed somewhere in PATH, and returns its name.
1385 args can be a list of arguments for a short output (like -version) """
# NOTE(review): the mutable default `args=[]` is only read, never mutated, so
# it is harmless here.  The try/except around Popen (returning False when the
# binary cannot be launched) is elided in this excerpt.
1387 subprocess.Popen([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
# Abstract base for lazily-fetched paged collections; subclasses implement
# getslice().  The `def __len__(self):` line is elided in this excerpt.
1393 class PagedList(object):
1395 # This is only useful for tests
1396 return len(self.getslice())
# PagedList backed by a pagefunc(pagenum) callable that is queried on demand,
# page by page.  Several scaffolding lines of getslice() (res = [], break
# statements, startv/endv assignment heads) are elided in this excerpt.
1399 class OnDemandPagedList(PagedList):
1400 def __init__(self, pagefunc, pagesize):
1401 self._pagefunc = pagefunc
1402 self._pagesize = pagesize
# Collect the half-open element range [start, end) across pages.
1404 def getslice(self, start=0, end=None):
1406 for pagenum in itertools.count(start // self._pagesize):
1407 firstid = pagenum * self._pagesize
1408 nextfirstid = pagenum * self._pagesize + self._pagesize
1409 if start >= nextfirstid:
1412 page_results = list(self._pagefunc(pagenum))
# Offsets of the requested range within the current page.
1415 start % self._pagesize
1416 if firstid <= start < nextfirstid
1420 ((end - 1) % self._pagesize) + 1
1421 if (end is not None and firstid <= end <= nextfirstid)
1424 if startv != 0 or endv is not None:
1425 page_results = page_results[startv:endv]
1426 res.extend(page_results)
1428 # A little optimization - if current page is not "full", ie. does
1429 # not contain page_size videos then we can assume that this page
1430 # is the last one - there are no more ids on further pages -
1431 # i.e. no need to query again.
1432 if len(page_results) + startv < self._pagesize:
1435 # If we got the whole page, but the next page is not interesting,
1436 # break out early as well
1437 if end == nextfirstid:
# PagedList variant where the total page count is known up front, so only the
# pages overlapping the requested range are fetched.  Some lines of
# getslice() (res = [], end_page head, break/yield scaffolding) are elided in
# this excerpt.
1442 class InAdvancePagedList(PagedList):
1443 def __init__(self, pagefunc, pagecount, pagesize):
1444 self._pagefunc = pagefunc
1445 self._pagecount = pagecount
1446 self._pagesize = pagesize
1448 def getslice(self, start=0, end=None):
1450 start_page = start // self._pagesize
1452 self._pagecount if end is None else (end // self._pagesize + 1))
# Elements to drop from the first page, and how many more are still wanted.
1453 skip_elems = start - start_page * self._pagesize
1454 only_more = None if end is None else end - start
1455 for pagenum in range(start_page, end_page):
1456 page = list(self._pagefunc(pagenum))
1458 page = page[skip_elems:]
1460 if only_more is not None:
1461 if len(page) < only_more:
1462 only_more -= len(page)
1464 page = page[:only_more]
# Decode literal '\UXXXXXXXX' escape sequences inside s; the enclosing
# re.sub(...) call head and the trailing `s)` argument are elided in this
# excerpt.
1471 def uppercase_escape(s):
1472 unicode_escape = codecs.getdecoder('unicode_escape')
1474 r'\\U[0-9a-fA-F]{8}',
1475 lambda m: unicode_escape(m.group(0))[0],
def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986"""
    # On Python 2, quote() needs bytes.  The outer version check must stay
    # first so the name `unicode` is never evaluated on Python 3.
    if sys.version_info < (3, 0):
        if isinstance(s, unicode):
            s = s.encode('utf-8')
    return compat_urllib_parse.quote(s, "%/;:@&=+$,!~*'()?#[]")
1486 def escape_url(url):
1487 """Escape URL as suggested by RFC 3986"""
1488 url_parsed = compat_urllib_parse_urlparse(url)
# Escape each component separately so structural separators ('/', '?', '#')
# survive; the closing ').geturl()' is elided in this excerpt.
1489 return url_parsed._replace(
1490 path=escape_rfc3986(url_parsed.path),
1491 params=escape_rfc3986(url_parsed.params),
1492 query=escape_rfc3986(url_parsed.query),
1493 fragment=escape_rfc3986(url_parsed.fragment)
# Feature probe: on Python 2.6 (and some 2.7 builds) struct.pack rejects
# unicode format strings, so wrapped versions encode the spec to bytes first.
# The try:/except TypeError:/else: scaffolding is partly elided in this
# excerpt.
1497 struct.pack(u'!I', 0)
1499 # In Python 2.6 (and some 2.7 versions), struct requires a bytes argument
1500 def struct_pack(spec, *args):
1501 if isinstance(spec, compat_str):
1502 spec = spec.encode('ascii')
1503 return struct.pack(spec, *args)
1505 def struct_unpack(spec, *args):
1506 if isinstance(spec, compat_str):
1507 spec = spec.encode('ascii')
1508 return struct.unpack(spec, *args)
# Modern interpreters: alias the struct module functions directly.
1510 struct_pack = struct.pack
1511 struct_unpack = struct.unpack
# Read URLs from a file-like object (--batch-file): decode bytes, strip a
# BOM, and drop comment lines.  The inner `def fixup(url):` line, the strip()
# call and the comment-line return are elided in this excerpt.
1514 def read_batch_urls(batch_fd):
1516 if not isinstance(url, compat_str):
1517 url = url.decode('utf-8', 'replace')
# NOTE(review): this is the UTF-8 BOM as mis-decoded single-byte characters
# (u'\xef\xbb\xbf'), i.e. what a wrongly-decoded BOM-prefixed line starts with.
1518 BOM_UTF8 = u'\xef\xbb\xbf'
1519 if url.startswith(BOM_UTF8):
1520 url = url[len(BOM_UTF8):]
# Lines starting with '#', ';' or ']' are treated as comments.
1522 if url.startswith(('#', ';', ']')):
1526 with contextlib.closing(batch_fd) as fd:
1527 return [url for url in map(fixup, fd) if url]
def urlencode_postdata(*args, **kargs):
    """URL-encode POST data and return it as ASCII bytes for urlopen()."""
    encoded = compat_urllib_parse.urlencode(*args, **kargs)
    return encoded.encode('ascii')
# Element.iter does not exist on Python <= 2.6; fall back to findall('.//*').
# The opening try: line is elided in this excerpt.
1535 etree_iter = xml.etree.ElementTree.Element.iter
1536 except AttributeError: # Python <=2.6
1537 etree_iter = lambda n: n.findall('.//*')
# Interior of parse_xml(s) — the `def parse_xml(s):` line is elided in this
# excerpt.  Parses an XML string while ignoring doctypes and normalising
# element text to unicode on Python 2.x.
1541 class TreeBuilder(xml.etree.ElementTree.TreeBuilder):
1542 def doctype(self, name, pubid, system):
1543 pass # Ignore doctypes
1545 parser = xml.etree.ElementTree.XMLParser(target=TreeBuilder())
# Passing a custom parser to XML() only works on >= 2.7.
1546 kwargs = {'parser': parser} if sys.version_info >= (2, 7) else {}
1547 tree = xml.etree.ElementTree.XML(s.encode('utf-8'), **kwargs)
1548 # Fix up XML parser in Python 2.x
1549 if sys.version_info < (3, 0):
1550 for n in etree_iter(tree):
1551 if n.text is not None:
1552 if not isinstance(n.text, compat_str):
1553 n.text = n.text.decode('utf-8')
# On Windows/Python 2 getpass chokes on unicode prompts, so encode the prompt
# first; everywhere else alias getpass directly.  The `else:` line pairing
# the plain alias is elided in this excerpt.
1557 if sys.version_info < (3, 0) and sys.platform == 'win32':
1558 def compat_getpass(prompt, *args, **kwargs):
1559 if isinstance(prompt, compat_str):
1560 prompt = prompt.encode(preferredencoding())
1561 return getpass.getpass(prompt, *args, **kwargs)
1563 compat_getpass = getpass.getpass
def strip_jsonp(code):
    """Strip a JSONP callback wrapper, returning the inner payload unchanged
    when no wrapper is present."""
    callback_re = r'(?s)^[a-zA-Z0-9_]+\s*\(\s*(.*)\);?\s*?\s*$'
    return re.sub(callback_re, r'\1', code)
# Convert a JavaScript object literal into strict JSON: requote
# single-quoted keys/values and drop trailing commas.  Several lines (the
# fix_kv def head, key/value group extraction, parts of the verbose regex and
# the final return) are elided in this excerpt.
1579 def js_to_json(code):
1582 if key.startswith("'"):
1583 assert key.endswith("'")
1584 assert '"' not in key
# Requote a single-quoted key with double quotes.
1585 key = '"%s"' % key[1:-1]
1586 elif not key.startswith('"'):
1590 if value.startswith("'"):
1591 assert value.endswith("'")
1592 assert '"' not in value
1593 value = '"%s"' % value[1:-1]
1595 return m.group(1) + key + m.group(3) + value
1597 res = re.sub(r'''(?x)
1599 ("[^"]*"|\'[^\']*\'|[a-z0-9A-Z]+)
1601 ([0-9.]+|true|false|"[^"]*"|\'[^\']*\'|\[|\{)
# Remove a trailing comma before a closing bracket.
1603 res = re.sub(r',(\s*\])', lambda m: m.group(1), res)
1607 def qualities(quality_ids):
1608 """ Get a numeric quality value out of a list of possible values """
# Returns a closure mapping a quality id to its position in quality_ids; the
# inner `def q(qid):` line and the final `return q` are elided in this
# excerpt.
1611 return quality_ids.index(qid)
# Default output filename template used when the user supplies no -o option.
1617 DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
# Backport of subprocess.check_output for Python 2.6.  The opening try:, the
# `ret = p.wait()` / `if ret:` lines and the final `return output` are elided
# in this excerpt.
1620 subprocess_check_output = subprocess.check_output
1621 except AttributeError:
1622 def subprocess_check_output(*args, **kwargs):
# The backport does not support check_output's `input` keyword.
1623 assert 'input' not in kwargs
1624 p = subprocess.Popen(*args, stdout=subprocess.PIPE, **kwargs)
1625 output, _ = p.communicate()
1628 raise subprocess.CalledProcessError(ret, p.args, output=output)
1632 def limit_length(s, length):
1633 """ Add ellipses to overly long strings """
# Truncate so the result including the ellipsis marker fits within `length`
# characters; the None/short-string guards and the ELLIPSES constant are
# elided in this excerpt.
1638 return s[:length - len(ELLIPSES)] + ELLIPSES