2 # -*- coding: utf-8 -*-
29 import xml.etree.ElementTree
33 import urllib.request as compat_urllib_request
34 except ImportError: # Python 2
35 import urllib2 as compat_urllib_request
38 import urllib.error as compat_urllib_error
39 except ImportError: # Python 2
40 import urllib2 as compat_urllib_error
43 import urllib.parse as compat_urllib_parse
44 except ImportError: # Python 2
45 import urllib as compat_urllib_parse
48 from urllib.parse import urlparse as compat_urllib_parse_urlparse
49 except ImportError: # Python 2
50 from urlparse import urlparse as compat_urllib_parse_urlparse
53 import urllib.parse as compat_urlparse
54 except ImportError: # Python 2
55 import urlparse as compat_urlparse
58 import http.cookiejar as compat_cookiejar
59 except ImportError: # Python 2
60 import cookielib as compat_cookiejar
63 import html.entities as compat_html_entities
64 except ImportError: # Python 2
65 import htmlentitydefs as compat_html_entities
68 import html.parser as compat_html_parser
69 except ImportError: # Python 2
70 import HTMLParser as compat_html_parser
73 import http.client as compat_http_client
74 except ImportError: # Python 2
75 import httplib as compat_http_client
78 from urllib.error import HTTPError as compat_HTTPError
79 except ImportError: # Python 2
80 from urllib2 import HTTPError as compat_HTTPError
83 from urllib.request import urlretrieve as compat_urlretrieve
84 except ImportError: # Python 2
85 from urllib import urlretrieve as compat_urlretrieve
89 from subprocess import DEVNULL
90 compat_subprocess_get_DEVNULL = lambda: DEVNULL
92 compat_subprocess_get_DEVNULL = lambda: open(os.path.devnull, 'w')
95 from urllib.parse import unquote as compat_urllib_parse_unquote
97 def compat_urllib_parse_unquote(string, encoding='utf-8', errors='replace'):
100 res = string.split('%')
107 # pct_sequence: contiguous sequence of percent-encoded bytes, decoded
114 pct_sequence += item[:2].decode('hex')
117 # This segment was just a single percent-encoded character.
118 # May be part of a sequence of code units, so delay decoding.
119 # (Stored in pct_sequence).
123 # Encountered non-percent-encoded characters. Flush the current
125 string += pct_sequence.decode(encoding, errors) + rest
128 # Flush the final pct_sequence
129 string += pct_sequence.decode(encoding, errors)
134 from urllib.parse import parse_qs as compat_parse_qs
135 except ImportError: # Python 2
136 # HACK: The following is the correct parse_qs implementation from cpython 3's stdlib.
137 # Python 2's version is apparently totally broken
139 def _parse_qsl(qs, keep_blank_values=False, strict_parsing=False,
140 encoding='utf-8', errors='replace'):
141 qs, _coerce_result = qs, unicode
142 pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')]
144 for name_value in pairs:
145 if not name_value and not strict_parsing:
147 nv = name_value.split('=', 1)
150 raise ValueError("bad query field: %r" % (name_value,))
151 # Handle case of a control-name with no equal sign
152 if keep_blank_values:
156 if len(nv[1]) or keep_blank_values:
157 name = nv[0].replace('+', ' ')
158 name = compat_urllib_parse_unquote(
159 name, encoding=encoding, errors=errors)
160 name = _coerce_result(name)
161 value = nv[1].replace('+', ' ')
162 value = compat_urllib_parse_unquote(
163 value, encoding=encoding, errors=errors)
164 value = _coerce_result(value)
165 r.append((name, value))
168 def compat_parse_qs(qs, keep_blank_values=False, strict_parsing=False,
169 encoding='utf-8', errors='replace'):
171 pairs = _parse_qsl(qs, keep_blank_values, strict_parsing,
172 encoding=encoding, errors=errors)
173 for name, value in pairs:
174 if name in parsed_result:
175 parsed_result[name].append(value)
177 parsed_result[name] = [value]
181 compat_str = unicode # Python 2
186 compat_chr = unichr # Python 2
191 from xml.etree.ElementTree import ParseError as compat_xml_parse_error
192 except ImportError: # Python 2.6
193 from xml.parsers.expat import ExpatError as compat_xml_parse_error
196 from shlex import quote as shlex_quote
197 except ImportError: # Python < 3.3
199 return "'" + s.replace("'", "'\"'\"'") + "'"
203 if type(c) is int: return c
206 # This is not clearly defined otherwise
207 compiled_regex_type = type(re.compile(''))
210 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0 (Chrome)',
211 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
212 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
213 'Accept-Encoding': 'gzip, deflate',
214 'Accept-Language': 'en-us,en;q=0.5',
217 def preferredencoding():
218 """Get preferred encoding.
220 Returns the best encoding scheme for the system, based on
221 locale.getpreferredencoding() and some further tweaks.
224 pref = locale.getpreferredencoding()
231 if sys.version_info < (3,0):
233 print(s.encode(preferredencoding(), 'xmlcharrefreplace'))
236 assert type(s) == type(u'')
240 def write_json_file(obj, fn):
241 """ Encode obj as JSON and write it to fn, atomically """
245 'prefix': os.path.basename(fn) + '.',
246 'dir': os.path.dirname(fn),
250 # In Python 2.x, json.dump expects a bytestream.
251 # In Python 3.x, it writes to a character stream
252 if sys.version_info < (3, 0):
260 tf = tempfile.NamedTemporaryFile(**args)
265 os.rename(tf.name, fn)
274 if sys.version_info >= (2, 7):
275 def find_xpath_attr(node, xpath, key, val):
276 """ Find the xpath xpath[@key=val] """
277 assert re.match(r'^[a-zA-Z-]+$', key)
278 assert re.match(r'^[a-zA-Z0-9@\s:._-]*$', val)
279 expr = xpath + u"[@%s='%s']" % (key, val)
280 return node.find(expr)
282 def find_xpath_attr(node, xpath, key, val):
283 # Here comes the crazy part: In 2.6, if the xpath is a unicode,
284 # .//node does not match if a node is a direct child of . !
285 if isinstance(xpath, unicode):
286 xpath = xpath.encode('ascii')
288 for f in node.findall(xpath):
289 if f.attrib.get(key) == val:
293 # On python2.6 the xml.etree.ElementTree.Element methods don't support
294 # the namespace parameter
295 def xpath_with_ns(path, ns_map):
296 components = [c.split(':') for c in path.split('/')]
300 replaced.append(c[0])
303 replaced.append('{%s}%s' % (ns_map[ns], tag))
304 return '/'.join(replaced)
307 def xpath_text(node, xpath, name=None, fatal=False):
308 if sys.version_info < (2, 7): # Crazy 2.6
309 xpath = xpath.encode('ascii')
314 name = xpath if name is None else name
315 raise ExtractorError('Could not find XML element %s' % name)
321 compat_html_parser.locatestarttagend = re.compile(r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re.VERBOSE) # backport bugfix
322 class BaseHTMLParser(compat_html_parser.HTMLParser):
324 compat_html_parser.HTMLParser.__init__(self)
327 def loads(self, html):
332 class AttrParser(BaseHTMLParser):
333 """Modified HTMLParser that isolates a tag with the specified attribute"""
334 def __init__(self, attribute, value):
335 self.attribute = attribute
340 self.watch_startpos = False
342 BaseHTMLParser.__init__(self)
344 def error(self, message):
345 if self.error_count > 10 or self.started:
346 raise compat_html_parser.HTMLParseError(message, self.getpos())
347 self.rawdata = '\n'.join(self.html.split('\n')[self.getpos()[0]:]) # skip one line
348 self.error_count += 1
351 def handle_starttag(self, tag, attrs):
354 self.find_startpos(None)
355 if self.attribute in attrs and attrs[self.attribute] == self.value:
358 self.watch_startpos = True
360 if not tag in self.depth: self.depth[tag] = 0
363 def handle_endtag(self, tag):
365 if tag in self.depth: self.depth[tag] -= 1
366 if self.depth[self.result[0]] == 0:
368 self.result.append(self.getpos())
370 def find_startpos(self, x):
371 """Needed to put the start position of the result (self.result[1])
372 after the opening tag with the requested id"""
373 if self.watch_startpos:
374 self.watch_startpos = False
375 self.result.append(self.getpos())
376 handle_entityref = handle_charref = handle_data = handle_comment = \
377 handle_decl = handle_pi = unknown_decl = find_startpos
379 def get_result(self):
380 if self.result is None:
382 if len(self.result) != 3:
384 lines = self.html.split('\n')
385 lines = lines[self.result[1][0]-1:self.result[2][0]]
386 lines[0] = lines[0][self.result[1][1]:]
388 lines[-1] = lines[-1][:self.result[2][1]-self.result[1][1]]
389 lines[-1] = lines[-1][:self.result[2][1]]
390 return '\n'.join(lines).strip()
391 # Hack for https://github.com/rg3/youtube-dl/issues/662
392 if sys.version_info < (2, 7, 3):
393 AttrParser.parse_endtag = (lambda self, i:
394 i + len("</scr'+'ipt>")
395 if self.rawdata[i:].startswith("</scr'+'ipt>")
396 else compat_html_parser.HTMLParser.parse_endtag(self, i))
def get_element_by_id(id, html):
    """Return the content of the tag carrying the given ID attribute.

    Thin convenience wrapper around get_element_by_attribute().
    """
    return get_element_by_attribute("id", id, html)
402 def get_element_by_attribute(attribute, value, html):
403 """Return the content of the tag with the specified attribute in the passed HTML document"""
404 parser = AttrParser(attribute, value)
407 except compat_html_parser.HTMLParseError:
409 return parser.get_result()
411 class MetaParser(BaseHTMLParser):
413 Modified HTMLParser that isolates a meta tag with the specified name
416 def __init__(self, name):
417 BaseHTMLParser.__init__(self)
422 def handle_starttag(self, tag, attrs):
426 if attrs.get('name') == self.name:
427 self.result = attrs.get('content')
429 def get_result(self):
432 def get_meta_content(name, html):
434 Return the content attribute from the meta tag with the given name attribute.
436 parser = MetaParser(name)
439 except compat_html_parser.HTMLParseError:
441 return parser.get_result()
444 def clean_html(html):
445 """Clean an HTML snippet into a readable string"""
447 html = html.replace('\n', ' ')
448 html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
449 html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
451 html = re.sub('<.*?>', '', html)
452 # Replace html entities
453 html = unescapeHTML(html)
457 def sanitize_open(filename, open_mode):
458 """Try to open the given filename, and slightly tweak it if this fails.
460 Attempts to open the given filename. If this fails, it tries to change
461 the filename slightly, step by step, until it's either able to open it
462 or it fails and raises a final exception, like the standard open()
465 It returns the tuple (stream, definitive_file_name).
469 if sys.platform == 'win32':
471 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
472 return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
473 stream = open(encodeFilename(filename), open_mode)
474 return (stream, filename)
475 except (IOError, OSError) as err:
476 if err.errno in (errno.EACCES,):
479 # In case of error, try to remove win32 forbidden chars
480 alt_filename = os.path.join(
481 re.sub(u'[/<>:"\\|\\\\?\\*]', u'#', path_part)
482 for path_part in os.path.split(filename)
484 if alt_filename == filename:
487 # An exception here should be caught in the caller
488 stream = open(encodeFilename(filename), open_mode)
489 return (stream, alt_filename)
492 def timeconvert(timestr):
493 """Convert RFC 2822 defined time string into system timestamp"""
495 timetuple = email.utils.parsedate_tz(timestr)
496 if timetuple is not None:
497 timestamp = email.utils.mktime_tz(timetuple)
500 def sanitize_filename(s, restricted=False, is_id=False):
501 """Sanitizes a string so it could be used as part of a filename.
502 If restricted is set, use a stricter subset of allowed characters.
503 Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
505 def replace_insane(char):
506 if char == '?' or ord(char) < 32 or ord(char) == 127:
509 return '' if restricted else '\''
511 return '_-' if restricted else ' -'
512 elif char in '\\/|*<>':
514 if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
516 if restricted and ord(char) > 127:
520 result = u''.join(map(replace_insane, s))
522 while '__' in result:
523 result = result.replace('__', '_')
524 result = result.strip('_')
525 # Common case of "Foreign band name - English song title"
526 if restricted and result.startswith('-_'):
532 def orderedSet(iterable):
533 """ Remove all duplicates from the input iterable """
541 def _htmlentity_transform(entity):
542 """Transforms an HTML entity to a character."""
543 # Known non-numeric HTML entity
544 if entity in compat_html_entities.name2codepoint:
545 return compat_chr(compat_html_entities.name2codepoint[entity])
547 mobj = re.match(r'#(x?[0-9]+)', entity)
549 numstr = mobj.group(1)
550 if numstr.startswith(u'x'):
552 numstr = u'0%s' % numstr
555 return compat_chr(int(numstr, base))
557 # Unknown entity in name, return its literal representation
558 return (u'&%s;' % entity)
564 assert type(s) == compat_str
567 r'&([^;]+);', lambda m: _htmlentity_transform(m.group(1)), s)
570 def encodeFilename(s, for_subprocess=False):
572 @param s The name of the file
575 assert type(s) == compat_str
577 # Python 3 has a Unicode API
578 if sys.version_info >= (3, 0):
581 if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
582 # Pass u'' directly to use Unicode APIs on Windows 2000 and up
583 # (Detecting Windows NT 4 is tricky because 'major >= 4' would
584 # match Windows 9x series as well. Besides, NT 4 is obsolete.)
585 if not for_subprocess:
588 # For subprocess calls, encode with locale encoding
589 # Refer to http://stackoverflow.com/a/9951851/35070
590 encoding = preferredencoding()
592 encoding = sys.getfilesystemencoding()
595 return s.encode(encoding, 'ignore')
598 def encodeArgument(s):
599 if not isinstance(s, compat_str):
600 # Legacy code that uses byte strings
601 # Uncomment the following line after fixing all post processors
602 #assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
603 s = s.decode('ascii')
604 return encodeFilename(s, True)
607 def decodeOption(optval):
610 if isinstance(optval, bytes):
611 optval = optval.decode(preferredencoding())
613 assert isinstance(optval, compat_str)
616 def formatSeconds(secs):
618 return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
620 return '%d:%02d' % (secs // 60, secs % 60)
625 def make_HTTPS_handler(opts_no_check_certificate, **kwargs):
626 if sys.version_info < (3, 2):
629 class HTTPSConnectionV3(httplib.HTTPSConnection):
630 def __init__(self, *args, **kwargs):
631 httplib.HTTPSConnection.__init__(self, *args, **kwargs)
634 sock = socket.create_connection((self.host, self.port), self.timeout)
635 if getattr(self, '_tunnel_host', False):
639 self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_TLSv1)
641 self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_SSLv23)
643 class HTTPSHandlerV3(compat_urllib_request.HTTPSHandler):
644 def https_open(self, req):
645 return self.do_open(HTTPSConnectionV3, req)
646 return HTTPSHandlerV3(**kwargs)
647 elif hasattr(ssl, 'create_default_context'): # Python >= 3.4
648 context = ssl.create_default_context(ssl.Purpose.CLIENT_AUTH)
649 context.options &= ~ssl.OP_NO_SSLv3 # Allow older, not-as-secure SSLv3
650 if opts_no_check_certificate:
651 context.verify_mode = ssl.CERT_NONE
652 return compat_urllib_request.HTTPSHandler(context=context, **kwargs)
654 context = ssl.SSLContext(ssl.PROTOCOL_SSLv23)
655 context.verify_mode = (ssl.CERT_NONE
656 if opts_no_check_certificate
657 else ssl.CERT_REQUIRED)
658 context.set_default_verify_paths()
660 context.load_default_certs()
661 except AttributeError:
663 return compat_urllib_request.HTTPSHandler(context=context, **kwargs)
665 class ExtractorError(Exception):
666 """Error during info extraction."""
667 def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
668 """ tb, if given, is the original traceback (so that it can be printed out).
669 If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
672 if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
674 if video_id is not None:
675 msg = video_id + ': ' + msg
677 msg += u' (caused by %r)' % cause
679 msg = msg + u'; please report this issue on https://yt-dl.org/bug . Be sure to call youtube-dl with the --verbose flag and include its complete output. Make sure you are using the latest version; type youtube-dl -U to update.'
680 super(ExtractorError, self).__init__(msg)
683 self.exc_info = sys.exc_info() # preserve original exception
685 self.video_id = video_id
687 def format_traceback(self):
688 if self.traceback is None:
690 return u''.join(traceback.format_tb(self.traceback))
class RegexNotFoundError(ExtractorError):
    """Raised when an expected regular expression failed to match."""
    pass
698 class DownloadError(Exception):
699 """Download Error exception.
701 This exception may be thrown by FileDownloader objects if they are not
702 configured to continue on errors. They will contain the appropriate
705 def __init__(self, msg, exc_info=None):
706 """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
707 super(DownloadError, self).__init__(msg)
708 self.exc_info = exc_info
711 class SameFileError(Exception):
712 """Same File exception.
714 This exception will be thrown by FileDownloader objects if they detect
715 multiple files would have to be downloaded to the same file on disk.
720 class PostProcessingError(Exception):
721 """Post Processing exception.
723 This exception may be raised by PostProcessor's .run() method to
724 indicate an error in the postprocessing task.
726 def __init__(self, msg):
729 class MaxDownloadsReached(Exception):
730 """ --max-downloads limit has been reached. """
734 class UnavailableVideoError(Exception):
735 """Unavailable Format exception.
737 This exception will be thrown when a video is requested
738 in a format that is not available for that video.
743 class ContentTooShortError(Exception):
744 """Content Too Short exception.
746 This exception may be raised by FileDownloader objects when a file they
747 download is too small for what the server announced first, indicating
748 the connection was probably interrupted.
754 def __init__(self, downloaded, expected):
755 self.downloaded = downloaded
756 self.expected = expected
758 class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
759 """Handler for HTTP requests and responses.
761 This class, when installed with an OpenerDirector, automatically adds
762 the standard headers to every HTTP request and handles gzipped and
763 deflated responses from web servers. If compression is to be avoided in
764 a particular request, the original request in the program code only has
765 to include the HTTP header "Youtubedl-No-Compression", which will be
766 removed before making the real request.
768 Part of this code was copied from:
770 http://techknack.net/python-urllib2-handlers/
772 Andrew Rowls, the author of that code, agreed to release it to the
779 return zlib.decompress(data, -zlib.MAX_WBITS)
781 return zlib.decompress(data)
784 def addinfourl_wrapper(stream, headers, url, code):
785 if hasattr(compat_urllib_request.addinfourl, 'getcode'):
786 return compat_urllib_request.addinfourl(stream, headers, url, code)
787 ret = compat_urllib_request.addinfourl(stream, headers, url)
791 def http_request(self, req):
792 for h, v in std_headers.items():
793 if h not in req.headers:
795 if 'Youtubedl-no-compression' in req.headers:
796 if 'Accept-encoding' in req.headers:
797 del req.headers['Accept-encoding']
798 del req.headers['Youtubedl-no-compression']
799 if 'Youtubedl-user-agent' in req.headers:
800 if 'User-agent' in req.headers:
801 del req.headers['User-agent']
802 req.headers['User-agent'] = req.headers['Youtubedl-user-agent']
803 del req.headers['Youtubedl-user-agent']
805 if sys.version_info < (2, 7) and '#' in req.get_full_url():
806 # Python 2.6 is brain-dead when it comes to fragments
807 req._Request__original = req._Request__original.partition('#')[0]
808 req._Request__r_type = req._Request__r_type.partition('#')[0]
812 def http_response(self, req, resp):
815 if resp.headers.get('Content-encoding', '') == 'gzip':
816 content = resp.read()
817 gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
819 uncompressed = io.BytesIO(gz.read())
820 except IOError as original_ioerror:
821 # There may be junk add the end of the file
822 # See http://stackoverflow.com/q/4928560/35070 for details
823 for i in range(1, 1024):
825 gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
826 uncompressed = io.BytesIO(gz.read())
831 raise original_ioerror
832 resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
833 resp.msg = old_resp.msg
835 if resp.headers.get('Content-encoding', '') == 'deflate':
836 gz = io.BytesIO(self.deflate(resp.read()))
837 resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
838 resp.msg = old_resp.msg
841 https_request = http_request
842 https_response = http_response
845 def parse_iso8601(date_str, delimiter='T'):
846 """ Return a UNIX timestamp from the given date """
852 r'Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$',
855 timezone = datetime.timedelta()
857 date_str = date_str[:-len(m.group(0))]
858 if not m.group('sign'):
859 timezone = datetime.timedelta()
861 sign = 1 if m.group('sign') == '+' else -1
862 timezone = datetime.timedelta(
863 hours=sign * int(m.group('hours')),
864 minutes=sign * int(m.group('minutes')))
865 date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
866 dt = datetime.datetime.strptime(date_str, date_format) - timezone
867 return calendar.timegm(dt.timetuple())
870 def unified_strdate(date_str):
871 """Return a string with the date in the format YYYYMMDD"""
878 date_str = date_str.replace(',', ' ')
879 # %z (UTC offset) is only supported in python>=3.2
880 date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str)
881 format_expressions = [
886 '%b %dst %Y %I:%M%p',
887 '%b %dnd %Y %I:%M%p',
888 '%b %dth %Y %I:%M%p',
899 '%Y-%m-%dT%H:%M:%SZ',
900 '%Y-%m-%dT%H:%M:%S.%fZ',
901 '%Y-%m-%dT%H:%M:%S.%f0Z',
903 '%Y-%m-%dT%H:%M:%S.%f',
906 for expression in format_expressions:
908 upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
911 if upload_date is None:
912 timetuple = email.utils.parsedate_tz(date_str)
914 upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
917 def determine_ext(url, default_ext=u'unknown_video'):
920 guess = url.partition(u'?')[0].rpartition(u'.')[2]
921 if re.match(r'^[A-Za-z0-9]+$', guess):
def subtitles_filename(filename, sub_lang, sub_format):
    """Build a subtitle file name of the form <base>.<lang>.<format>."""
    base = filename.rsplit('.', 1)[0]
    return u'.'.join((base, sub_lang, sub_format))
929 def date_from_str(date_str):
931 Return a datetime object from a string in the format YYYYMMDD or
932 (now|today)[+-][0-9](day|week|month|year)(s)?"""
933 today = datetime.date.today()
934 if date_str == 'now'or date_str == 'today':
936 match = re.match('(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
937 if match is not None:
938 sign = match.group('sign')
939 time = int(match.group('time'))
942 unit = match.group('unit')
951 delta = datetime.timedelta(**{unit: time})
953 return datetime.datetime.strptime(date_str, "%Y%m%d").date()
955 def hyphenate_date(date_str):
957 Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
958 match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
959 if match is not None:
960 return '-'.join(match.groups())
964 class DateRange(object):
965 """Represents a time interval between two dates"""
966 def __init__(self, start=None, end=None):
967 """start and end must be strings in the format accepted by date"""
968 if start is not None:
969 self.start = date_from_str(start)
971 self.start = datetime.datetime.min.date()
973 self.end = date_from_str(end)
975 self.end = datetime.datetime.max.date()
976 if self.start > self.end:
977 raise ValueError('Date range: "%s" , the start date must be before the end date' % self)
980 """Returns a range that only contains the given day"""
982 def __contains__(self, date):
983 """Check if the date is in the range"""
984 if not isinstance(date, datetime.date):
985 date = date_from_str(date)
986 return self.start <= date <= self.end
988 return '%s - %s' % ( self.start.isoformat(), self.end.isoformat())
992 """ Returns the platform name as a compat_str """
993 res = platform.platform()
994 if isinstance(res, bytes):
995 res = res.decode(preferredencoding())
997 assert isinstance(res, compat_str)
1001 def _windows_write_string(s, out):
1002 """ Returns True if the string was written using special methods,
1003 False if it has yet to be written out."""
1004 # Adapted from http://stackoverflow.com/a/3259271/35070
1007 import ctypes.wintypes
1015 fileno = out.fileno()
1016 except AttributeError:
1017 # If the output stream doesn't have a fileno, it's virtual
1019 if fileno not in WIN_OUTPUT_IDS:
1022 GetStdHandle = ctypes.WINFUNCTYPE(
1023 ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
1024 ("GetStdHandle", ctypes.windll.kernel32))
1025 h = GetStdHandle(WIN_OUTPUT_IDS[fileno])
1027 WriteConsoleW = ctypes.WINFUNCTYPE(
1028 ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
1029 ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
1030 ctypes.wintypes.LPVOID)(("WriteConsoleW", ctypes.windll.kernel32))
1031 written = ctypes.wintypes.DWORD(0)
1033 GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)(("GetFileType", ctypes.windll.kernel32))
1034 FILE_TYPE_CHAR = 0x0002
1035 FILE_TYPE_REMOTE = 0x8000
1036 GetConsoleMode = ctypes.WINFUNCTYPE(
1037 ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
1038 ctypes.POINTER(ctypes.wintypes.DWORD))(
1039 ("GetConsoleMode", ctypes.windll.kernel32))
1040 INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value
1042 def not_a_console(handle):
1043 if handle == INVALID_HANDLE_VALUE or handle is None:
1045 return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR
1046 or GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)
1048 if not_a_console(h):
1051 def next_nonbmp_pos(s):
1053 return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
1054 except StopIteration:
1058 count = min(next_nonbmp_pos(s), 1024)
1060 ret = WriteConsoleW(
1061 h, s, count if count else 2, ctypes.byref(written), None)
1063 raise OSError('Failed to write string')
1064 if not count: # We just wrote a non-BMP character
1065 assert written.value == 2
1068 assert written.value > 0
1069 s = s[written.value:]
1073 def write_string(s, out=None, encoding=None):
1076 assert type(s) == compat_str
1078 if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
1079 if _windows_write_string(s, out):
1082 if ('b' in getattr(out, 'mode', '') or
1083 sys.version_info[0] < 3): # Python 2 lies about mode of sys.stderr
1084 byt = s.encode(encoding or preferredencoding(), 'ignore')
1086 elif hasattr(out, 'buffer'):
1087 enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
1088 byt = s.encode(enc, 'ignore')
1089 out.buffer.write(byt)
1095 def bytes_to_intlist(bs):
1098 if isinstance(bs[0], int): # Python 3
1101 return [ord(c) for c in bs]
1104 def intlist_to_bytes(xs):
1107 if isinstance(chr(0), bytes): # Python 2
1108 return ''.join([chr(x) for x in xs])
1113 # Cross-platform file locking
1114 if sys.platform == 'win32':
1115 import ctypes.wintypes
1118 class OVERLAPPED(ctypes.Structure):
1120 ('Internal', ctypes.wintypes.LPVOID),
1121 ('InternalHigh', ctypes.wintypes.LPVOID),
1122 ('Offset', ctypes.wintypes.DWORD),
1123 ('OffsetHigh', ctypes.wintypes.DWORD),
1124 ('hEvent', ctypes.wintypes.HANDLE),
1127 kernel32 = ctypes.windll.kernel32
1128 LockFileEx = kernel32.LockFileEx
1129 LockFileEx.argtypes = [
1130 ctypes.wintypes.HANDLE, # hFile
1131 ctypes.wintypes.DWORD, # dwFlags
1132 ctypes.wintypes.DWORD, # dwReserved
1133 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
1134 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
1135 ctypes.POINTER(OVERLAPPED) # Overlapped
1137 LockFileEx.restype = ctypes.wintypes.BOOL
1138 UnlockFileEx = kernel32.UnlockFileEx
1139 UnlockFileEx.argtypes = [
1140 ctypes.wintypes.HANDLE, # hFile
1141 ctypes.wintypes.DWORD, # dwReserved
1142 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
1143 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
1144 ctypes.POINTER(OVERLAPPED) # Overlapped
1146 UnlockFileEx.restype = ctypes.wintypes.BOOL
1147 whole_low = 0xffffffff
1148 whole_high = 0x7fffffff
def _lock_file(f, exclusive):
    # Lock the whole file through the Win32 LockFileEx API.
    overlapped = OVERLAPPED()
    overlapped.Offset = 0
    overlapped.OffsetHigh = 0
    overlapped.hEvent = 0
    # Keep the OVERLAPPED pointer alive on the file object; _unlock_file
    # reuses it to release the same byte range.
    f._lock_file_overlapped_p = ctypes.pointer(overlapped)
    handle = msvcrt.get_osfhandle(f.fileno())
    flags = 0x2 if exclusive else 0x0  # 0x2 == LOCKFILE_EXCLUSIVE_LOCK
    if not LockFileEx(handle, flags, 0, whole_low, whole_high,
                      f._lock_file_overlapped_p):
        raise OSError('Locking file failed: %r' % ctypes.FormatError())
def _unlock_file(f):
    # Release the range locked by _lock_file, using the stored OVERLAPPED.
    assert f._lock_file_overlapped_p
    handle = msvcrt.get_osfhandle(f.fileno())
    ok = UnlockFileEx(handle, 0, whole_low, whole_high,
                      f._lock_file_overlapped_p)
    if not ok:
        raise OSError('Unlocking file failed: %r' % ctypes.FormatError())
def _lock_file(f, exclusive):
    # POSIX advisory lock: exclusive (write) or shared (read).
    op = fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH
    fcntl.flock(f, op)
def _unlock_file(f):
    # Drop whatever advisory lock is currently held on f.
    fcntl.flock(f, fcntl.LOCK_UN)
1178 class locked_file(object):
1179 def __init__(self, filename, mode, encoding=None):
1180 assert mode in ['r', 'a', 'w']
1181 self.f = io.open(filename, mode, encoding=encoding)
1184 def __enter__(self):
1185 exclusive = self.mode != 'r'
1187 _lock_file(self.f, exclusive)
1193 def __exit__(self, etype, value, traceback):
1195 _unlock_file(self.f)
1202 def write(self, *args):
1203 return self.f.write(*args)
1205 def read(self, *args):
1206 return self.f.read(*args)
1209 def shell_quote(args):
1211 encoding = sys.getfilesystemencoding()
1212 if encoding is None:
1215 if isinstance(a, bytes):
1216 # We may get a filename encoded with 'encodeFilename'
1217 a = a.decode(encoding)
1218 quoted_args.append(pipes.quote(a))
1219 return u' '.join(quoted_args)
1222 def takewhile_inclusive(pred, seq):
1223 """ Like itertools.takewhile, but include the latest evaluated element
1224 (the first element so that Not pred(e)) """
def smuggle_url(url, data):
    """Append *data* (JSON-encoded) to *url* as a fragment, for internal use."""
    payload = {u'__youtubedl_smuggle': json.dumps(data)}
    sdata = compat_urllib_parse.urlencode(payload)
    return u'#'.join((url, sdata))
1239 def unsmuggle_url(smug_url, default=None):
1240 if not '#__youtubedl_smuggle' in smug_url:
1241 return smug_url, default
1242 url, _, sdata = smug_url.rpartition(u'#')
1243 jsond = compat_parse_qs(sdata)[u'__youtubedl_smuggle'][0]
1244 data = json.loads(jsond)
1248 def format_bytes(bytes):
1251 if type(bytes) is str:
1252 bytes = float(bytes)
1256 exponent = int(math.log(bytes, 1024.0))
1257 suffix = [u'B', u'KiB', u'MiB', u'GiB', u'TiB', u'PiB', u'EiB', u'ZiB', u'YiB'][exponent]
1258 converted = float(bytes) / float(1024 ** exponent)
1259 return u'%.2f%s' % (converted, suffix)
1262 def get_term_width():
1263 columns = os.environ.get('COLUMNS', None)
1268 sp = subprocess.Popen(
1270 stdout=subprocess.PIPE, stderr=subprocess.PIPE)
1271 out, err = sp.communicate()
1272 return int(out.split()[1])
1278 def month_by_name(name):
1279 """ Return the number of a month by (locale-independently) English name """
1282 u'January', u'February', u'March', u'April', u'May', u'June',
1283 u'July', u'August', u'September', u'October', u'November', u'December']
1285 return ENGLISH_NAMES.index(name) + 1
1290 def fix_xml_ampersands(xml_str):
1291 """Replace all the '&' by '&' in XML"""
1293 r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
def setproctitle(title):
    """Best-effort attempt to rename the current process (Linux/glibc only)."""
    assert isinstance(title, compat_str)
    try:
        libc = ctypes.cdll.LoadLibrary("libc.so.6")
    except OSError:
        # Not a glibc system; silently do nothing.
        return
    title_bytes = title.encode('utf-8')
    buf = ctypes.create_string_buffer(len(title_bytes))
    buf.value = title_bytes
    try:
        libc.prctl(15, buf, 0, 0, 0)  # 15 == PR_SET_NAME
    except AttributeError:
        return  # Strange libc, just skip this
def remove_start(s, start):
    """Return *s* without the prefix *start*, when present."""
    if not s.startswith(start):
        return s
    return s[len(start):]
def remove_end(s, end):
    """Return *s* without the suffix *end*, when present."""
    # Guard against end == '': s.endswith('') is always True and
    # s[:-0] would slice away the entire string.
    if end and s.endswith(end):
        return s[:-len(end)]
    return s
def url_basename(url):
    """Return the final path component of *url* (query/fragment excluded)."""
    path = compat_urlparse.urlparse(url).path
    segments = path.strip(u'/').split(u'/')
    return segments[-1]
class HEADRequest(compat_urllib_request.Request):
    """A Request whose HTTP method is HEAD instead of GET."""

    def get_method(self):
        return "HEAD"
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    """Convert *v* (optionally an attribute of it) to int, scaled by
    invscale/scale; return *default* when the value is missing."""
    if get_attr and v is not None:
        v = getattr(v, get_attr, None)
    if v is None:
        return default
    return int(v) * invscale // scale
def str_or_none(v, default=None):
    """Convert *v* to a string, or return *default* when it is None."""
    if v is None:
        return default
    return compat_str(v)
def str_to_int(int_str):
    """ A more relaxed version of int_or_none """
    if int_str is None:
        return None
    # Drop thousands separators and a leading '+'.
    digits = re.sub(r'[,\.\+]', u'', int_str)
    return int(digits)
def float_or_none(v, scale=1, invscale=1, default=None):
    """Convert *v* to float scaled by invscale/scale; *default* if v is None."""
    if v is None:
        return default
    return float(v) * invscale / scale
def parse_duration(s):
    """Parse a duration string ('1:23:45', '3 min 10s', '9.5s', ...) into
    seconds; return None for unparseable input."""
    if s is None:
        return None

    match = re.match(
        r'(?i)(?:(?:(?P<hours>[0-9]+)\s*(?:[:h]|hours?)\s*)?(?P<mins>[0-9]+)\s*(?:[:m]|mins?|minutes?)\s*)?(?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*(?:s|secs?|seconds?)?$', s)
    if match is None:
        return None
    total = int(match.group('secs'))
    if match.group('mins'):
        total += int(match.group('mins')) * 60
        if match.group('hours'):
            total += int(match.group('hours')) * 60 * 60
    if match.group('ms'):
        # Fractional seconds promote the result to float.
        total += float(match.group('ms'))
    return total
def prepend_extension(filename, ext):
    """Insert *ext* before the real extension: ('a.mp4', 'f4m') -> 'a.f4m.mp4'."""
    name, real_ext = os.path.splitext(filename)
    return u'%s.%s%s' % (name, ext, real_ext)
def check_executable(exe, args=None):
    """ Checks if the given binary is installed somewhere in PATH, and returns its name.
    args can be a list of arguments for a short output (like -version) """
    # Default changed from the mutable 'args=[]' to None (same observable
    # behavior, avoids the shared-mutable-default pitfall).
    if args is None:
        args = []
    try:
        subprocess.Popen([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
    except OSError:
        return False
    return exe
class PagedList(object):
    """Base class for lazily-fetched, page-backed sequences.

    Subclasses implement getslice(start, end).
    """

    def __len__(self):
        # This is only useful for tests
        return len(self.getslice())
class OnDemandPagedList(PagedList):
    """PagedList that fetches each page lazily through *pagefunc*."""

    def __init__(self, pagefunc, pagesize):
        self._pagefunc = pagefunc
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        res = []
        for pagenum in itertools.count(start // self._pagesize):
            firstid = pagenum * self._pagesize
            nextfirstid = firstid + self._pagesize
            if start >= nextfirstid:
                continue

            page_results = list(self._pagefunc(pagenum))

            # Offsets of the requested range within this page.
            startv = (
                start % self._pagesize
                if firstid <= start < nextfirstid
                else 0)
            endv = (
                ((end - 1) % self._pagesize) + 1
                if (end is not None and firstid <= end <= nextfirstid)
                else None)

            if startv != 0 or endv is not None:
                page_results = page_results[startv:endv]
            res.extend(page_results)

            # A short page must be the last one - no more ids on further
            # pages, so there is no need to query again.
            if len(page_results) + startv < self._pagesize:
                break

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == nextfirstid:
                break
        return res
class InAdvancePagedList(PagedList):
    """PagedList whose total page count is known up front."""

    def __init__(self, pagefunc, pagecount, pagesize):
        self._pagefunc = pagefunc
        self._pagecount = pagecount
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        res = []
        start_page = start // self._pagesize
        end_page = (
            self._pagecount if end is None else (end // self._pagesize + 1))
        # Elements to drop from the first fetched page.
        skip_elems = start - start_page * self._pagesize
        # Total number of elements still wanted (None == all).
        only_more = None if end is None else end - start
        for pagenum in range(start_page, end_page):
            page = list(self._pagefunc(pagenum))
            if skip_elems:
                page = page[skip_elems:]
                skip_elems = None
            if only_more is not None:
                if len(page) < only_more:
                    only_more -= len(page)
                else:
                    page = page[:only_more]
                    res.extend(page)
                    break
            res.extend(page)
        return res
def uppercase_escape(s):
    """Decode uppercase '\\UXXXXXXXX' escape sequences embedded in *s*."""
    decoder = codecs.getdecoder('unicode_escape')

    def _decode(match):
        return decoder(match.group(0))[0]

    return re.sub(r'\\U[0-9a-fA-F]{8}', _decode, s)
def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986"""
    # On Python 2, quote() needs bytes; the short-circuit keeps the
    # py2-only name 'unicode' from being evaluated on Python 3.
    if sys.version_info < (3, 0) and isinstance(s, unicode):
        s = s.encode('utf-8')
    return compat_urllib_parse.quote(s, "%/;:@&=+$,!~*'()?#[]")
def escape_url(url):
    """Escape URL as suggested by RFC 3986"""
    parts = compat_urllib_parse_urlparse(url)
    # Escape each component separately so the URL structure survives.
    return parts._replace(
        path=escape_rfc3986(parts.path),
        params=escape_rfc3986(parts.params),
        query=escape_rfc3986(parts.query),
        fragment=escape_rfc3986(parts.fragment)
    ).geturl()
try:
    struct.pack(u'!I', 0)
except TypeError:
    # In Python 2.6 (and some 2.7 versions), struct requires a bytes argument
    def struct_pack(fmt, *args):
        if isinstance(fmt, compat_str):
            fmt = fmt.encode('ascii')
        return struct.pack(fmt, *args)

    def struct_unpack(fmt, *args):
        if isinstance(fmt, compat_str):
            fmt = fmt.encode('ascii')
        return struct.unpack(fmt, *args)
else:
    # struct accepts text format strings; use it directly.
    struct_pack = struct.pack
    struct_unpack = struct.unpack
def read_batch_urls(batch_fd):
    """Read a batch file object and return its cleaned-up list of URLs."""
    def _sanitize(url):
        url = url.strip()
        if not isinstance(url, compat_str):
            url = url.decode('utf-8', 'replace')
        BOM_UTF8 = u'\xef\xbb\xbf'
        if url.startswith(BOM_UTF8):
            url = url[len(BOM_UTF8):]
        url = url.strip()
        # Lines starting with '#', ';' or ']' are comments.
        if url.startswith(('#', ';', ']')):
            return False
        return url

    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(_sanitize, fd) if url]
def urlencode_postdata(*args, **kargs):
    """urlencode the given data and return it as ascii bytes, ready to POST."""
    encoded = compat_urllib_parse.urlencode(*args, **kargs)
    return encoded.encode('ascii')
if hasattr(xml.etree.ElementTree.Element, 'iter'):
    etree_iter = xml.etree.ElementTree.Element.iter
else:  # Python <=2.6 lacks Element.iter
    etree_iter = lambda n: n.findall('.//*')
def parse_xml(s):
    """Parse the XML document in string *s*, silently ignoring any doctype."""
    class TreeBuilder(xml.etree.ElementTree.TreeBuilder):
        def doctype(self, name, pubid, system):
            pass  # Ignore doctypes

    parser = xml.etree.ElementTree.XMLParser(target=TreeBuilder())
    # Python 2.6's XML() does not accept a 'parser' keyword.
    kwargs = {'parser': parser} if sys.version_info >= (2, 7) else {}
    tree = xml.etree.ElementTree.XML(s.encode('utf-8'), **kwargs)
    # Fix up XML parser in Python 2.x
    if sys.version_info < (3, 0):
        for n in etree_iter(tree):
            if n.text is not None:
                if not isinstance(n.text, compat_str):
                    n.text = n.text.decode('utf-8')
    return tree
if sys.version_info >= (3, 0) or sys.platform != 'win32':
    compat_getpass = getpass.getpass
else:
    # Python 2 on Windows: getpass chokes on unicode prompts, so encode
    # the prompt with the preferred encoding first.
    def compat_getpass(prompt, *args, **kwargs):
        if isinstance(prompt, compat_str):
            prompt = prompt.encode(preferredencoding())
        return getpass.getpass(prompt, *args, **kwargs)
def strip_jsonp(code):
    """Strip a JSONP callback wrapper, returning the bare JSON payload."""
    return re.sub(
        r'(?s)^[a-zA-Z0-9_]+\s*\(\s*(.*)\);?\s*?\s*$',
        r'\1', code)
def js_to_json(code):
    """Convert a JavaScript-style object literal into valid JSON text."""
    def fix_kv(m):
        v = m.group(0)
        if v in ('true', 'false', 'null'):
            return v
        if v.startswith('"'):
            return v
        if v.startswith("'"):
            # Re-quote single-quoted strings, fixing up escapes.
            v = v[1:-1]
            v = re.sub(r"\\\\|\\'|\"", lambda m: {
                '\\\\': '\\\\',
                "\\'": "'",
                '"': '\\"',
            }[m.group(0)], v)
        return '"%s"' % v

    res = re.sub(r'''(?x)
        "(?:[^"\\]*(?:\\\\|\\")?)*"|
        '(?:[^'\\]*(?:\\\\|\\')?)*'|
        [a-zA-Z_][a-zA-Z_0-9]*
        ''', fix_kv, code)
    # Drop trailing commas before closing brackets.
    res = re.sub(r',(\s*\])', lambda m: m.group(1), res)
    return res
def qualities(quality_ids):
    """ Get a numeric quality value out of a list of possible values """
    def q(qid):
        try:
            return quality_ids.index(qid)
        except ValueError:
            # Unknown quality id ranks below every known one.
            return -1
    return q
# Default output filename template: "<title>-<id>.<ext>".
DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
try:
    subprocess_check_output = subprocess.check_output
except AttributeError:
    # Python 2.6's subprocess module lacks check_output; emulate it.
    def subprocess_check_output(*args, **kwargs):
        assert 'input' not in kwargs
        p = subprocess.Popen(*args, stdout=subprocess.PIPE, **kwargs)
        output, _ = p.communicate()
        ret = p.poll()
        if ret:
            raise subprocess.CalledProcessError(ret, p.args, output=output)
        return output
def limit_length(s, length):
    """ Add ellipses to overly long strings """
    if s is None:
        return None
    ellipses = '...'
    if len(s) <= length:
        return s
    return s[:length - len(ellipses)] + ellipses