# -*- coding: utf-8 -*-

import calendar
import codecs
import contextlib
import ctypes
import datetime
import email.utils
import errno
import getpass
import gzip
import io
import itertools
import json
import locale
import math
import os
import pipes
import platform
import re
import socket
import ssl
import struct
import subprocess
import sys
import tempfile
import traceback
import xml.etree.ElementTree
import zlib

try:
    import urllib.request as compat_urllib_request
except ImportError:  # Python 2
    import urllib2 as compat_urllib_request

try:
    import urllib.error as compat_urllib_error
except ImportError:  # Python 2
    import urllib2 as compat_urllib_error

try:
    import urllib.parse as compat_urllib_parse
except ImportError:  # Python 2
    import urllib as compat_urllib_parse

try:
    from urllib.parse import urlparse as compat_urllib_parse_urlparse
except ImportError:  # Python 2
    from urlparse import urlparse as compat_urllib_parse_urlparse

try:
    import urllib.parse as compat_urlparse
except ImportError:  # Python 2
    import urlparse as compat_urlparse

try:
    import http.cookiejar as compat_cookiejar
except ImportError:  # Python 2
    import cookielib as compat_cookiejar

try:
    import html.entities as compat_html_entities
except ImportError:  # Python 2
    import htmlentitydefs as compat_html_entities

try:
    import html.parser as compat_html_parser
except ImportError:  # Python 2
    import HTMLParser as compat_html_parser

try:
    import http.client as compat_http_client
except ImportError:  # Python 2
    import httplib as compat_http_client

try:
    from urllib.error import HTTPError as compat_HTTPError
except ImportError:  # Python 2
    from urllib2 import HTTPError as compat_HTTPError

try:
    from urllib.request import urlretrieve as compat_urlretrieve
except ImportError:  # Python 2
    from urllib import urlretrieve as compat_urlretrieve

try:
    from subprocess import DEVNULL
    compat_subprocess_get_DEVNULL = lambda: DEVNULL
except ImportError:
    compat_subprocess_get_DEVNULL = lambda: open(os.path.devnull, 'w')


try:
    from urllib.parse import unquote as compat_urllib_parse_unquote
except ImportError:  # Python 2: backport of the Python 3 unquote
    def compat_urllib_parse_unquote(string, encoding='utf-8', errors='replace'):
        if string == '':
            return string
        res = string.split('%')
        if len(res) == 1:
            return string
        if encoding is None:
            encoding = 'utf-8'
        if errors is None:
            errors = 'replace'
        # pct_sequence: contiguous sequence of percent-encoded bytes, decoded
        pct_sequence = b''
        string = res[0]
        for item in res[1:]:
            try:
                if not item:
                    raise ValueError
                pct_sequence += item[:2].decode('hex')
                rest = item[2:]
                if not rest:
                    # This segment was just a single percent-encoded character.
                    # May be part of a sequence of code units, so delay decoding.
                    # (Stored in pct_sequence).
                    continue
            except ValueError:
                rest = '%' + item
            # Encountered non-percent-encoded characters. Flush the current
            # pct_sequence.
            string += pct_sequence.decode(encoding, errors) + rest
            pct_sequence = b''
        if pct_sequence:
            # Flush the final pct_sequence
            string += pct_sequence.decode(encoding, errors)
        return string


try:
    from urllib.parse import parse_qs as compat_parse_qs
except ImportError:  # Python 2
    # HACK: The following is the correct parse_qs implementation from cpython 3's stdlib.
    # Python 2's version is apparently totally broken.
    def _parse_qsl(qs, keep_blank_values=False, strict_parsing=False,
                   encoding='utf-8', errors='replace'):
        qs, _coerce_result = qs, unicode
        pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')]
        r = []
        for name_value in pairs:
            if not name_value and not strict_parsing:
                continue
            nv = name_value.split('=', 1)
            if len(nv) != 2:
                if strict_parsing:
                    raise ValueError("bad query field: %r" % (name_value,))
                # Handle case of a control-name with no equal sign
                if keep_blank_values:
                    nv.append('')
                else:
                    continue
            if len(nv[1]) or keep_blank_values:
                name = nv[0].replace('+', ' ')
                name = compat_urllib_parse_unquote(
                    name, encoding=encoding, errors=errors)
                name = _coerce_result(name)
                value = nv[1].replace('+', ' ')
                value = compat_urllib_parse_unquote(
                    value, encoding=encoding, errors=errors)
                value = _coerce_result(value)
                r.append((name, value))
        return r

    def compat_parse_qs(qs, keep_blank_values=False, strict_parsing=False,
                        encoding='utf-8', errors='replace'):
        parsed_result = {}
        pairs = _parse_qsl(qs, keep_blank_values, strict_parsing,
                           encoding=encoding, errors=errors)
        for name, value in pairs:
            if name in parsed_result:
                parsed_result[name].append(value)
            else:
                parsed_result[name] = [value]
        return parsed_result


try:
    compat_str = unicode  # Python 2
except NameError:
    compat_str = str

try:
    compat_chr = unichr  # Python 2
except NameError:
    compat_chr = chr

try:
    from xml.etree.ElementTree import ParseError as compat_xml_parse_error
except ImportError:  # Python 2.6
    from xml.parsers.expat import ExpatError as compat_xml_parse_error

try:
    from shlex import quote as shlex_quote
except ImportError:  # Python < 3.3
    def shlex_quote(s):
        return "'" + s.replace("'", "'\"'\"'") + "'"


def compat_ord(c):
    if type(c) is int:
        return c
    else:
        return ord(c)


# This is not clearly defined otherwise
compiled_regex_type = type(re.compile(''))

std_headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0 (Chrome)',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
}


def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks.
    """
    try:
        pref = locale.getpreferredencoding()
        u'TEST'.encode(pref)
    except Exception:
        pref = 'UTF-8'

    return pref


if sys.version_info < (3, 0):
    def compat_print(s):
        print(s.encode(preferredencoding(), 'xmlcharrefreplace'))
else:
    def compat_print(s):
        assert type(s) == type(u'')
        print(s)


def write_json_file(obj, fn):
    """ Encode obj as JSON and write it to fn, atomically """

    args = {
        'suffix': '.tmp',
        'prefix': os.path.basename(fn) + '.',
        'dir': os.path.dirname(fn),
        'delete': False,
    }

    # In Python 2.x, json.dump expects a bytestream.
    # In Python 3.x, it writes to a character stream
    if sys.version_info < (3, 0):
        args['mode'] = 'wb'
    else:
        args['mode'] = 'w'
        args['encoding'] = 'utf-8'

    tf = tempfile.NamedTemporaryFile(**args)
    try:
        with tf:
            json.dump(obj, tf)
        os.rename(tf.name, fn)
    except Exception:
        try:
            os.remove(tf.name)
        except OSError:
            pass
        raise


if sys.version_info >= (2, 7):
    def find_xpath_attr(node, xpath, key, val):
        """ Find the xpath xpath[@key=val] """
        assert re.match(r'^[a-zA-Z-]+$', key)
        assert re.match(r'^[a-zA-Z0-9@\s:._-]*$', val)
        expr = xpath + u"[@%s='%s']" % (key, val)
        return node.find(expr)
else:
    def find_xpath_attr(node, xpath, key, val):
        for f in node.findall(xpath):
            if f.attrib.get(key) == val:
                return f
        return None


# On python2.6 the xml.etree.ElementTree.Element methods don't support
# the namespace parameter
def xpath_with_ns(path, ns_map):
    components = [c.split(':') for c in path.split('/')]
    replaced = []
    for c in components:
        if len(c) == 1:
            replaced.append(c[0])
        else:
            ns, tag = c
            replaced.append('{%s}%s' % (ns_map[ns], tag))
    return '/'.join(replaced)
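

# Illustrative usage sketch (not part of the original module; the helper name
# is hypothetical): xpath_with_ns expands namespace prefixes into Clark
# notation so the path works with Python 2.6's ElementTree.
def _example_xpath_with_ns():
    ns_map = {'ns0': 'http://www.w3.org/2005/Atom'}
    expanded = xpath_with_ns('ns0:body/ns0:div', ns_map)
    assert expanded == '{http://www.w3.org/2005/Atom}body/{http://www.w3.org/2005/Atom}div'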


compat_html_parser.locatestarttagend = re.compile(r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re.VERBOSE)  # backport bugfix


class BaseHTMLParser(compat_html_parser.HTMLParser):
    def __init__(self):
        compat_html_parser.HTMLParser.__init__(self)
        self.html = None

    def loads(self, html):
        self.html = html
        self.feed(html)
        self.close()


class AttrParser(BaseHTMLParser):
    """Modified HTMLParser that isolates a tag with the specified attribute"""
    def __init__(self, attribute, value):
        self.attribute = attribute
        self.value = value
        self.result = None
        self.started = False
        self.depth = {}
        self.watch_startpos = False
        self.error_count = 0
        BaseHTMLParser.__init__(self)

    def error(self, message):
        if self.error_count > 10 or self.started:
            raise compat_html_parser.HTMLParseError(message, self.getpos())
        self.rawdata = '\n'.join(self.html.split('\n')[self.getpos()[0]:])  # skip one line
        self.error_count += 1
        self.loads(self.rawdata)

    def handle_starttag(self, tag, attrs):
        attrs = dict(attrs)
        if self.started:
            self.find_startpos(None)
        if self.attribute in attrs and attrs[self.attribute] == self.value:
            self.result = [tag]
            self.started = True
            self.watch_startpos = True
        if self.started:
            if tag not in self.depth:
                self.depth[tag] = 0
            self.depth[tag] += 1

    def handle_endtag(self, tag):
        if self.started:
            if tag in self.depth:
                self.depth[tag] -= 1
            if self.depth[self.result[0]] == 0:
                self.started = False
                self.result.append(self.getpos())

    def find_startpos(self, x):
        """Needed to put the start position of the result (self.result[1])
        after the opening tag with the requested id"""
        if self.watch_startpos:
            self.watch_startpos = False
            self.result.append(self.getpos())
    handle_entityref = handle_charref = handle_data = handle_comment = \
        handle_decl = handle_pi = unknown_decl = find_startpos

    def get_result(self):
        if self.result is None:
            return None
        if len(self.result) != 3:
            return None
        lines = self.html.split('\n')
        lines = lines[self.result[1][0] - 1:self.result[2][0]]
        lines[0] = lines[0][self.result[1][1]:]
        if len(lines) == 1:
            lines[-1] = lines[-1][:self.result[2][1] - self.result[1][1]]
        lines[-1] = lines[-1][:self.result[2][1]]
        return '\n'.join(lines).strip()


# Hack for https://github.com/rg3/youtube-dl/issues/662
if sys.version_info < (2, 7, 3):
    AttrParser.parse_endtag = (lambda self, i:
        i + len("</scr'+'ipt>")
        if self.rawdata[i:].startswith("</scr'+'ipt>")
        else compat_html_parser.HTMLParser.parse_endtag(self, i))


def get_element_by_id(id, html):
    """Return the content of the tag with the specified ID in the passed HTML document"""
    return get_element_by_attribute("id", id, html)


def get_element_by_attribute(attribute, value, html):
    """Return the content of the tag with the specified attribute in the passed HTML document"""
    parser = AttrParser(attribute, value)
    try:
        parser.loads(html)
    except compat_html_parser.HTMLParseError:
        pass
    return parser.get_result()


class MetaParser(BaseHTMLParser):
    """
    Modified HTMLParser that isolates a meta tag with the specified name
    attribute.
    """
    def __init__(self, name):
        BaseHTMLParser.__init__(self)
        self.name = name
        self.result = None

    def handle_starttag(self, tag, attrs):
        if tag != 'meta':
            return
        attrs = dict(attrs)
        if attrs.get('name') == self.name:
            self.result = attrs.get('content')

    def get_result(self):
        return self.result


def get_meta_content(name, html):
    """
    Return the content attribute from the meta tag with the given name attribute.
    """
    parser = MetaParser(name)
    try:
        parser.loads(html)
    except compat_html_parser.HTMLParseError:
        pass
    return parser.get_result()


def clean_html(html):
    """Clean an HTML snippet into a readable string"""
    # Newline vs <br />
    html = html.replace('\n', ' ')
    html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
    html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
    # Strip html tags
    html = re.sub('<.*?>', '', html)
    # Replace html entities
    html = unescapeHTML(html)
    return html.strip()
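

# Illustrative usage sketch (not part of the original module; the helper name
# is hypothetical): paragraph breaks become newlines, remaining tags are
# stripped and entities unescaped.
def _example_clean_html():
    assert clean_html(u'<p>foo</p><p>bar &amp; baz</p>') == u'foo\nbar & baz'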


def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    It returns the tuple (stream, definitive_file_name).
    """
    try:
        if filename == u'-':
            if sys.platform == 'win32':
                import msvcrt
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
            return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
        stream = open(encodeFilename(filename), open_mode)
        return (stream, filename)
    except (IOError, OSError) as err:
        if err.errno in (errno.EACCES,):
            raise

        # In case of error, try to remove win32 forbidden chars
        alt_filename = os.path.join(*(
            re.sub(u'[/<>:"\\|\\\\?\\*]', u'#', path_part)
            for path_part in os.path.split(filename)))
        if alt_filename == filename:
            raise
        else:
            # An exception here should be caught in the caller
            stream = open(encodeFilename(alt_filename), open_mode)
            return (stream, alt_filename)


def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp"""
    timestamp = None
    timetuple = email.utils.parsedate_tz(timestr)
    if timetuple is not None:
        timestamp = email.utils.mktime_tz(timetuple)
    return timestamp


def sanitize_filename(s, restricted=False, is_id=False):
    """Sanitizes a string so it could be used as part of a filename.
    If restricted is set, use a stricter subset of allowed characters.
    Set is_id if this is not an arbitrary string, but an ID that should be kept if possible.
    """
    def replace_insane(char):
        if char == '?' or ord(char) < 32 or ord(char) == 127:
            return ''
        elif char == '"':
            return '' if restricted else '\''
        elif char == ':':
            return '_-' if restricted else ' -'
        elif char in '\\/|*<>':
            return '_'
        if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
            return '_'
        if restricted and ord(char) > 127:
            return '_'
        return char

    result = u''.join(map(replace_insane, s))
    if not is_id:
        while '__' in result:
            result = result.replace('__', '_')
        result = result.strip('_')
        # Common case of "Foreign band name - English song title"
        if restricted and result.startswith('-_'):
            result = result[2:]
        if not result:
            result = '_'
    return result
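

# Illustrative usage sketch (not part of the original module; the helper name
# is hypothetical): restricted mode maps unsafe characters to '_' instead of
# only dropping the ones that are invalid on Windows.
def _example_sanitize_filename():
    assert sanitize_filename(u'What? A video!') == u'What A video!'
    assert sanitize_filename(u'What? A video!', restricted=True) == u'What_A_video'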


def orderedSet(iterable):
    """ Remove all duplicates from the input iterable """
    res = []
    for el in iterable:
        if el not in res:
            res.append(el)
    return res


def _htmlentity_transform(entity):
    """Transforms an HTML entity to a character."""
    # Known non-numeric HTML entity
    if entity in compat_html_entities.name2codepoint:
        return compat_chr(compat_html_entities.name2codepoint[entity])

    mobj = re.match(r'#(x?[0-9]+)', entity)
    if mobj is not None:
        numstr = mobj.group(1)
        if numstr.startswith(u'x'):
            base = 16
            numstr = u'0%s' % numstr
        else:
            base = 10
        return compat_chr(int(numstr, base))

    # Unknown entity in name, return its literal representation
    return (u'&%s;' % entity)


def unescapeHTML(s):
    if s is None:
        return None
    assert type(s) == compat_str

    return re.sub(
        r'&([^;]+);', lambda m: _htmlentity_transform(m.group(1)), s)


def encodeFilename(s, for_subprocess=False):
    """
    @param s The name of the file
    """

    assert type(s) == compat_str

    # Python 3 has a Unicode API
    if sys.version_info >= (3, 0):
        return s

    if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
        # Pass u'' directly to use Unicode APIs on Windows 2000 and up
        # (Detecting Windows NT 4 is tricky because 'major >= 4' would
        # match Windows 9x series as well. Besides, NT 4 is obsolete.)
        if not for_subprocess:
            return s
        else:
            # For subprocess calls, encode with locale encoding
            # Refer to http://stackoverflow.com/a/9951851/35070
            encoding = preferredencoding()
    else:
        encoding = sys.getfilesystemencoding()
        if encoding is None:
            encoding = 'utf-8'
    return s.encode(encoding, 'ignore')


def encodeArgument(s):
    if not isinstance(s, compat_str):
        # Legacy code that uses byte strings
        # Uncomment the following line after fixing all post processors
        # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
        s = s.decode('ascii')
    return encodeFilename(s, True)


def decodeOption(optval):
    if optval is None:
        return optval
    if isinstance(optval, bytes):
        optval = optval.decode(preferredencoding())

    assert isinstance(optval, compat_str)
    return optval


def formatSeconds(secs):
    if secs > 3600:
        return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
    elif secs > 60:
        return '%d:%02d' % (secs // 60, secs % 60)
    else:
        return '%d' % secs


def make_HTTPS_handler(opts_no_check_certificate, **kwargs):
    if sys.version_info < (3, 2):
        import httplib

        class HTTPSConnectionV3(httplib.HTTPSConnection):
            def __init__(self, *args, **kwargs):
                httplib.HTTPSConnection.__init__(self, *args, **kwargs)

            def connect(self):
                sock = socket.create_connection((self.host, self.port), self.timeout)
                if getattr(self, '_tunnel_host', False):
                    self.sock = sock
                    self._tunnel()
                try:
                    self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_SSLv3)
                except ssl.SSLError:
                    self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_SSLv23)

        class HTTPSHandlerV3(compat_urllib_request.HTTPSHandler):
            def https_open(self, req):
                return self.do_open(HTTPSConnectionV3, req)
        return HTTPSHandlerV3(**kwargs)
    else:
        context = ssl.SSLContext(ssl.PROTOCOL_SSLv3)
        context.verify_mode = (ssl.CERT_NONE
                               if opts_no_check_certificate
                               else ssl.CERT_REQUIRED)
        context.set_default_verify_paths()
        try:
            context.load_default_certs()
        except AttributeError:
            pass
        return compat_urllib_request.HTTPSHandler(context=context, **kwargs)


class ExtractorError(Exception):
    """Error during info extraction."""
    def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
        """

        if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
            expected = True
        if video_id is not None:
            msg = video_id + ': ' + msg
        if not expected:
            msg = msg + u'; please report this issue on https://yt-dl.org/bug . Be sure to call youtube-dl with the --verbose flag and include its complete output. Make sure you are using the latest version; type youtube-dl -U to update.'
        super(ExtractorError, self).__init__(msg)

        self.traceback = tb
        self.exc_info = sys.exc_info()  # preserve original exception
        self.cause = cause
        self.video_id = video_id

    def format_traceback(self):
        if self.traceback is None:
            return None
        return u''.join(traceback.format_tb(self.traceback))


class RegexNotFoundError(ExtractorError):
    """Error when a regex didn't match"""
    pass


class DownloadError(Exception):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """
    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        super(DownloadError, self).__init__(msg)
        self.exc_info = exc_info


class SameFileError(Exception):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
    pass


class PostProcessingError(Exception):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """
    def __init__(self, msg):
        self.msg = msg


class MaxDownloadsReached(Exception):
    """ --max-downloads limit has been reached. """
    pass


class UnavailableVideoError(Exception):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """
    pass


class ContentTooShortError(Exception):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """
    # Both in bytes
    downloaded = None
    expected = None

    def __init__(self, downloaded, expected):
        self.downloaded = downloaded
        self.expected = expected


class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-No-Compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    @staticmethod
    def deflate(data):
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def addinfourl_wrapper(stream, headers, url, code):
        if hasattr(compat_urllib_request.addinfourl, 'getcode'):
            return compat_urllib_request.addinfourl(stream, headers, url, code)
        ret = compat_urllib_request.addinfourl(stream, headers, url)
        ret.code = code
        return ret

    def http_request(self, req):
        for h, v in std_headers.items():
            if h not in req.headers:
                req.add_header(h, v)
        if 'Youtubedl-no-compression' in req.headers:
            if 'Accept-encoding' in req.headers:
                del req.headers['Accept-encoding']
            del req.headers['Youtubedl-no-compression']
        if 'Youtubedl-user-agent' in req.headers:
            if 'User-agent' in req.headers:
                del req.headers['User-agent']
            req.headers['User-agent'] = req.headers['Youtubedl-user-agent']
            del req.headers['Youtubedl-user-agent']
        return req

    def http_response(self, req, resp):
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
            try:
                uncompressed = io.BytesIO(gz.read())
            except IOError as original_ioerror:
                # There may be junk at the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                for i in range(1, 1024):
                    try:
                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
                        uncompressed = io.BytesIO(gz.read())
                    except IOError:
                        continue
                    break
                else:
                    raise original_ioerror
            resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = io.BytesIO(self.deflate(resp.read()))
            resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        return resp

    https_request = http_request
    https_response = http_response


def parse_iso8601(date_str, delimiter='T'):
    """ Return a UNIX timestamp from the given date """

    if date_str is None:
        return None

    m = re.search(
        r'Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$',
        date_str)
    if not m:
        timezone = datetime.timedelta()
    else:
        date_str = date_str[:-len(m.group(0))]
        if not m.group('sign'):
            timezone = datetime.timedelta()
        else:
            sign = 1 if m.group('sign') == '+' else -1
            timezone = datetime.timedelta(
                hours=sign * int(m.group('hours')),
                minutes=sign * int(m.group('minutes')))
    date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
    dt = datetime.datetime.strptime(date_str, date_format) - timezone
    return calendar.timegm(dt.timetuple())
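

# Illustrative usage sketch (not part of the original module; the helper name
# is hypothetical): the UTC offset is parsed and subtracted before the value
# is converted to a UNIX timestamp.
def _example_parse_iso8601():
    # 2014-05-12T10:30:00+02:00 is 2014-05-12T08:30:00 UTC
    assert parse_iso8601(u'2014-05-12T10:30:00+02:00') == 1399883400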


def unified_strdate(date_str):
    """Return a string with the date in the format YYYYMMDD"""
    if date_str is None:
        return None

    upload_date = None
    # Replace commas
    date_str = date_str.replace(',', ' ')
    # %z (UTC offset) is only supported in python>=3.2
    date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str)
    format_expressions = [
        '%b %dst %Y %I:%M%p',
        '%b %dnd %Y %I:%M%p',
        '%b %dth %Y %I:%M%p',
        '%Y-%m-%dT%H:%M:%SZ',
        '%Y-%m-%dT%H:%M:%S.%fZ',
        '%Y-%m-%dT%H:%M:%S.%f0Z',
        '%Y-%m-%dT%H:%M:%S.%f',
    ]
    for expression in format_expressions:
        try:
            upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
        except ValueError:
            pass
    if upload_date is None:
        timetuple = email.utils.parsedate_tz(date_str)
        if timetuple:
            upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
    return upload_date
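

# Illustrative usage sketch (not part of the original module; the helper name
# is hypothetical): whichever format expression matches, the result is always
# a YYYYMMDD string.
def _example_unified_strdate():
    assert unified_strdate(u'2014-05-12T10:30:00Z') == '20140512'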


def determine_ext(url, default_ext=u'unknown_video'):
    if url is None:
        return default_ext
    guess = url.partition(u'?')[0].rpartition(u'.')[2]
    if re.match(r'^[A-Za-z0-9]+$', guess):
        return guess
    else:
        return default_ext


def subtitles_filename(filename, sub_lang, sub_format):
    return filename.rsplit('.', 1)[0] + u'.' + sub_lang + u'.' + sub_format
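

# Illustrative usage sketch (not part of the original module; the helper name
# is hypothetical): the extension is guessed from the URL path and falls back
# to default_ext when the guess does not look like an extension.
def _example_determine_ext():
    assert determine_ext(u'http://example.com/video.mp4?foo=bar') == u'mp4'
    assert determine_ext(u'http://example.com/stream') == u'unknown_video'
    assert subtitles_filename('video.mp4', 'en', 'vtt') == 'video.en.vtt'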


def date_from_str(date_str):
    """
    Return a datetime object from a string in the format YYYYMMDD or
    (now|today)[+-][0-9](day|week|month|year)(s)?"""
    today = datetime.date.today()
    if date_str == 'now' or date_str == 'today':
        return today
    match = re.match(r'(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
    if match is not None:
        sign = match.group('sign')
        time = int(match.group('time'))
        if sign == '-':
            time = -time
        unit = match.group('unit')
        # A rough approximation for months and years
        if unit == 'month':
            unit = 'day'
            time *= 30
        elif unit == 'year':
            unit = 'day'
            time *= 365
        unit += 's'
        delta = datetime.timedelta(**{unit: time})
        return today + delta
    return datetime.datetime.strptime(date_str, "%Y%m%d").date()


def hyphenate_date(date_str):
    """
    Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
    match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
    if match is not None:
        return '-'.join(match.groups())
    else:
        return date_str
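

# Illustrative usage sketch (not part of the original module; the helper name
# is hypothetical): relative dates are resolved against today's date.
def _example_date_from_str():
    assert hyphenate_date('20140512') == '2014-05-12'
    assert date_from_str('now-1week') == datetime.date.today() - datetime.timedelta(days=7)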


class DateRange(object):
    """Represents a time interval between two dates"""
    def __init__(self, start=None, end=None):
        """start and end must be strings in the format accepted by date"""
        if start is not None:
            self.start = date_from_str(start)
        else:
            self.start = datetime.datetime.min.date()
        if end is not None:
            self.end = date_from_str(end)
        else:
            self.end = datetime.datetime.max.date()
        if self.start > self.end:
            raise ValueError('Date range: "%s", the start date must be before the end date' % self)

    @classmethod
    def day(cls, day):
        """Returns a range that only contains the given day"""
        return cls(day, day)

    def __contains__(self, date):
        """Check if the date is in the range"""
        if not isinstance(date, datetime.date):
            date = date_from_str(date)
        return self.start <= date <= self.end

    def __str__(self):
        return '%s - %s' % (self.start.isoformat(), self.end.isoformat())


def platform_name():
    """ Returns the platform name as a compat_str """
    res = platform.platform()
    if isinstance(res, bytes):
        res = res.decode(preferredencoding())

    assert isinstance(res, compat_str)
    return res


def _windows_write_string(s, out):
    """ Returns True if the string was written using special methods,
    False if it has yet to be written out."""
    # Adapted from http://stackoverflow.com/a/3259271/35070

    import ctypes.wintypes

    WIN_OUTPUT_IDS = {
        1: -11,
        2: -12,
    }

    try:
        fileno = out.fileno()
    except AttributeError:
        # If the output stream doesn't have a fileno, it's virtual
        return False
    if fileno not in WIN_OUTPUT_IDS:
        return False

    GetStdHandle = ctypes.WINFUNCTYPE(
        ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
        ("GetStdHandle", ctypes.windll.kernel32))
    h = GetStdHandle(WIN_OUTPUT_IDS[fileno])

    WriteConsoleW = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
        ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
        ctypes.wintypes.LPVOID)(("WriteConsoleW", ctypes.windll.kernel32))
    written = ctypes.wintypes.DWORD(0)

    GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)(("GetFileType", ctypes.windll.kernel32))
    FILE_TYPE_CHAR = 0x0002
    FILE_TYPE_REMOTE = 0x8000
    GetConsoleMode = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
        ctypes.POINTER(ctypes.wintypes.DWORD))(
        ("GetConsoleMode", ctypes.windll.kernel32))
    INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value

    def not_a_console(handle):
        if handle == INVALID_HANDLE_VALUE or handle is None:
            return True
        return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR
                or GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)

    if not_a_console(h):
        return False

    def next_nonbmp_pos(s):
        try:
            return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
        except StopIteration:
            return len(s)

    while s:
        count = min(next_nonbmp_pos(s), 1024)

        ret = WriteConsoleW(
            h, s, count if count else 2, ctypes.byref(written), None)
        if ret == 0:
            raise OSError('Failed to write string')
        if not count:  # We just wrote a non-BMP character
            assert written.value == 2
            s = s[1:]
        else:
            assert written.value > 0
            s = s[written.value:]
    return True


def write_string(s, out=None, encoding=None):
    if out is None:
        out = sys.stderr
    assert type(s) == compat_str

    if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
        if _windows_write_string(s, out):
            return

    if ('b' in getattr(out, 'mode', '') or
            sys.version_info[0] < 3):  # Python 2 lies about mode of sys.stderr
        byt = s.encode(encoding or preferredencoding(), 'ignore')
        out.write(byt)
    elif hasattr(out, 'buffer'):
        enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
        byt = s.encode(enc, 'ignore')
        out.buffer.write(byt)
    else:
        out.write(s)
    out.flush()


def bytes_to_intlist(bs):
    if not bs:
        return []
    if isinstance(bs[0], int):  # Python 3
        return list(bs)
    else:
        return [ord(c) for c in bs]


def intlist_to_bytes(xs):
    if not xs:
        return b''
    if isinstance(chr(0), bytes):  # Python 2
        return ''.join([chr(x) for x in xs])
    else:
        return bytes(xs)


# Cross-platform file locking
if sys.platform == 'win32':
    import ctypes.wintypes
    import msvcrt

    class OVERLAPPED(ctypes.Structure):
        _fields_ = [
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),
        ]

    kernel32 = ctypes.windll.kernel32
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive):
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)
        handle = msvcrt.get_osfhandle(f.fileno())
        if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
                          whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Locking file failed: %r' % ctypes.FormatError())

    def _unlock_file(f):
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        if not UnlockFileEx(handle, 0,
                            whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())
else:
    import fcntl

    def _lock_file(f, exclusive):
        fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)

    def _unlock_file(f):
        fcntl.flock(f, fcntl.LOCK_UN)


class locked_file(object):
    def __init__(self, filename, mode, encoding=None):
        assert mode in ['r', 'a', 'w']
        self.f = io.open(filename, mode, encoding=encoding)
        self.mode = mode

    def __enter__(self):
        exclusive = self.mode != 'r'
        try:
            _lock_file(self.f, exclusive)
        except IOError:
            self.f.close()
            raise
        return self

    def __exit__(self, etype, value, traceback):
        try:
            _unlock_file(self.f)
        finally:
            self.f.close()

    def __iter__(self):
        return iter(self.f)

    def write(self, *args):
        return self.f.write(*args)

    def read(self, *args):
        return self.f.read(*args)


def shell_quote(args):
    quoted_args = []
    encoding = sys.getfilesystemencoding()
    if encoding is None:
        encoding = 'utf-8'
    for a in args:
        if isinstance(a, bytes):
            # We may get a filename encoded with 'encodeFilename'
            a = a.decode(encoding)
        quoted_args.append(pipes.quote(a))
    return u' '.join(quoted_args)


def takewhile_inclusive(pred, seq):
    """ Like itertools.takewhile, but include the latest evaluated element
    (the first element e such that not pred(e)) """
    for e in seq:
        yield e
        if not pred(e):
            return


def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use. """

    sdata = compat_urllib_parse.urlencode(
        {u'__youtubedl_smuggle': json.dumps(data)})
    return url + u'#' + sdata


def unsmuggle_url(smug_url, default=None):
    if '#__youtubedl_smuggle' not in smug_url:
        return smug_url, default
    url, _, sdata = smug_url.rpartition(u'#')
    jsond = compat_parse_qs(sdata)[u'__youtubedl_smuggle'][0]
    data = json.loads(jsond)
    return url, data
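

# Illustrative usage sketch (not part of the original module; the helper name
# is hypothetical): data smuggled into the URL fragment survives a round trip.
def _example_smuggle_url():
    url, data = unsmuggle_url(smuggle_url(u'http://example.com/video', {'season': 1}))
    assert url == u'http://example.com/video'
    assert data == {'season': 1}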


def format_bytes(bytes):
    if bytes is None:
        return u'N/A'
    if type(bytes) is str:
        bytes = float(bytes)
    if bytes == 0.0:
        exponent = 0
    else:
        exponent = int(math.log(bytes, 1024.0))
    suffix = [u'B', u'KiB', u'MiB', u'GiB', u'TiB', u'PiB', u'EiB', u'ZiB', u'YiB'][exponent]
    converted = float(bytes) / float(1024 ** exponent)
    return u'%.2f%s' % (converted, suffix)


def get_term_width():
    columns = os.environ.get('COLUMNS', None)
    if columns:
        return int(columns)

    try:
        sp = subprocess.Popen(
            ['stty', 'size'],
            stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        out, err = sp.communicate()
        return int(out.split()[1])
    except Exception:
        pass
    return None
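

# Illustrative usage sketch (not part of the original module; the helper name
# is hypothetical): format_bytes picks a binary (1024-based) suffix.
def _example_format_bytes():
    assert format_bytes(1536) == u'1.50KiB'
    assert format_bytes(None) == u'N/A'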


def month_by_name(name):
    """ Return the number of a month by (locale-independently) English name """
    ENGLISH_NAMES = [
        u'January', u'February', u'March', u'April', u'May', u'June',
        u'July', u'August', u'September', u'October', u'November', u'December']
    try:
        return ENGLISH_NAMES.index(name) + 1
    except ValueError:
        return None


def fix_xml_ampersands(xml_str):
    """Replace all the '&' by '&amp;' in XML"""
    return re.sub(
        r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
        u'&amp;',
        xml_str)
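

# Illustrative usage sketch (not part of the original module; the helper name
# is hypothetical): bare ampersands are escaped, existing entities are kept.
def _example_fix_xml_ampersands():
    assert fix_xml_ampersands(u'a=1&b=2&amp;c') == u'a=1&amp;b=2&amp;c'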


def setproctitle(title):
    assert isinstance(title, compat_str)
    try:
        libc = ctypes.cdll.LoadLibrary("libc.so.6")
    except OSError:
        return
    title_bytes = title.encode('utf-8')
    buf = ctypes.create_string_buffer(len(title_bytes))
    buf.value = title_bytes
    try:
        libc.prctl(15, buf, 0, 0, 0)
    except AttributeError:
        return  # Strange libc, just skip this


def remove_start(s, start):
    if s.startswith(start):
        return s[len(start):]
    return s


def remove_end(s, end):
    if s.endswith(end):
        return s[:-len(end)]
    return s


def url_basename(url):
    path = compat_urlparse.urlparse(url).path
    return path.strip(u'/').split(u'/')[-1]
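

# Illustrative usage sketch (not part of the original module; the helper name
# is hypothetical): only the last path component is returned, query excluded.
def _example_url_basename():
    assert url_basename(u'http://example.com/a/b/video.mp4?x=1') == u'video.mp4'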


class HEADRequest(compat_urllib_request.Request):
    def get_method(self):
        return "HEAD"


def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    if get_attr:
        if v is not None:
            v = getattr(v, get_attr, None)
    return default if v is None else (int(v) * invscale // scale)


def str_or_none(v, default=None):
    return default if v is None else compat_str(v)


def str_to_int(int_str):
    """ A more relaxed version of int_or_none """
    if int_str is None:
        return None
    int_str = re.sub(r'[,\.\+]', u'', int_str)
    return int(int_str)


def float_or_none(v, scale=1, invscale=1, default=None):
    return default if v is None else (float(v) * invscale / scale)
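

# Illustrative usage sketch (not part of the original module; the helper name
# is hypothetical): the *_or_none helpers tolerate missing metadata fields.
def _example_int_or_none():
    assert int_or_none('42') == 42
    assert int_or_none(None, default=0) == 0
    assert str_to_int('1,234') == 1234
    assert float_or_none('2.5', invscale=1000) == 2500.0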


def parse_duration(s):
    if s is None:
        return None

    m = re.match(
        r'(?i)(?:(?:(?P<hours>[0-9]+)\s*(?:[:h]|hours?)\s*)?(?P<mins>[0-9]+)\s*(?:[:m]|mins?|minutes?)\s*)?(?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*(?:s|secs?|seconds?)?$', s)
    if m is None:
        return None
    res = int(m.group('secs'))
    if m.group('mins'):
        res += int(m.group('mins')) * 60
        if m.group('hours'):
            res += int(m.group('hours')) * 60 * 60
    if m.group('ms'):
        res += float(m.group('ms'))
    return res
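

# Illustrative usage sketch (not part of the original module; the helper name
# is hypothetical): colon-separated and spelled-out durations both normalize
# to seconds.
def _example_parse_duration():
    assert parse_duration(u'1:23:45') == 5025
    assert parse_duration(u'3 min 10 s') == 190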


def prepend_extension(filename, ext):
    name, real_ext = os.path.splitext(filename)
    return u'{0}.{1}{2}'.format(name, ext, real_ext)


def check_executable(exe, args=[]):
    """ Checks if the given binary is installed somewhere in PATH, and returns its name.
    args can be a list of arguments for a short output (like -version) """
    try:
        subprocess.Popen([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
    except OSError:
        return False
    return exe


class PagedList(object):
    def __init__(self, pagefunc, pagesize):
        self._pagefunc = pagefunc
        self._pagesize = pagesize

    def __len__(self):
        # This is only useful for tests
        return len(self.getslice())

    def getslice(self, start=0, end=None):
        res = []
        for pagenum in itertools.count(start // self._pagesize):
            firstid = pagenum * self._pagesize
            nextfirstid = pagenum * self._pagesize + self._pagesize
            if start >= nextfirstid:
                continue

            page_results = list(self._pagefunc(pagenum))

            startv = (
                start % self._pagesize
                if firstid <= start < nextfirstid
                else 0)

            endv = (
                ((end - 1) % self._pagesize) + 1
                if (end is not None and firstid <= end <= nextfirstid)
                else None)

            if startv != 0 or endv is not None:
                page_results = page_results[startv:endv]
            res.extend(page_results)

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            if len(page_results) + startv < self._pagesize:
                break

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == nextfirstid:
                break
        return res
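

# Illustrative usage sketch (not part of the original module; the helpers are
# hypothetical): getslice only fetches the pages that overlap the requested
# range, so pages past the slice are never queried.
def _example_paged_list():
    pl = PagedList(lambda n: list(range(n * 3, min(n * 3 + 3, 8))), 3)
    assert pl.getslice(2, 7) == [2, 3, 4, 5, 6]
    assert len(pl) == 8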


def uppercase_escape(s):
    unicode_escape = codecs.getdecoder('unicode_escape')
    return re.sub(
        r'\\U[0-9a-fA-F]{8}',
        lambda m: unicode_escape(m.group(0))[0],
        s)


try:
    struct.pack(u'!I', 0)
except TypeError:
    # In Python 2.6 (and some 2.7 versions), struct requires a bytes argument
    def struct_pack(spec, *args):
        if isinstance(spec, compat_str):
            spec = spec.encode('ascii')
        return struct.pack(spec, *args)

    def struct_unpack(spec, *args):
        if isinstance(spec, compat_str):
            spec = spec.encode('ascii')
        return struct.unpack(spec, *args)
else:
    struct_pack = struct.pack
    struct_unpack = struct.unpack


def read_batch_urls(batch_fd):
    def fixup(url):
        if not isinstance(url, compat_str):
            url = url.decode('utf-8', 'replace')
        BOM_UTF8 = u'\xef\xbb\xbf'
        if url.startswith(BOM_UTF8):
            url = url[len(BOM_UTF8):]
        url = url.strip()
        if url.startswith(('#', ';', ']')):
            return False
        return url

    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]


def urlencode_postdata(*args, **kargs):
    return compat_urllib_parse.urlencode(*args, **kargs).encode('ascii')


try:
    etree_iter = xml.etree.ElementTree.Element.iter
except AttributeError:  # Python <=2.6
    etree_iter = lambda n: n.findall('.//*')


def parse_xml(s):
    class TreeBuilder(xml.etree.ElementTree.TreeBuilder):
        def doctype(self, name, pubid, system):
            pass  # Ignore doctypes

    parser = xml.etree.ElementTree.XMLParser(target=TreeBuilder())
    kwargs = {'parser': parser} if sys.version_info >= (2, 7) else {}
    tree = xml.etree.ElementTree.XML(s.encode('utf-8'), **kwargs)
    # Fix up XML parser in Python 2.x
    if sys.version_info < (3, 0):
        for n in etree_iter(tree):
            if n.text is not None:
                if not isinstance(n.text, compat_str):
                    n.text = n.text.decode('utf-8')
    return tree


if sys.version_info < (3, 0) and sys.platform == 'win32':
    def compat_getpass(prompt, *args, **kwargs):
        if isinstance(prompt, compat_str):
            prompt = prompt.encode(preferredencoding())
        return getpass.getpass(prompt, *args, **kwargs)
else:
    compat_getpass = getpass.getpass


def strip_jsonp(code):
    return re.sub(r'(?s)^[a-zA-Z0-9_]+\s*\(\s*(.*)\);?\s*?\s*$', r'\1', code)


def js_to_json(code):
    def fix_kv(m):
        key = m.group(2)
        if key.startswith("'"):
            assert key.endswith("'")
            assert '"' not in key
            key = '"%s"' % key[1:-1]
        elif not key.startswith('"'):
            key = '"%s"' % key

        value = m.group(4)
        if value.startswith("'"):
            assert value.endswith("'")
            assert '"' not in value
            value = '"%s"' % value[1:-1]

        return m.group(1) + key + m.group(3) + value

    res = re.sub(r'''(?x)
            ([{,]\s*)
            ("[^"]*"|\'[^\']*\'|[a-z0-9A-Z]+)
            (:\s*)
            ([0-9.]+|true|false|"[^"]*"|\'[^\']*\'|\[|\{)
        ''', fix_kv, code)
    res = re.sub(r',(\s*\])', lambda m: m.group(1), res)
    return res
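

# Illustrative usage sketch (not part of the original module; the helper name
# is hypothetical): unquoted or single-quoted keys and values plus trailing
# commas before ']' are rewritten so json.loads accepts the result.
def _example_js_to_json():
    converted = js_to_json("{abc: 'def', 'ghi': 1, jkl: [1, 2,]}")
    assert json.loads(converted) == {'abc': 'def', 'ghi': 1, 'jkl': [1, 2]}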


def qualities(quality_ids):
    """ Get a numeric quality value out of a list of possible values """
    def q(qid):
        try:
            return quality_ids.index(qid)
        except ValueError:
            return -1
    return q
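

# Illustrative usage sketch (not part of the original module; the helper name
# is hypothetical): higher list positions mean higher quality; unknown ids
# sort below everything else.
def _example_qualities():
    q = qualities(['240p', '360p', '720p'])
    assert q('720p') == 2
    assert q('1080p') == -1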


DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'

try:
    subprocess_check_output = subprocess.check_output
except AttributeError:
    def subprocess_check_output(*args, **kwargs):
        assert 'input' not in kwargs
        p = subprocess.Popen(*args, stdout=subprocess.PIPE, **kwargs)
        output, _ = p.communicate()
        ret = p.wait()
        if ret:
            raise subprocess.CalledProcessError(ret, p.args, output=output)
        return output