2 # -*- coding: utf-8 -*-
29 import xml.etree.ElementTree
33 import urllib.request as compat_urllib_request
34 except ImportError: # Python 2
35 import urllib2 as compat_urllib_request
38 import urllib.error as compat_urllib_error
39 except ImportError: # Python 2
40 import urllib2 as compat_urllib_error
43 import urllib.parse as compat_urllib_parse
44 except ImportError: # Python 2
45 import urllib as compat_urllib_parse
48 from urllib.parse import urlparse as compat_urllib_parse_urlparse
49 except ImportError: # Python 2
50 from urlparse import urlparse as compat_urllib_parse_urlparse
53 import urllib.parse as compat_urlparse
54 except ImportError: # Python 2
55 import urlparse as compat_urlparse
58 import http.cookiejar as compat_cookiejar
59 except ImportError: # Python 2
60 import cookielib as compat_cookiejar
63 import html.entities as compat_html_entities
64 except ImportError: # Python 2
65 import htmlentitydefs as compat_html_entities
68 import html.parser as compat_html_parser
69 except ImportError: # Python 2
70 import HTMLParser as compat_html_parser
73 import http.client as compat_http_client
74 except ImportError: # Python 2
75 import httplib as compat_http_client
78 from urllib.error import HTTPError as compat_HTTPError
79 except ImportError: # Python 2
80 from urllib2 import HTTPError as compat_HTTPError
83 from urllib.request import urlretrieve as compat_urlretrieve
84 except ImportError: # Python 2
85 from urllib import urlretrieve as compat_urlretrieve
89 from subprocess import DEVNULL
90 compat_subprocess_get_DEVNULL = lambda: DEVNULL
92 compat_subprocess_get_DEVNULL = lambda: open(os.path.devnull, 'w')
95 from urllib.parse import unquote as compat_urllib_parse_unquote
97 def compat_urllib_parse_unquote(string, encoding='utf-8', errors='replace'):
100 res = string.split('%')
107 # pct_sequence: contiguous sequence of percent-encoded bytes, decoded
114 pct_sequence += item[:2].decode('hex')
117 # This segment was just a single percent-encoded character.
118 # May be part of a sequence of code units, so delay decoding.
119 # (Stored in pct_sequence).
123 # Encountered non-percent-encoded characters. Flush the current
125 string += pct_sequence.decode(encoding, errors) + rest
128 # Flush the final pct_sequence
129 string += pct_sequence.decode(encoding, errors)
134 from urllib.parse import parse_qs as compat_parse_qs
135 except ImportError: # Python 2
136 # HACK: The following is the correct parse_qs implementation from cpython 3's stdlib.
137 # Python 2's version is apparently totally broken
139 def _parse_qsl(qs, keep_blank_values=False, strict_parsing=False,
140 encoding='utf-8', errors='replace'):
141 qs, _coerce_result = qs, unicode
142 pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')]
144 for name_value in pairs:
145 if not name_value and not strict_parsing:
147 nv = name_value.split('=', 1)
150 raise ValueError("bad query field: %r" % (name_value,))
151 # Handle case of a control-name with no equal sign
152 if keep_blank_values:
156 if len(nv[1]) or keep_blank_values:
157 name = nv[0].replace('+', ' ')
158 name = compat_urllib_parse_unquote(
159 name, encoding=encoding, errors=errors)
160 name = _coerce_result(name)
161 value = nv[1].replace('+', ' ')
162 value = compat_urllib_parse_unquote(
163 value, encoding=encoding, errors=errors)
164 value = _coerce_result(value)
165 r.append((name, value))
168 def compat_parse_qs(qs, keep_blank_values=False, strict_parsing=False,
169 encoding='utf-8', errors='replace'):
171 pairs = _parse_qsl(qs, keep_blank_values, strict_parsing,
172 encoding=encoding, errors=errors)
173 for name, value in pairs:
174 if name in parsed_result:
175 parsed_result[name].append(value)
177 parsed_result[name] = [value]
181 compat_str = unicode # Python 2
186 compat_chr = unichr # Python 2
191 from xml.etree.ElementTree import ParseError as compat_xml_parse_error
192 except ImportError: # Python 2.6
193 from xml.parsers.expat import ExpatError as compat_xml_parse_error
196 from shlex import quote as shlex_quote
197 except ImportError: # Python < 3.3
199 return "'" + s.replace("'", "'\"'\"'") + "'"
203 if type(c) is int: return c
206 # This is not clearly defined otherwise
207 compiled_regex_type = type(re.compile(''))
210 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0 (Chrome)',
211 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
212 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
213 'Accept-Encoding': 'gzip, deflate',
214 'Accept-Language': 'en-us,en;q=0.5',
217 def preferredencoding():
218 """Get preferred encoding.
220 Returns the best encoding scheme for the system, based on
221 locale.getpreferredencoding() and some further tweaks.
224 pref = locale.getpreferredencoding()
231 if sys.version_info < (3,0):
233 print(s.encode(preferredencoding(), 'xmlcharrefreplace'))
236 assert type(s) == type(u'')
240 def write_json_file(obj, fn):
241 """ Encode obj as JSON and write it to fn, atomically """
245 'prefix': os.path.basename(fn) + '.',
246 'dir': os.path.dirname(fn),
250 # In Python 2.x, json.dump expects a bytestream.
251 # In Python 3.x, it writes to a character stream
252 if sys.version_info < (3, 0):
260 tf = tempfile.NamedTemporaryFile(**args)
265 os.rename(tf.name, fn)
274 if sys.version_info >= (2, 7):
def find_xpath_attr(node, xpath, key, val):
    """Locate the first element matching xpath[@key=val], or None."""
    # Only simple attribute names and values are supported here; the
    # asserts keep us from silently building a broken XPath expression.
    assert re.match(r'^[a-zA-Z-]+$', key)
    assert re.match(r'^[a-zA-Z0-9@\s:._-]*$', val)
    query = u"%s[@%s='%s']" % (xpath, key, val)
    return node.find(query)
282 def find_xpath_attr(node, xpath, key, val):
283 for f in node.findall(xpath):
284 if f.attrib.get(key) == val:
288 # On python2.6 the xml.etree.ElementTree.Element methods don't support
289 # the namespace parameter
290 def xpath_with_ns(path, ns_map):
291 components = [c.split(':') for c in path.split('/')]
295 replaced.append(c[0])
298 replaced.append('{%s}%s' % (ns_map[ns], tag))
299 return '/'.join(replaced)
302 compat_html_parser.locatestarttagend = re.compile(r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re.VERBOSE) # backport bugfix
303 class BaseHTMLParser(compat_html_parser.HTMLParser):
305 compat_html_parser.HTMLParser.__init__(self)
308 def loads(self, html):
313 class AttrParser(BaseHTMLParser):
314 """Modified HTMLParser that isolates a tag with the specified attribute"""
315 def __init__(self, attribute, value):
316 self.attribute = attribute
321 self.watch_startpos = False
323 BaseHTMLParser.__init__(self)
325 def error(self, message):
326 if self.error_count > 10 or self.started:
327 raise compat_html_parser.HTMLParseError(message, self.getpos())
328 self.rawdata = '\n'.join(self.html.split('\n')[self.getpos()[0]:]) # skip one line
329 self.error_count += 1
332 def handle_starttag(self, tag, attrs):
335 self.find_startpos(None)
336 if self.attribute in attrs and attrs[self.attribute] == self.value:
339 self.watch_startpos = True
341 if not tag in self.depth: self.depth[tag] = 0
344 def handle_endtag(self, tag):
346 if tag in self.depth: self.depth[tag] -= 1
347 if self.depth[self.result[0]] == 0:
349 self.result.append(self.getpos())
    def find_startpos(self, x):
        """Needed to put the start position of the result (self.result[1])
        after the opening tag with the requested id"""
        # Only record a position while watch_startpos is armed; it is
        # disarmed immediately so just the first event after the opening
        # tag is captured.
        if self.watch_startpos:
            self.watch_startpos = False
            self.result.append(self.getpos())
    # Any parser event that follows the opening tag marks where its content
    # begins, so every event handler is aliased to find_startpos.
    handle_entityref = handle_charref = handle_data = handle_comment = \
        handle_decl = handle_pi = unknown_decl = find_startpos
360 def get_result(self):
361 if self.result is None:
363 if len(self.result) != 3:
365 lines = self.html.split('\n')
366 lines = lines[self.result[1][0]-1:self.result[2][0]]
367 lines[0] = lines[0][self.result[1][1]:]
369 lines[-1] = lines[-1][:self.result[2][1]-self.result[1][1]]
370 lines[-1] = lines[-1][:self.result[2][1]]
371 return '\n'.join(lines).strip()
372 # Hack for https://github.com/rg3/youtube-dl/issues/662
373 if sys.version_info < (2, 7, 3):
374 AttrParser.parse_endtag = (lambda self, i:
375 i + len("</scr'+'ipt>")
376 if self.rawdata[i:].startswith("</scr'+'ipt>")
377 else compat_html_parser.HTMLParser.parse_endtag(self, i))
def get_element_by_id(id, html):
    """Return the content of the element whose id attribute equals *id*."""
    # An id lookup is just a special case of an attribute lookup.
    return get_element_by_attribute("id", id, html)
383 def get_element_by_attribute(attribute, value, html):
384 """Return the content of the tag with the specified attribute in the passed HTML document"""
385 parser = AttrParser(attribute, value)
388 except compat_html_parser.HTMLParseError:
390 return parser.get_result()
392 class MetaParser(BaseHTMLParser):
394 Modified HTMLParser that isolates a meta tag with the specified name
397 def __init__(self, name):
398 BaseHTMLParser.__init__(self)
403 def handle_starttag(self, tag, attrs):
407 if attrs.get('name') == self.name:
408 self.result = attrs.get('content')
410 def get_result(self):
413 def get_meta_content(name, html):
415 Return the content attribute from the meta tag with the given name attribute.
417 parser = MetaParser(name)
420 except compat_html_parser.HTMLParseError:
422 return parser.get_result()
425 def clean_html(html):
426 """Clean an HTML snippet into a readable string"""
428 html = html.replace('\n', ' ')
429 html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
430 html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
432 html = re.sub('<.*?>', '', html)
433 # Replace html entities
434 html = unescapeHTML(html)
438 def sanitize_open(filename, open_mode):
439 """Try to open the given filename, and slightly tweak it if this fails.
441 Attempts to open the given filename. If this fails, it tries to change
442 the filename slightly, step by step, until it's either able to open it
443 or it fails and raises a final exception, like the standard open()
446 It returns the tuple (stream, definitive_file_name).
450 if sys.platform == 'win32':
452 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
453 return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
454 stream = open(encodeFilename(filename), open_mode)
455 return (stream, filename)
456 except (IOError, OSError) as err:
457 if err.errno in (errno.EACCES,):
460 # In case of error, try to remove win32 forbidden chars
461 alt_filename = os.path.join(
462 re.sub(u'[/<>:"\\|\\\\?\\*]', u'#', path_part)
463 for path_part in os.path.split(filename)
465 if alt_filename == filename:
468 # An exception here should be caught in the caller
469 stream = open(encodeFilename(filename), open_mode)
470 return (stream, alt_filename)
473 def timeconvert(timestr):
474 """Convert RFC 2822 defined time string into system timestamp"""
476 timetuple = email.utils.parsedate_tz(timestr)
477 if timetuple is not None:
478 timestamp = email.utils.mktime_tz(timetuple)
481 def sanitize_filename(s, restricted=False, is_id=False):
482 """Sanitizes a string so it could be used as part of a filename.
483 If restricted is set, use a stricter subset of allowed characters.
484 Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
486 def replace_insane(char):
487 if char == '?' or ord(char) < 32 or ord(char) == 127:
490 return '' if restricted else '\''
492 return '_-' if restricted else ' -'
493 elif char in '\\/|*<>':
495 if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
497 if restricted and ord(char) > 127:
501 result = u''.join(map(replace_insane, s))
503 while '__' in result:
504 result = result.replace('__', '_')
505 result = result.strip('_')
506 # Common case of "Foreign band name - English song title"
507 if restricted and result.startswith('-_'):
513 def orderedSet(iterable):
514 """ Remove all duplicates from the input iterable """
522 def _htmlentity_transform(entity):
523 """Transforms an HTML entity to a character."""
524 # Known non-numeric HTML entity
525 if entity in compat_html_entities.name2codepoint:
526 return compat_chr(compat_html_entities.name2codepoint[entity])
528 mobj = re.match(r'#(x?[0-9]+)', entity)
530 numstr = mobj.group(1)
531 if numstr.startswith(u'x'):
533 numstr = u'0%s' % numstr
536 return compat_chr(int(numstr, base))
538 # Unknown entity in name, return its literal representation
539 return (u'&%s;' % entity)
545 assert type(s) == compat_str
548 r'&([^;]+);', lambda m: _htmlentity_transform(m.group(1)), s)
551 def encodeFilename(s, for_subprocess=False):
553 @param s The name of the file
556 assert type(s) == compat_str
558 # Python 3 has a Unicode API
559 if sys.version_info >= (3, 0):
562 if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
563 # Pass u'' directly to use Unicode APIs on Windows 2000 and up
564 # (Detecting Windows NT 4 is tricky because 'major >= 4' would
565 # match Windows 9x series as well. Besides, NT 4 is obsolete.)
566 if not for_subprocess:
569 # For subprocess calls, encode with locale encoding
570 # Refer to http://stackoverflow.com/a/9951851/35070
571 encoding = preferredencoding()
573 encoding = sys.getfilesystemencoding()
576 return s.encode(encoding, 'ignore')
def encodeArgument(s):
    """Encode a subprocess argument, tolerating legacy byte strings."""
    arg = s
    if not isinstance(arg, compat_str):
        # Legacy code that uses byte strings
        # Uncomment the following line after fixing all post processors
        #assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
        arg = arg.decode('ascii')
    return encodeFilename(arg, True)
588 def decodeOption(optval):
591 if isinstance(optval, bytes):
592 optval = optval.decode(preferredencoding())
594 assert isinstance(optval, compat_str)
597 def formatSeconds(secs):
599 return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
601 return '%d:%02d' % (secs // 60, secs % 60)
606 def make_HTTPS_handler(opts_no_check_certificate, **kwargs):
607 if sys.version_info < (3, 2):
610 class HTTPSConnectionV3(httplib.HTTPSConnection):
611 def __init__(self, *args, **kwargs):
612 httplib.HTTPSConnection.__init__(self, *args, **kwargs)
615 sock = socket.create_connection((self.host, self.port), self.timeout)
616 if getattr(self, '_tunnel_host', False):
620 self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_SSLv3)
622 self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_SSLv23)
624 class HTTPSHandlerV3(compat_urllib_request.HTTPSHandler):
625 def https_open(self, req):
626 return self.do_open(HTTPSConnectionV3, req)
627 return HTTPSHandlerV3(**kwargs)
629 context = ssl.SSLContext(ssl.PROTOCOL_SSLv3)
630 context.verify_mode = (ssl.CERT_NONE
631 if opts_no_check_certificate
632 else ssl.CERT_REQUIRED)
633 context.set_default_verify_paths()
635 context.load_default_certs()
636 except AttributeError:
638 return compat_urllib_request.HTTPSHandler(context=context, **kwargs)
640 class ExtractorError(Exception):
641 """Error during info extraction."""
642 def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
643 """ tb, if given, is the original traceback (so that it can be printed out).
644 If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
647 if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
649 if video_id is not None:
650 msg = video_id + ': ' + msg
652 msg = msg + u'; please report this issue on https://yt-dl.org/bug . Be sure to call youtube-dl with the --verbose flag and include its complete output. Make sure you are using the latest version; type youtube-dl -U to update.'
653 super(ExtractorError, self).__init__(msg)
656 self.exc_info = sys.exc_info() # preserve original exception
658 self.video_id = video_id
660 def format_traceback(self):
661 if self.traceback is None:
663 return u''.join(traceback.format_tb(self.traceback))
666 class RegexNotFoundError(ExtractorError):
667 """Error when a regex didn't match"""
671 class DownloadError(Exception):
672 """Download Error exception.
674 This exception may be thrown by FileDownloader objects if they are not
675 configured to continue on errors. They will contain the appropriate
678 def __init__(self, msg, exc_info=None):
679 """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
680 super(DownloadError, self).__init__(msg)
681 self.exc_info = exc_info
684 class SameFileError(Exception):
685 """Same File exception.
687 This exception will be thrown by FileDownloader objects if they detect
688 multiple files would have to be downloaded to the same file on disk.
693 class PostProcessingError(Exception):
694 """Post Processing exception.
696 This exception may be raised by PostProcessor's .run() method to
697 indicate an error in the postprocessing task.
699 def __init__(self, msg):
702 class MaxDownloadsReached(Exception):
703 """ --max-downloads limit has been reached. """
707 class UnavailableVideoError(Exception):
708 """Unavailable Format exception.
710 This exception will be thrown when a video is requested
711 in a format that is not available for that video.
716 class ContentTooShortError(Exception):
717 """Content Too Short exception.
719 This exception may be raised by FileDownloader objects when a file they
720 download is too small for what the server announced first, indicating
721 the connection was probably interrupted.
727 def __init__(self, downloaded, expected):
728 self.downloaded = downloaded
729 self.expected = expected
731 class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
732 """Handler for HTTP requests and responses.
734 This class, when installed with an OpenerDirector, automatically adds
735 the standard headers to every HTTP request and handles gzipped and
736 deflated responses from web servers. If compression is to be avoided in
737 a particular request, the original request in the program code only has
738 to include the HTTP header "Youtubedl-No-Compression", which will be
739 removed before making the real request.
741 Part of this code was copied from:
743 http://techknack.net/python-urllib2-handlers/
745 Andrew Rowls, the author of that code, agreed to release it to the
752 return zlib.decompress(data, -zlib.MAX_WBITS)
754 return zlib.decompress(data)
757 def addinfourl_wrapper(stream, headers, url, code):
758 if hasattr(compat_urllib_request.addinfourl, 'getcode'):
759 return compat_urllib_request.addinfourl(stream, headers, url, code)
760 ret = compat_urllib_request.addinfourl(stream, headers, url)
764 def http_request(self, req):
765 for h, v in std_headers.items():
766 if h not in req.headers:
768 if 'Youtubedl-no-compression' in req.headers:
769 if 'Accept-encoding' in req.headers:
770 del req.headers['Accept-encoding']
771 del req.headers['Youtubedl-no-compression']
772 if 'Youtubedl-user-agent' in req.headers:
773 if 'User-agent' in req.headers:
774 del req.headers['User-agent']
775 req.headers['User-agent'] = req.headers['Youtubedl-user-agent']
776 del req.headers['Youtubedl-user-agent']
779 def http_response(self, req, resp):
782 if resp.headers.get('Content-encoding', '') == 'gzip':
783 content = resp.read()
784 gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
786 uncompressed = io.BytesIO(gz.read())
787 except IOError as original_ioerror:
788 # There may be junk add the end of the file
789 # See http://stackoverflow.com/q/4928560/35070 for details
790 for i in range(1, 1024):
792 gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
793 uncompressed = io.BytesIO(gz.read())
798 raise original_ioerror
799 resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
800 resp.msg = old_resp.msg
802 if resp.headers.get('Content-encoding', '') == 'deflate':
803 gz = io.BytesIO(self.deflate(resp.read()))
804 resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
805 resp.msg = old_resp.msg
808 https_request = http_request
809 https_response = http_response
812 def parse_iso8601(date_str, delimiter='T'):
813 """ Return a UNIX timestamp from the given date """
819 r'Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$',
822 timezone = datetime.timedelta()
824 date_str = date_str[:-len(m.group(0))]
825 if not m.group('sign'):
826 timezone = datetime.timedelta()
828 sign = 1 if m.group('sign') == '+' else -1
829 timezone = datetime.timedelta(
830 hours=sign * int(m.group('hours')),
831 minutes=sign * int(m.group('minutes')))
832 date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
833 dt = datetime.datetime.strptime(date_str, date_format) - timezone
834 return calendar.timegm(dt.timetuple())
837 def unified_strdate(date_str):
838 """Return a string with the date in the format YYYYMMDD"""
845 date_str = date_str.replace(',', ' ')
846 # %z (UTC offset) is only supported in python>=3.2
847 date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str)
848 format_expressions = [
853 '%b %dst %Y %I:%M%p',
854 '%b %dnd %Y %I:%M%p',
855 '%b %dth %Y %I:%M%p',
865 '%Y-%m-%dT%H:%M:%SZ',
866 '%Y-%m-%dT%H:%M:%S.%fZ',
867 '%Y-%m-%dT%H:%M:%S.%f0Z',
869 '%Y-%m-%dT%H:%M:%S.%f',
872 for expression in format_expressions:
874 upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
877 if upload_date is None:
878 timetuple = email.utils.parsedate_tz(date_str)
880 upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
883 def determine_ext(url, default_ext=u'unknown_video'):
886 guess = url.partition(u'?')[0].rpartition(u'.')[2]
887 if re.match(r'^[A-Za-z0-9]+$', guess):
def subtitles_filename(filename, sub_lang, sub_format):
    """Build the subtitle file name: <base>.<lang>.<format>."""
    base = filename.rsplit('.', 1)[0]
    return u'.'.join((base, sub_lang, sub_format))
895 def date_from_str(date_str):
897 Return a datetime object from a string in the format YYYYMMDD or
898 (now|today)[+-][0-9](day|week|month|year)(s)?"""
899 today = datetime.date.today()
900 if date_str == 'now'or date_str == 'today':
902 match = re.match('(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
903 if match is not None:
904 sign = match.group('sign')
905 time = int(match.group('time'))
908 unit = match.group('unit')
917 delta = datetime.timedelta(**{unit: time})
919 return datetime.datetime.strptime(date_str, "%Y%m%d").date()
921 def hyphenate_date(date_str):
923 Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
924 match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
925 if match is not None:
926 return '-'.join(match.groups())
930 class DateRange(object):
931 """Represents a time interval between two dates"""
932 def __init__(self, start=None, end=None):
933 """start and end must be strings in the format accepted by date"""
934 if start is not None:
935 self.start = date_from_str(start)
937 self.start = datetime.datetime.min.date()
939 self.end = date_from_str(end)
941 self.end = datetime.datetime.max.date()
942 if self.start > self.end:
943 raise ValueError('Date range: "%s" , the start date must be before the end date' % self)
946 """Returns a range that only contains the given day"""
948 def __contains__(self, date):
949 """Check if the date is in the range"""
950 if not isinstance(date, datetime.date):
951 date = date_from_str(date)
952 return self.start <= date <= self.end
954 return '%s - %s' % ( self.start.isoformat(), self.end.isoformat())
958 """ Returns the platform name as a compat_str """
959 res = platform.platform()
960 if isinstance(res, bytes):
961 res = res.decode(preferredencoding())
963 assert isinstance(res, compat_str)
967 def _windows_write_string(s, out):
968 """ Returns True if the string was written using special methods,
969 False if it has yet to be written out."""
970 # Adapted from http://stackoverflow.com/a/3259271/35070
973 import ctypes.wintypes
981 fileno = out.fileno()
982 except AttributeError:
983 # If the output stream doesn't have a fileno, it's virtual
985 if fileno not in WIN_OUTPUT_IDS:
988 GetStdHandle = ctypes.WINFUNCTYPE(
989 ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
990 ("GetStdHandle", ctypes.windll.kernel32))
991 h = GetStdHandle(WIN_OUTPUT_IDS[fileno])
993 WriteConsoleW = ctypes.WINFUNCTYPE(
994 ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
995 ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
996 ctypes.wintypes.LPVOID)(("WriteConsoleW", ctypes.windll.kernel32))
997 written = ctypes.wintypes.DWORD(0)
999 GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)(("GetFileType", ctypes.windll.kernel32))
1000 FILE_TYPE_CHAR = 0x0002
1001 FILE_TYPE_REMOTE = 0x8000
1002 GetConsoleMode = ctypes.WINFUNCTYPE(
1003 ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
1004 ctypes.POINTER(ctypes.wintypes.DWORD))(
1005 ("GetConsoleMode", ctypes.windll.kernel32))
1006 INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value
1008 def not_a_console(handle):
1009 if handle == INVALID_HANDLE_VALUE or handle is None:
1011 return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR
1012 or GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)
1014 if not_a_console(h):
1017 def next_nonbmp_pos(s):
1019 return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
1020 except StopIteration:
1024 count = min(next_nonbmp_pos(s), 1024)
1026 ret = WriteConsoleW(
1027 h, s, count if count else 2, ctypes.byref(written), None)
1029 raise OSError('Failed to write string')
1030 if not count: # We just wrote a non-BMP character
1031 assert written.value == 2
1034 assert written.value > 0
1035 s = s[written.value:]
1039 def write_string(s, out=None, encoding=None):
1042 assert type(s) == compat_str
1044 if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
1045 if _windows_write_string(s, out):
1048 if ('b' in getattr(out, 'mode', '') or
1049 sys.version_info[0] < 3): # Python 2 lies about mode of sys.stderr
1050 byt = s.encode(encoding or preferredencoding(), 'ignore')
1052 elif hasattr(out, 'buffer'):
1053 enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
1054 byt = s.encode(enc, 'ignore')
1055 out.buffer.write(byt)
1061 def bytes_to_intlist(bs):
1064 if isinstance(bs[0], int): # Python 3
1067 return [ord(c) for c in bs]
1070 def intlist_to_bytes(xs):
1073 if isinstance(chr(0), bytes): # Python 2
1074 return ''.join([chr(x) for x in xs])
def get_cachedir(params=None):
    """Return the cache directory youtube-dl should use.

    Order of precedence: the 'cachedir' entry in params, then
    $XDG_CACHE_HOME/youtube-dl, falling back to ~/.cache/youtube-dl.
    """
    # A mutable default argument ({}) is shared across calls; use None
    # as the default and substitute a fresh dict instead.
    if params is None:
        params = {}
    cache_root = os.environ.get('XDG_CACHE_HOME',
                                os.path.expanduser('~/.cache'))
    return params.get('cachedir', os.path.join(cache_root, 'youtube-dl'))
1085 # Cross-platform file locking
1086 if sys.platform == 'win32':
1087 import ctypes.wintypes
1090 class OVERLAPPED(ctypes.Structure):
1092 ('Internal', ctypes.wintypes.LPVOID),
1093 ('InternalHigh', ctypes.wintypes.LPVOID),
1094 ('Offset', ctypes.wintypes.DWORD),
1095 ('OffsetHigh', ctypes.wintypes.DWORD),
1096 ('hEvent', ctypes.wintypes.HANDLE),
1099 kernel32 = ctypes.windll.kernel32
1100 LockFileEx = kernel32.LockFileEx
1101 LockFileEx.argtypes = [
1102 ctypes.wintypes.HANDLE, # hFile
1103 ctypes.wintypes.DWORD, # dwFlags
1104 ctypes.wintypes.DWORD, # dwReserved
1105 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
1106 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
1107 ctypes.POINTER(OVERLAPPED) # Overlapped
1109 LockFileEx.restype = ctypes.wintypes.BOOL
1110 UnlockFileEx = kernel32.UnlockFileEx
1111 UnlockFileEx.argtypes = [
1112 ctypes.wintypes.HANDLE, # hFile
1113 ctypes.wintypes.DWORD, # dwReserved
1114 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
1115 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
1116 ctypes.POINTER(OVERLAPPED) # Overlapped
1118 UnlockFileEx.restype = ctypes.wintypes.BOOL
1119 whole_low = 0xffffffff
1120 whole_high = 0x7fffffff
def _lock_file(f, exclusive):
    # Lock the whole file via the Win32 LockFileEx API.
    # Zero offsets: the lock region starts at the beginning of the file.
    overlapped = OVERLAPPED()
    overlapped.Offset = 0
    overlapped.OffsetHigh = 0
    overlapped.hEvent = 0
    # Stash a pointer on the file object so the matching _unlock_file
    # call can reuse the very same OVERLAPPED structure.
    f._lock_file_overlapped_p = ctypes.pointer(overlapped)
    handle = msvcrt.get_osfhandle(f.fileno())
    # 0x2 == LOCKFILE_EXCLUSIVE_LOCK; 0x0 requests a shared lock.
    if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
                      whole_low, whole_high, f._lock_file_overlapped_p):
        raise OSError('Locking file failed: %r' % ctypes.FormatError())
def _unlock_file(f):
    # Release the whole-file lock taken by _lock_file (Win32).
    # Requires the OVERLAPPED pointer stashed by _lock_file.
    assert f._lock_file_overlapped_p
    handle = msvcrt.get_osfhandle(f.fileno())
    if not UnlockFileEx(handle, 0,
                        whole_low, whole_high, f._lock_file_overlapped_p):
        raise OSError('Unlocking file failed: %r' % ctypes.FormatError())
1143 def _lock_file(f, exclusive):
1144 fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)
1146 def _unlock_file(f):
1147 fcntl.flock(f, fcntl.LOCK_UN)
1150 class locked_file(object):
1151 def __init__(self, filename, mode, encoding=None):
1152 assert mode in ['r', 'a', 'w']
1153 self.f = io.open(filename, mode, encoding=encoding)
1156 def __enter__(self):
1157 exclusive = self.mode != 'r'
1159 _lock_file(self.f, exclusive)
1165 def __exit__(self, etype, value, traceback):
1167 _unlock_file(self.f)
1174 def write(self, *args):
1175 return self.f.write(*args)
1177 def read(self, *args):
1178 return self.f.read(*args)
1181 def shell_quote(args):
1183 encoding = sys.getfilesystemencoding()
1184 if encoding is None:
1187 if isinstance(a, bytes):
1188 # We may get a filename encoded with 'encodeFilename'
1189 a = a.decode(encoding)
1190 quoted_args.append(pipes.quote(a))
1191 return u' '.join(quoted_args)
1194 def takewhile_inclusive(pred, seq):
1195 """ Like itertools.takewhile, but include the latest evaluated element
1196 (the first element so that Not pred(e)) """
1203 def smuggle_url(url, data):
1204 """ Pass additional data in a URL for internal use. """
1206 sdata = compat_urllib_parse.urlencode(
1207 {u'__youtubedl_smuggle': json.dumps(data)})
1208 return url + u'#' + sdata
1211 def unsmuggle_url(smug_url, default=None):
1212 if not '#__youtubedl_smuggle' in smug_url:
1213 return smug_url, default
1214 url, _, sdata = smug_url.rpartition(u'#')
1215 jsond = compat_parse_qs(sdata)[u'__youtubedl_smuggle'][0]
1216 data = json.loads(jsond)
1220 def format_bytes(bytes):
1223 if type(bytes) is str:
1224 bytes = float(bytes)
1228 exponent = int(math.log(bytes, 1024.0))
1229 suffix = [u'B', u'KiB', u'MiB', u'GiB', u'TiB', u'PiB', u'EiB', u'ZiB', u'YiB'][exponent]
1230 converted = float(bytes) / float(1024 ** exponent)
1231 return u'%.2f%s' % (converted, suffix)
1234 def get_term_width():
1235 columns = os.environ.get('COLUMNS', None)
1240 sp = subprocess.Popen(
1242 stdout=subprocess.PIPE, stderr=subprocess.PIPE)
1243 out, err = sp.communicate()
1244 return int(out.split()[1])
1250 def month_by_name(name):
1251 """ Return the number of a month by (locale-independently) English name """
1254 u'January', u'February', u'March', u'April', u'May', u'June',
1255 u'July', u'August', u'September', u'October', u'November', u'December']
1257 return ENGLISH_NAMES.index(name) + 1
1262 def fix_xml_ampersands(xml_str):
1263 """Replace all the '&' by '&' in XML"""
1265 r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
1270 def setproctitle(title):
1271 assert isinstance(title, compat_str)
1273 libc = ctypes.cdll.LoadLibrary("libc.so.6")
1276 title_bytes = title.encode('utf-8')
1277 buf = ctypes.create_string_buffer(len(title_bytes))
1278 buf.value = title_bytes
1280 libc.prctl(15, buf, 0, 0, 0)
1281 except AttributeError:
1282 return # Strange libc, just skip this
1285 def remove_start(s, start):
1286 if s.startswith(start):
1287 return s[len(start):]
1291 def remove_end(s, end):
1293 return s[:-len(end)]
def url_basename(url):
    """Return the last path component of a URL (query/fragment excluded)."""
    parsed_path = compat_urlparse.urlparse(url).path
    return parsed_path.rstrip(u'/').rpartition(u'/')[2]
1302 class HEADRequest(compat_urllib_request.Request):
1303 def get_method(self):
1307 def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
1310 v = getattr(v, get_attr, None)
1313 return default if v is None else (int(v) * invscale // scale)
def str_or_none(v, default=None):
    """Coerce v to compat_str, or return default when v is None."""
    if v is None:
        return default
    return compat_str(v)
def str_to_int(int_str):
    """ A more relaxed version of int_or_none """
    if int_str is None:
        return None
    # Drop thousands separators, e.g. '1,234' or '1.000.000'
    int_str = re.sub(r'[,\.]', u'', int_str)
    return int(int_str)
def float_or_none(v, scale=1, invscale=1, default=None):
    """Convert v to float scaled by invscale/scale, or `default` when v is None."""
    if v is None:
        return default
    return float(v) * invscale / scale
def parse_duration(s):
    """Parse a duration string such as '9:12:43', '1h30m10s' or '12.5s'
    into a number of seconds (int, or float when fractional).

    Returns None for None input or an unrecognized format.
    """
    if s is None:
        return None

    m = re.match(
        r'(?:(?:(?P<hours>[0-9]+)\s*(?:[:h]|hours?)\s*)?(?P<mins>[0-9]+)\s*(?:[:m]|mins?|minutes?)\s*)?(?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*(?:s|secs?|seconds?)?$', s)
    if m is None:
        return None
    res = int(m.group('secs'))
    if m.group('mins'):
        res += int(m.group('mins')) * 60
        # Hours can only appear when minutes did (see the regex nesting)
        if m.group('hours'):
            res += int(m.group('hours')) * 60 * 60
    if m.group('ms'):
        # Fractional seconds promote the result to float
        res += float(m.group('ms'))
    return res
def prepend_extension(filename, ext):
    """Insert `ext` in front of the real extension: ('a.mp4', 'temp') -> 'a.temp.mp4'."""
    base, real_ext = os.path.splitext(filename)
    return u'%s.%s%s' % (base, ext, real_ext)
def check_executable(exe, args=None):
    """ Checks if the given binary is installed somewhere in PATH, and returns its name.
    args can be a list of arguments for a short output (like -version) """
    # Default is None instead of a mutable [] to avoid the shared-default pitfall
    if args is None:
        args = []
    try:
        subprocess.Popen([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
    except OSError:
        # Binary not found (or not executable)
        return False
    return exe
class PagedList(object):
    """A lazily-evaluated list backed by a page-fetching function.

    `pagefunc(pagenum)` must return the (up to `pagesize`) items of that page.
    """
    def __init__(self, pagefunc, pagesize):
        self._pagefunc = pagefunc
        self._pagesize = pagesize

    def __len__(self):
        # This is only useful for tests
        return len(self.getslice())

    def getslice(self, start=0, end=None):
        """Return items in [start, end) by fetching only the pages needed."""
        res = []
        for pagenum in itertools.count(start // self._pagesize):
            firstid = pagenum * self._pagesize
            nextfirstid = pagenum * self._pagesize + self._pagesize
            if start >= nextfirstid:
                continue

            page_results = list(self._pagefunc(pagenum))

            startv = (
                start % self._pagesize
                if firstid <= start < nextfirstid
                else 0)

            endv = (
                ((end - 1) % self._pagesize) + 1
                if (end is not None and firstid <= end <= nextfirstid)
                else None)

            if startv != 0 or endv is not None:
                page_results = page_results[startv:endv]
            res.extend(page_results)

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            if len(page_results) + startv < self._pagesize:
                break

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == nextfirstid:
                break
        return res
def uppercase_escape(s):
    """Decode literal '\\UXXXXXXXX' escape sequences in s into their characters."""
    unicode_escape = codecs.getdecoder('unicode_escape')
    # Only full 8-hex-digit \U escapes are decoded; everything else is untouched
    return re.sub(
        r'\\U[0-9a-fA-F]{8}',
        lambda m: unicode_escape(m.group(0))[0],
        s)
try:
    struct.pack(u'!I', 0)
except TypeError:
    # In Python 2.6 (and some 2.7 versions), struct requires a bytes argument
    def struct_pack(spec, *args):
        if isinstance(spec, compat_str):
            spec = spec.encode('ascii')
        return struct.pack(spec, *args)

    def struct_unpack(spec, *args):
        if isinstance(spec, compat_str):
            spec = spec.encode('ascii')
        return struct.unpack(spec, *args)
else:
    # struct accepts text format strings natively; use it directly
    struct_pack = struct.pack
    struct_unpack = struct.unpack
def read_batch_urls(batch_fd):
    """Read a batch file object and return its list of URLs.

    Skips comment lines (starting with '#', ';' or ']'), strips a leading
    UTF-8 BOM and whitespace, and closes the file object when done.
    """
    def fixup(url):
        if not isinstance(url, compat_str):
            url = url.decode('utf-8', 'replace')
        BOM_UTF8 = u'\xef\xbb\xbf'
        if url.startswith(BOM_UTF8):
            url = url[len(BOM_UTF8):]
        url = url.strip()
        if url.startswith(('#', ';', ']')):
            return False
        return url

    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]
def urlencode_postdata(*args, **kargs):
    """URL-encode POST data and return it as ASCII bytes suitable for urllib."""
    encoded = compat_urllib_parse.urlencode(*args, **kargs)
    return encoded.encode('ascii')
try:
    # Element.iter exists on Python >= 2.7 / 3.x and includes the root node
    etree_iter = xml.etree.ElementTree.Element.iter
except AttributeError:  # Python <=2.6
    etree_iter = lambda n: n.findall('.//*')
def parse_xml(s):
    """Parse the XML text `s` into an ElementTree element, ignoring doctypes."""
    class TreeBuilder(xml.etree.ElementTree.TreeBuilder):
        def doctype(self, name, pubid, system):
            pass  # Ignore doctypes

    parser = xml.etree.ElementTree.XMLParser(target=TreeBuilder())
    # The `parser` keyword only works on Python >= 2.7
    kwargs = {'parser': parser} if sys.version_info >= (2, 7) else {}
    tree = xml.etree.ElementTree.XML(s.encode('utf-8'), **kwargs)
    # Fix up XML parser in Python 2.x: promote byte-string text to unicode
    if sys.version_info < (3, 0):
        for n in etree_iter(tree):
            if n.text is not None:
                if not isinstance(n.text, compat_str):
                    n.text = n.text.decode('utf-8')
    return tree
if sys.version_info < (3, 0) and sys.platform == 'win32':
    # Python 2 on Windows chokes on unicode prompts; encode them first
    def compat_getpass(prompt, *args, **kwargs):
        if isinstance(prompt, compat_str):
            prompt = prompt.encode(preferredencoding())
        return getpass.getpass(prompt, *args, **kwargs)
else:
    compat_getpass = getpass.getpass
def strip_jsonp(code):
    """Strip a JSONP callback wrapper, returning the bare JSON payload."""
    jsonp_wrapper = re.compile(r'(?s)^[a-zA-Z0-9_]+\s*\(\s*(.*)\);?\s*?\s*$')
    return jsonp_wrapper.sub(r'\1', code)
def js_to_json(code):
    """Convert simple JavaScript object literals into valid JSON text.

    Quotes bare and single-quoted keys/values and removes trailing commas
    before ']'. Not a full JS parser - only handles the common literal forms.
    """
    def fix_kv(m):
        key = m.group(2)
        if key.startswith("'"):
            assert key.endswith("'")
            assert '"' not in key
            key = '"%s"' % key[1:-1]
        elif not key.startswith('"'):
            # Bare identifier key: wrap it in double quotes
            key = '"%s"' % key

        value = m.group(4)
        if value.startswith("'"):
            assert value.endswith("'")
            assert '"' not in value
            value = '"%s"' % value[1:-1]

        return m.group(1) + key + m.group(3) + value

    res = re.sub(r'''(?x)
            ([{,]\s*)
            ("[^"]*"|\'[^\']*\'|[a-z0-9A-Z]+)
            (:\s*)
            ([0-9.]+|true|false|"[^"]*"|\'[^\']*\'|\[|\{)
        ''', fix_kv, code)
    # Drop trailing commas before a closing bracket
    res = re.sub(r',(\s*\])', lambda m: m.group(1), res)
    return res
def qualities(quality_ids):
    """ Get a numeric quality value out of a list of possible values """
    def q(qid):
        try:
            return quality_ids.index(qid)
        except ValueError:
            # Unknown quality ranks below every known one
            return -1
    return q
# Default output filename template: '<title>-<id>.<ext>'
DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
try:
    subprocess_check_output = subprocess.check_output
except AttributeError:
    # Python 2.6 has no subprocess.check_output; emulate the essential bits
    def subprocess_check_output(*args, **kwargs):
        assert 'input' not in kwargs
        p = subprocess.Popen(*args, stdout=subprocess.PIPE, **kwargs)
        output, _ = p.communicate()
        ret = p.wait()
        if ret:
            raise subprocess.CalledProcessError(ret, p.args, output=output)
        return output