2 # -*- coding: utf-8 -*-
29 import xml.etree.ElementTree
33 import urllib.request as compat_urllib_request
34 except ImportError: # Python 2
35 import urllib2 as compat_urllib_request
38 import urllib.error as compat_urllib_error
39 except ImportError: # Python 2
40 import urllib2 as compat_urllib_error
43 import urllib.parse as compat_urllib_parse
44 except ImportError: # Python 2
45 import urllib as compat_urllib_parse
48 from urllib.parse import urlparse as compat_urllib_parse_urlparse
49 except ImportError: # Python 2
50 from urlparse import urlparse as compat_urllib_parse_urlparse
53 import urllib.parse as compat_urlparse
54 except ImportError: # Python 2
55 import urlparse as compat_urlparse
58 import http.cookiejar as compat_cookiejar
59 except ImportError: # Python 2
60 import cookielib as compat_cookiejar
63 import html.entities as compat_html_entities
64 except ImportError: # Python 2
65 import htmlentitydefs as compat_html_entities
68 import html.parser as compat_html_parser
69 except ImportError: # Python 2
70 import HTMLParser as compat_html_parser
73 import http.client as compat_http_client
74 except ImportError: # Python 2
75 import httplib as compat_http_client
78 from urllib.error import HTTPError as compat_HTTPError
79 except ImportError: # Python 2
80 from urllib2 import HTTPError as compat_HTTPError
83 from urllib.request import urlretrieve as compat_urlretrieve
84 except ImportError: # Python 2
85 from urllib import urlretrieve as compat_urlretrieve
89 from subprocess import DEVNULL
90 compat_subprocess_get_DEVNULL = lambda: DEVNULL
92 compat_subprocess_get_DEVNULL = lambda: open(os.path.devnull, 'w')
95 from urllib.parse import unquote as compat_urllib_parse_unquote
97 def compat_urllib_parse_unquote(string, encoding='utf-8', errors='replace'):
100 res = string.split('%')
107 # pct_sequence: contiguous sequence of percent-encoded bytes, decoded
114 pct_sequence += item[:2].decode('hex')
117 # This segment was just a single percent-encoded character.
118 # May be part of a sequence of code units, so delay decoding.
119 # (Stored in pct_sequence).
123 # Encountered non-percent-encoded characters. Flush the current
125 string += pct_sequence.decode(encoding, errors) + rest
128 # Flush the final pct_sequence
129 string += pct_sequence.decode(encoding, errors)
134 from urllib.parse import parse_qs as compat_parse_qs
135 except ImportError: # Python 2
136 # HACK: The following is the correct parse_qs implementation from cpython 3's stdlib.
137 # Python 2's version is apparently totally broken
139 def _parse_qsl(qs, keep_blank_values=False, strict_parsing=False,
140 encoding='utf-8', errors='replace'):
141 qs, _coerce_result = qs, unicode
142 pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')]
144 for name_value in pairs:
145 if not name_value and not strict_parsing:
147 nv = name_value.split('=', 1)
150 raise ValueError("bad query field: %r" % (name_value,))
151 # Handle case of a control-name with no equal sign
152 if keep_blank_values:
156 if len(nv[1]) or keep_blank_values:
157 name = nv[0].replace('+', ' ')
158 name = compat_urllib_parse_unquote(
159 name, encoding=encoding, errors=errors)
160 name = _coerce_result(name)
161 value = nv[1].replace('+', ' ')
162 value = compat_urllib_parse_unquote(
163 value, encoding=encoding, errors=errors)
164 value = _coerce_result(value)
165 r.append((name, value))
168 def compat_parse_qs(qs, keep_blank_values=False, strict_parsing=False,
169 encoding='utf-8', errors='replace'):
171 pairs = _parse_qsl(qs, keep_blank_values, strict_parsing,
172 encoding=encoding, errors=errors)
173 for name, value in pairs:
174 if name in parsed_result:
175 parsed_result[name].append(value)
177 parsed_result[name] = [value]
181 compat_str = unicode # Python 2
186 compat_chr = unichr # Python 2
191 from xml.etree.ElementTree import ParseError as compat_xml_parse_error
192 except ImportError: # Python 2.6
193 from xml.parsers.expat import ExpatError as compat_xml_parse_error
196 if type(c) is int: return c
199 # This is not clearly defined otherwise
200 compiled_regex_type = type(re.compile(''))
203 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0 (Chrome)',
204 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
205 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
206 'Accept-Encoding': 'gzip, deflate',
207 'Accept-Language': 'en-us,en;q=0.5',
210 def preferredencoding():
211 """Get preferred encoding.
213 Returns the best encoding scheme for the system, based on
214 locale.getpreferredencoding() and some further tweaks.
217 pref = locale.getpreferredencoding()
224 if sys.version_info < (3,0):
226 print(s.encode(preferredencoding(), 'xmlcharrefreplace'))
229 assert type(s) == type(u'')
233 def write_json_file(obj, fn):
234 """ Encode obj as JSON and write it to fn, atomically """
238 'prefix': os.path.basename(fn) + '.',
239 'dir': os.path.dirname(fn),
243 # In Python 2.x, json.dump expects a bytestream.
244 # In Python 3.x, it writes to a character stream
245 if sys.version_info < (3, 0):
253 tf = tempfile.NamedTemporaryFile(**args)
258 os.rename(tf.name, fn)
267 if sys.version_info >= (2, 7):
def find_xpath_attr(node, xpath, key, val):
    """Locate the first element matching xpath[@key=val], or None."""
    # Only plain attribute names and simple values are supported here;
    # anything fancier would need real XPath quoting.
    assert re.match(r'^[a-zA-Z-]+$', key)
    assert re.match(r'^[a-zA-Z0-9@\s:._-]*$', val)
    return node.find(u"%s[@%s='%s']" % (xpath, key, val))
275 def find_xpath_attr(node, xpath, key, val):
276 for f in node.findall(xpath):
277 if f.attrib.get(key) == val:
281 # On python2.6 the xml.etree.ElementTree.Element methods don't support
282 # the namespace parameter
283 def xpath_with_ns(path, ns_map):
284 components = [c.split(':') for c in path.split('/')]
288 replaced.append(c[0])
291 replaced.append('{%s}%s' % (ns_map[ns], tag))
292 return '/'.join(replaced)
294 def htmlentity_transform(matchobj):
295 """Transforms an HTML entity to a character.
297 This function receives a match object and is intended to be used with
298 the re.sub() function.
300 entity = matchobj.group(1)
302 # Known non-numeric HTML entity
303 if entity in compat_html_entities.name2codepoint:
304 return compat_chr(compat_html_entities.name2codepoint[entity])
306 mobj = re.match(u'(?u)#(x?\\d+)', entity)
308 numstr = mobj.group(1)
309 if numstr.startswith(u'x'):
311 numstr = u'0%s' % numstr
314 return compat_chr(int(numstr, base))
316 # Unknown entity in name, return its literal representation
317 return (u'&%s;' % entity)
319 compat_html_parser.locatestarttagend = re.compile(r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re.VERBOSE) # backport bugfix
320 class BaseHTMLParser(compat_html_parser.HTMLParser):
322 compat_html_parser.HTMLParser.__init__(self)
325 def loads(self, html):
330 class AttrParser(BaseHTMLParser):
331 """Modified HTMLParser that isolates a tag with the specified attribute"""
332 def __init__(self, attribute, value):
333 self.attribute = attribute
338 self.watch_startpos = False
340 BaseHTMLParser.__init__(self)
342 def error(self, message):
343 if self.error_count > 10 or self.started:
344 raise compat_html_parser.HTMLParseError(message, self.getpos())
345 self.rawdata = '\n'.join(self.html.split('\n')[self.getpos()[0]:]) # skip one line
346 self.error_count += 1
349 def handle_starttag(self, tag, attrs):
352 self.find_startpos(None)
353 if self.attribute in attrs and attrs[self.attribute] == self.value:
356 self.watch_startpos = True
358 if not tag in self.depth: self.depth[tag] = 0
361 def handle_endtag(self, tag):
363 if tag in self.depth: self.depth[tag] -= 1
364 if self.depth[self.result[0]] == 0:
366 self.result.append(self.getpos())
368 def find_startpos(self, x):
369 """Needed to put the start position of the result (self.result[1])
370 after the opening tag with the requested id"""
371 if self.watch_startpos:
372 self.watch_startpos = False
373 self.result.append(self.getpos())
374 handle_entityref = handle_charref = handle_data = handle_comment = \
375 handle_decl = handle_pi = unknown_decl = find_startpos
377 def get_result(self):
378 if self.result is None:
380 if len(self.result) != 3:
382 lines = self.html.split('\n')
383 lines = lines[self.result[1][0]-1:self.result[2][0]]
384 lines[0] = lines[0][self.result[1][1]:]
386 lines[-1] = lines[-1][:self.result[2][1]-self.result[1][1]]
387 lines[-1] = lines[-1][:self.result[2][1]]
388 return '\n'.join(lines).strip()
389 # Hack for https://github.com/rg3/youtube-dl/issues/662
390 if sys.version_info < (2, 7, 3):
391 AttrParser.parse_endtag = (lambda self, i:
392 i + len("</scr'+'ipt>")
393 if self.rawdata[i:].startswith("</scr'+'ipt>")
394 else compat_html_parser.HTMLParser.parse_endtag(self, i))
def get_element_by_id(id, html):
    """Return the content of the tag with the specified ID in the passed HTML document"""
    # Thin convenience wrapper around the generic attribute-based lookup.
    return get_element_by_attribute("id", id, html)
400 def get_element_by_attribute(attribute, value, html):
401 """Return the content of the tag with the specified attribute in the passed HTML document"""
402 parser = AttrParser(attribute, value)
405 except compat_html_parser.HTMLParseError:
407 return parser.get_result()
409 class MetaParser(BaseHTMLParser):
411 Modified HTMLParser that isolates a meta tag with the specified name
414 def __init__(self, name):
415 BaseHTMLParser.__init__(self)
420 def handle_starttag(self, tag, attrs):
424 if attrs.get('name') == self.name:
425 self.result = attrs.get('content')
427 def get_result(self):
430 def get_meta_content(name, html):
432 Return the content attribute from the meta tag with the given name attribute.
434 parser = MetaParser(name)
437 except compat_html_parser.HTMLParseError:
439 return parser.get_result()
442 def clean_html(html):
443 """Clean an HTML snippet into a readable string"""
445 html = html.replace('\n', ' ')
446 html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
447 html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
449 html = re.sub('<.*?>', '', html)
450 # Replace html entities
451 html = unescapeHTML(html)
455 def sanitize_open(filename, open_mode):
456 """Try to open the given filename, and slightly tweak it if this fails.
458 Attempts to open the given filename. If this fails, it tries to change
459 the filename slightly, step by step, until it's either able to open it
460 or it fails and raises a final exception, like the standard open()
463 It returns the tuple (stream, definitive_file_name).
467 if sys.platform == 'win32':
469 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
470 return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
471 stream = open(encodeFilename(filename), open_mode)
472 return (stream, filename)
473 except (IOError, OSError) as err:
474 if err.errno in (errno.EACCES,):
477 # In case of error, try to remove win32 forbidden chars
478 alt_filename = os.path.join(
479 re.sub(u'[/<>:"\\|\\\\?\\*]', u'#', path_part)
480 for path_part in os.path.split(filename)
482 if alt_filename == filename:
485 # An exception here should be caught in the caller
486 stream = open(encodeFilename(filename), open_mode)
487 return (stream, alt_filename)
490 def timeconvert(timestr):
491 """Convert RFC 2822 defined time string into system timestamp"""
493 timetuple = email.utils.parsedate_tz(timestr)
494 if timetuple is not None:
495 timestamp = email.utils.mktime_tz(timetuple)
498 def sanitize_filename(s, restricted=False, is_id=False):
499 """Sanitizes a string so it could be used as part of a filename.
500 If restricted is set, use a stricter subset of allowed characters.
501 Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
503 def replace_insane(char):
504 if char == '?' or ord(char) < 32 or ord(char) == 127:
507 return '' if restricted else '\''
509 return '_-' if restricted else ' -'
510 elif char in '\\/|*<>':
512 if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
514 if restricted and ord(char) > 127:
518 result = u''.join(map(replace_insane, s))
520 while '__' in result:
521 result = result.replace('__', '_')
522 result = result.strip('_')
523 # Common case of "Foreign band name - English song title"
524 if restricted and result.startswith('-_'):
530 def orderedSet(iterable):
531 """ Remove all duplicates from the input iterable """
542 assert type(s) == compat_str
544 result = re.sub(r'(?u)&(.+?);', htmlentity_transform, s)
548 def encodeFilename(s, for_subprocess=False):
550 @param s The name of the file
553 assert type(s) == compat_str
555 # Python 3 has a Unicode API
556 if sys.version_info >= (3, 0):
559 if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
560 # Pass u'' directly to use Unicode APIs on Windows 2000 and up
561 # (Detecting Windows NT 4 is tricky because 'major >= 4' would
562 # match Windows 9x series as well. Besides, NT 4 is obsolete.)
563 if not for_subprocess:
566 # For subprocess calls, encode with locale encoding
567 # Refer to http://stackoverflow.com/a/9951851/35070
568 encoding = preferredencoding()
570 encoding = sys.getfilesystemencoding()
573 return s.encode(encoding, 'ignore')
def encodeArgument(s):
    """Encode a subprocess argument the same way file names are encoded."""
    if isinstance(s, compat_str):
        return encodeFilename(s, True)
    # Legacy code that uses byte strings
    # Uncomment the following line after fixing all post processors
    #assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
    return encodeFilename(s.decode('ascii'), True)
585 def decodeOption(optval):
588 if isinstance(optval, bytes):
589 optval = optval.decode(preferredencoding())
591 assert isinstance(optval, compat_str)
594 def formatSeconds(secs):
596 return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
598 return '%d:%02d' % (secs // 60, secs % 60)
603 def make_HTTPS_handler(opts_no_check_certificate, **kwargs):
604 if sys.version_info < (3, 2):
607 class HTTPSConnectionV3(httplib.HTTPSConnection):
608 def __init__(self, *args, **kwargs):
609 httplib.HTTPSConnection.__init__(self, *args, **kwargs)
612 sock = socket.create_connection((self.host, self.port), self.timeout)
613 if getattr(self, '_tunnel_host', False):
617 self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_SSLv3)
619 self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_SSLv23)
621 class HTTPSHandlerV3(compat_urllib_request.HTTPSHandler):
622 def https_open(self, req):
623 return self.do_open(HTTPSConnectionV3, req)
624 return HTTPSHandlerV3(**kwargs)
626 context = ssl.SSLContext(ssl.PROTOCOL_SSLv3)
627 context.verify_mode = (ssl.CERT_NONE
628 if opts_no_check_certificate
629 else ssl.CERT_REQUIRED)
630 context.set_default_verify_paths()
632 context.load_default_certs()
633 except AttributeError:
635 return compat_urllib_request.HTTPSHandler(context=context, **kwargs)
637 class ExtractorError(Exception):
638 """Error during info extraction."""
639 def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
640 """ tb, if given, is the original traceback (so that it can be printed out).
641 If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
644 if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
646 if video_id is not None:
647 msg = video_id + ': ' + msg
649 msg = msg + u'; please report this issue on https://yt-dl.org/bug . Be sure to call youtube-dl with the --verbose flag and include its complete output. Make sure you are using the latest version; type youtube-dl -U to update.'
650 super(ExtractorError, self).__init__(msg)
653 self.exc_info = sys.exc_info() # preserve original exception
655 self.video_id = video_id
657 def format_traceback(self):
658 if self.traceback is None:
660 return u''.join(traceback.format_tb(self.traceback))
663 class RegexNotFoundError(ExtractorError):
664 """Error when a regex didn't match"""
668 class DownloadError(Exception):
669 """Download Error exception.
671 This exception may be thrown by FileDownloader objects if they are not
672 configured to continue on errors. They will contain the appropriate
675 def __init__(self, msg, exc_info=None):
676 """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
677 super(DownloadError, self).__init__(msg)
678 self.exc_info = exc_info
681 class SameFileError(Exception):
682 """Same File exception.
684 This exception will be thrown by FileDownloader objects if they detect
685 multiple files would have to be downloaded to the same file on disk.
690 class PostProcessingError(Exception):
691 """Post Processing exception.
693 This exception may be raised by PostProcessor's .run() method to
694 indicate an error in the postprocessing task.
696 def __init__(self, msg):
699 class MaxDownloadsReached(Exception):
700 """ --max-downloads limit has been reached. """
704 class UnavailableVideoError(Exception):
705 """Unavailable Format exception.
707 This exception will be thrown when a video is requested
708 in a format that is not available for that video.
713 class ContentTooShortError(Exception):
714 """Content Too Short exception.
716 This exception may be raised by FileDownloader objects when a file they
717 download is too small for what the server announced first, indicating
718 the connection was probably interrupted.
def __init__(self, downloaded, expected):
    # Keep both byte counts so callers can report or retry the download.
    self.expected = expected
    self.downloaded = downloaded
728 class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
729 """Handler for HTTP requests and responses.
731 This class, when installed with an OpenerDirector, automatically adds
732 the standard headers to every HTTP request and handles gzipped and
733 deflated responses from web servers. If compression is to be avoided in
734 a particular request, the original request in the program code only has
735 to include the HTTP header "Youtubedl-No-Compression", which will be
736 removed before making the real request.
738 Part of this code was copied from:
740 http://techknack.net/python-urllib2-handlers/
742 Andrew Rowls, the author of that code, agreed to release it to the
749 return zlib.decompress(data, -zlib.MAX_WBITS)
751 return zlib.decompress(data)
754 def addinfourl_wrapper(stream, headers, url, code):
755 if hasattr(compat_urllib_request.addinfourl, 'getcode'):
756 return compat_urllib_request.addinfourl(stream, headers, url, code)
757 ret = compat_urllib_request.addinfourl(stream, headers, url)
761 def http_request(self, req):
762 for h,v in std_headers.items():
766 if 'Youtubedl-no-compression' in req.headers:
767 if 'Accept-encoding' in req.headers:
768 del req.headers['Accept-encoding']
769 del req.headers['Youtubedl-no-compression']
770 if 'Youtubedl-user-agent' in req.headers:
771 if 'User-agent' in req.headers:
772 del req.headers['User-agent']
773 req.headers['User-agent'] = req.headers['Youtubedl-user-agent']
774 del req.headers['Youtubedl-user-agent']
777 def http_response(self, req, resp):
780 if resp.headers.get('Content-encoding', '') == 'gzip':
781 content = resp.read()
782 gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
784 uncompressed = io.BytesIO(gz.read())
785 except IOError as original_ioerror:
786 # There may be junk add the end of the file
787 # See http://stackoverflow.com/q/4928560/35070 for details
788 for i in range(1, 1024):
790 gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
791 uncompressed = io.BytesIO(gz.read())
796 raise original_ioerror
797 resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
798 resp.msg = old_resp.msg
800 if resp.headers.get('Content-encoding', '') == 'deflate':
801 gz = io.BytesIO(self.deflate(resp.read()))
802 resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
803 resp.msg = old_resp.msg
806 https_request = http_request
807 https_response = http_response
810 def parse_iso8601(date_str, delimiter='T'):
811 """ Return a UNIX timestamp from the given date """
817 r'Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$',
820 timezone = datetime.timedelta()
822 date_str = date_str[:-len(m.group(0))]
823 if not m.group('sign'):
824 timezone = datetime.timedelta()
826 sign = 1 if m.group('sign') == '+' else -1
827 timezone = datetime.timedelta(
828 hours=sign * int(m.group('hours')),
829 minutes=sign * int(m.group('minutes')))
830 date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
831 dt = datetime.datetime.strptime(date_str, date_format) - timezone
832 return calendar.timegm(dt.timetuple())
835 def unified_strdate(date_str):
836 """Return a string with the date in the format YYYYMMDD"""
843 date_str = date_str.replace(',', ' ')
844 # %z (UTC offset) is only supported in python>=3.2
845 date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str)
846 format_expressions = [
851 '%b %dst %Y %I:%M%p',
852 '%b %dnd %Y %I:%M%p',
853 '%b %dth %Y %I:%M%p',
862 '%Y-%m-%dT%H:%M:%SZ',
863 '%Y-%m-%dT%H:%M:%S.%fZ',
864 '%Y-%m-%dT%H:%M:%S.%f0Z',
866 '%Y-%m-%dT%H:%M:%S.%f',
869 for expression in format_expressions:
871 upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
874 if upload_date is None:
875 timetuple = email.utils.parsedate_tz(date_str)
877 upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
880 def determine_ext(url, default_ext=u'unknown_video'):
883 guess = url.partition(u'?')[0].rpartition(u'.')[2]
884 if re.match(r'^[A-Za-z0-9]+$', guess):
def subtitles_filename(filename, sub_lang, sub_format):
    """Build the subtitle file name: <base>.<lang>.<format>."""
    base = filename.rsplit('.', 1)[0]
    return u'.'.join([base, sub_lang, sub_format])
892 def date_from_str(date_str):
894 Return a datetime object from a string in the format YYYYMMDD or
895 (now|today)[+-][0-9](day|week|month|year)(s)?"""
896 today = datetime.date.today()
897 if date_str == 'now'or date_str == 'today':
899 match = re.match('(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
900 if match is not None:
901 sign = match.group('sign')
902 time = int(match.group('time'))
905 unit = match.group('unit')
914 delta = datetime.timedelta(**{unit: time})
916 return datetime.datetime.strptime(date_str, "%Y%m%d").date()
918 def hyphenate_date(date_str):
920 Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
921 match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
922 if match is not None:
923 return '-'.join(match.groups())
927 class DateRange(object):
928 """Represents a time interval between two dates"""
929 def __init__(self, start=None, end=None):
930 """start and end must be strings in the format accepted by date"""
931 if start is not None:
932 self.start = date_from_str(start)
934 self.start = datetime.datetime.min.date()
936 self.end = date_from_str(end)
938 self.end = datetime.datetime.max.date()
939 if self.start > self.end:
940 raise ValueError('Date range: "%s" , the start date must be before the end date' % self)
943 """Returns a range that only contains the given day"""
945 def __contains__(self, date):
946 """Check if the date is in the range"""
947 if not isinstance(date, datetime.date):
948 date = date_from_str(date)
949 return self.start <= date <= self.end
951 return '%s - %s' % ( self.start.isoformat(), self.end.isoformat())
955 """ Returns the platform name as a compat_str """
956 res = platform.platform()
957 if isinstance(res, bytes):
958 res = res.decode(preferredencoding())
960 assert isinstance(res, compat_str)
964 def _windows_write_string(s, out):
965 """ Returns True if the string was written using special methods,
966 False if it has yet to be written out."""
967 # Adapted from http://stackoverflow.com/a/3259271/35070
970 import ctypes.wintypes
978 fileno = out.fileno()
979 except AttributeError:
980 # If the output stream doesn't have a fileno, it's virtual
982 if fileno not in WIN_OUTPUT_IDS:
985 GetStdHandle = ctypes.WINFUNCTYPE(
986 ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
987 ("GetStdHandle", ctypes.windll.kernel32))
988 h = GetStdHandle(WIN_OUTPUT_IDS[fileno])
990 WriteConsoleW = ctypes.WINFUNCTYPE(
991 ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
992 ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
993 ctypes.wintypes.LPVOID)(("WriteConsoleW", ctypes.windll.kernel32))
994 written = ctypes.wintypes.DWORD(0)
996 GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)(("GetFileType", ctypes.windll.kernel32))
997 FILE_TYPE_CHAR = 0x0002
998 FILE_TYPE_REMOTE = 0x8000
999 GetConsoleMode = ctypes.WINFUNCTYPE(
1000 ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
1001 ctypes.POINTER(ctypes.wintypes.DWORD))(
1002 ("GetConsoleMode", ctypes.windll.kernel32))
1003 INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value
1005 def not_a_console(handle):
1006 if handle == INVALID_HANDLE_VALUE or handle is None:
1008 return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR
1009 or GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)
1011 if not_a_console(h):
1014 def next_nonbmp_pos(s):
1016 return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
1017 except StopIteration:
1021 count = min(next_nonbmp_pos(s), 1024)
1023 ret = WriteConsoleW(
1024 h, s, count if count else 2, ctypes.byref(written), None)
1026 raise OSError('Failed to write string')
1027 if not count: # We just wrote a non-BMP character
1028 assert written.value == 2
1031 assert written.value > 0
1032 s = s[written.value:]
1036 def write_string(s, out=None, encoding=None):
1039 assert type(s) == compat_str
1041 if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
1042 if _windows_write_string(s, out):
1045 if ('b' in getattr(out, 'mode', '') or
1046 sys.version_info[0] < 3): # Python 2 lies about mode of sys.stderr
1047 byt = s.encode(encoding or preferredencoding(), 'ignore')
1049 elif hasattr(out, 'buffer'):
1050 enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
1051 byt = s.encode(enc, 'ignore')
1052 out.buffer.write(byt)
1058 def bytes_to_intlist(bs):
1061 if isinstance(bs[0], int): # Python 3
1064 return [ord(c) for c in bs]
1067 def intlist_to_bytes(xs):
1070 if isinstance(chr(0), bytes): # Python 2
1071 return ''.join([chr(x) for x in xs])
def get_cachedir(params=None):
    """Return the cache directory to use.

    Honours an explicit 'cachedir' entry in *params*; otherwise falls back
    to $XDG_CACHE_HOME/youtube-dl (or ~/.cache/youtube-dl when the
    environment variable is unset).

    The default was changed from a mutable ``{}`` to ``None`` to avoid the
    shared-mutable-default pitfall; behaviour for all callers is unchanged.
    """
    cache_root = os.environ.get('XDG_CACHE_HOME',
                                os.path.expanduser('~/.cache'))
    return (params or {}).get('cachedir', os.path.join(cache_root, 'youtube-dl'))
1082 # Cross-platform file locking
1083 if sys.platform == 'win32':
1084 import ctypes.wintypes
1087 class OVERLAPPED(ctypes.Structure):
1089 ('Internal', ctypes.wintypes.LPVOID),
1090 ('InternalHigh', ctypes.wintypes.LPVOID),
1091 ('Offset', ctypes.wintypes.DWORD),
1092 ('OffsetHigh', ctypes.wintypes.DWORD),
1093 ('hEvent', ctypes.wintypes.HANDLE),
1096 kernel32 = ctypes.windll.kernel32
1097 LockFileEx = kernel32.LockFileEx
1098 LockFileEx.argtypes = [
1099 ctypes.wintypes.HANDLE, # hFile
1100 ctypes.wintypes.DWORD, # dwFlags
1101 ctypes.wintypes.DWORD, # dwReserved
1102 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
1103 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
1104 ctypes.POINTER(OVERLAPPED) # Overlapped
1106 LockFileEx.restype = ctypes.wintypes.BOOL
1107 UnlockFileEx = kernel32.UnlockFileEx
1108 UnlockFileEx.argtypes = [
1109 ctypes.wintypes.HANDLE, # hFile
1110 ctypes.wintypes.DWORD, # dwReserved
1111 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
1112 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
1113 ctypes.POINTER(OVERLAPPED) # Overlapped
1115 UnlockFileEx.restype = ctypes.wintypes.BOOL
1116 whole_low = 0xffffffff
1117 whole_high = 0x7fffffff
1119 def _lock_file(f, exclusive):
1120 overlapped = OVERLAPPED()
1121 overlapped.Offset = 0
1122 overlapped.OffsetHigh = 0
1123 overlapped.hEvent = 0
1124 f._lock_file_overlapped_p = ctypes.pointer(overlapped)
1125 handle = msvcrt.get_osfhandle(f.fileno())
1126 if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
1127 whole_low, whole_high, f._lock_file_overlapped_p):
1128 raise OSError('Locking file failed: %r' % ctypes.FormatError())
1130 def _unlock_file(f):
1131 assert f._lock_file_overlapped_p
1132 handle = msvcrt.get_osfhandle(f.fileno())
1133 if not UnlockFileEx(handle, 0,
1134 whole_low, whole_high, f._lock_file_overlapped_p):
1135 raise OSError('Unlocking file failed: %r' % ctypes.FormatError())
1140 def _lock_file(f, exclusive):
1141 fcntl.lockf(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)
1143 def _unlock_file(f):
1144 fcntl.lockf(f, fcntl.LOCK_UN)
1147 class locked_file(object):
1148 def __init__(self, filename, mode, encoding=None):
1149 assert mode in ['r', 'a', 'w']
1150 self.f = io.open(filename, mode, encoding=encoding)
1153 def __enter__(self):
1154 exclusive = self.mode != 'r'
1156 _lock_file(self.f, exclusive)
1162 def __exit__(self, etype, value, traceback):
1164 _unlock_file(self.f)
1171 def write(self, *args):
1172 return self.f.write(*args)
1174 def read(self, *args):
1175 return self.f.read(*args)
1178 def shell_quote(args):
1180 encoding = sys.getfilesystemencoding()
1181 if encoding is None:
1184 if isinstance(a, bytes):
1185 # We may get a filename encoded with 'encodeFilename'
1186 a = a.decode(encoding)
1187 quoted_args.append(pipes.quote(a))
1188 return u' '.join(quoted_args)
1191 def takewhile_inclusive(pred, seq):
1192 """ Like itertools.takewhile, but include the latest evaluated element
1193 (the first element so that Not pred(e)) """
def smuggle_url(url, data):
    """Embed extra *data* in the URL fragment for internal round-tripping."""
    payload = compat_urllib_parse.urlencode(
        {u'__youtubedl_smuggle': json.dumps(data)})
    return url + u'#' + payload
1208 def unsmuggle_url(smug_url, default=None):
1209 if not '#__youtubedl_smuggle' in smug_url:
1210 return smug_url, default
1211 url, _, sdata = smug_url.rpartition(u'#')
1212 jsond = compat_parse_qs(sdata)[u'__youtubedl_smuggle'][0]
1213 data = json.loads(jsond)
1217 def format_bytes(bytes):
1220 if type(bytes) is str:
1221 bytes = float(bytes)
1225 exponent = int(math.log(bytes, 1024.0))
1226 suffix = [u'B', u'KiB', u'MiB', u'GiB', u'TiB', u'PiB', u'EiB', u'ZiB', u'YiB'][exponent]
1227 converted = float(bytes) / float(1024 ** exponent)
1228 return u'%.2f%s' % (converted, suffix)
1231 def get_term_width():
1232 columns = os.environ.get('COLUMNS', None)
1237 sp = subprocess.Popen(
1239 stdout=subprocess.PIPE, stderr=subprocess.PIPE)
1240 out, err = sp.communicate()
1241 return int(out.split()[1])
1247 def month_by_name(name):
1248 """ Return the number of a month by (locale-independently) English name """
1251 u'January', u'February', u'March', u'April', u'May', u'June',
1252 u'July', u'August', u'September', u'October', u'November', u'December']
1254 return ENGLISH_NAMES.index(name) + 1
1259 def fix_xml_ampersands(xml_str):
1260 """Replace all the '&' by '&' in XML"""
1262 r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
1267 def setproctitle(title):
1268 assert isinstance(title, compat_str)
1270 libc = ctypes.cdll.LoadLibrary("libc.so.6")
1273 title_bytes = title.encode('utf-8')
1274 buf = ctypes.create_string_buffer(len(title_bytes))
1275 buf.value = title_bytes
1277 libc.prctl(15, buf, 0, 0, 0)
1278 except AttributeError:
1279 return # Strange libc, just skip this
1282 def remove_start(s, start):
1283 if s.startswith(start):
1284 return s[len(start):]
1288 def remove_end(s, end):
1290 return s[:-len(end)]
def url_basename(url):
    """Return the last path component of *url* ('' for a bare host or trailing slash-only path)."""
    parsed_path = compat_urlparse.urlparse(url).path
    return parsed_path.strip(u'/').split(u'/')[-1]
1299 class HEADRequest(compat_urllib_request.Request):
1300 def get_method(self):
1304 def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
1307 v = getattr(v, get_attr, None)
1310 return default if v is None else (int(v) * invscale // scale)
def str_or_none(v, default=None):
    """Coerce *v* to a string, or return *default* when *v* is None."""
    if v is None:
        return default
    return compat_str(v)
1317 def str_to_int(int_str):
1320 int_str = re.sub(r'[,\.]', u'', int_str)
def float_or_none(v, scale=1, invscale=1, default=None):
    """Convert *v* to float scaled by invscale/scale; *default* when None."""
    if v is None:
        return default
    return float(v) * invscale / scale
def parse_duration(s):
    """Parse a duration like '12:34', '1:02:03', '87', '3m40s' into seconds.

    Returns None for None input or an unparseable string.
    """
    if s is None:
        return None

    m = re.match(
        r'(?:(?:(?P<hours>[0-9]+)[:h])?(?P<mins>[0-9]+)[:m])?(?P<secs>[0-9]+)s?(?::[0-9]+)?$', s)
    if not m:
        return None
    res = int(m.group('secs'))
    if m.group('mins'):
        res += int(m.group('mins')) * 60
        if m.group('hours'):
            res += int(m.group('hours')) * 60 * 60
    return res
def prepend_extension(filename, ext):
    """Insert *ext* in front of the real extension:
    'video.mp4' + 'temp' -> 'video.temp.mp4'."""
    stem, real_ext = os.path.splitext(filename)
    return u'%s.%s%s' % (stem, ext, real_ext)
def check_executable(exe, args=[]):
    """ Checks if the given binary is installed somewhere in PATH, and returns its name.
    args can be a list of arguments for a short output (like -version).
    Returns False when the binary cannot be spawned.

    Note: `args` is never mutated, so the shared default list is safe here.
    """
    try:
        subprocess.Popen([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
    except OSError:
        return False
    return exe
class PagedList(object):
    """Lazy list over a paginated source.

    `pagefunc(pagenum)` must return the items of page `pagenum`;
    `pagesize` is the (maximum) number of items per page.
    """
    def __init__(self, pagefunc, pagesize):
        self._pagefunc = pagefunc
        self._pagesize = pagesize

    def __len__(self):
        # This is only useful for tests
        return len(self.getslice())

    def getslice(self, start=0, end=None):
        """Return items [start:end), fetching only the pages needed."""
        res = []
        for pagenum in itertools.count(start // self._pagesize):
            firstid = pagenum * self._pagesize
            nextfirstid = pagenum * self._pagesize + self._pagesize
            if start >= nextfirstid:
                continue

            page_results = list(self._pagefunc(pagenum))

            # Offset inside this page where the requested range starts
            startv = (
                start % self._pagesize
                if firstid <= start < nextfirstid
                else 0)

            # Offset inside this page where the requested range ends
            endv = (
                ((end - 1) % self._pagesize) + 1
                if (end is not None and firstid <= end <= nextfirstid)
                else None)

            if startv != 0 or endv is not None:
                page_results = page_results[startv:endv]
            res.extend(page_results)

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            if len(page_results) + startv < self._pagesize:
                break

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == nextfirstid:
                break
        return res
def uppercase_escape(s):
    """Decode literal \\UXXXXXXXX escape sequences embedded in *s*.

    Only the 8-hex-digit uppercase form is handled; everything else is
    left untouched.
    """
    unicode_escape = codecs.getdecoder('unicode_escape')
    return re.sub(
        r'\\U[0-9a-fA-F]{8}',
        lambda m: unicode_escape(m.group(0))[0],
        s)
try:
    # Probe: Python 2.6 (and some 2.7 builds) reject a unicode format spec
    struct.pack(u'!I', 0)
except TypeError:
    # In Python 2.6 (and some 2.7 versions), struct requires a bytes argument
    def struct_pack(spec, *args):
        if isinstance(spec, compat_str):
            spec = spec.encode('ascii')
        return struct.pack(spec, *args)

    def struct_unpack(spec, *args):
        if isinstance(spec, compat_str):
            spec = spec.encode('ascii')
        return struct.unpack(spec, *args)
else:
    # Modern interpreters accept text specs directly - use struct as-is
    struct_pack = struct.pack
    struct_unpack = struct.unpack
def read_batch_urls(batch_fd):
    """Read URLs from an open batch file; returns them as a list.

    Strips a UTF-8 BOM and whitespace, skips blank lines and lines
    starting with '#', ';' or ']' (comments). Closes batch_fd.
    """
    def fixup(url):
        if not isinstance(url, compat_str):
            url = url.decode('utf-8', 'replace')
        BOM_UTF8 = u'\xef\xbb\xbf'
        if url.startswith(BOM_UTF8):
            url = url[len(BOM_UTF8):]
        url = url.strip()
        if url.startswith(('#', ';', ']')):
            # Comment line - filtered out below (falsy)
            return False
        return url

    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]
def urlencode_postdata(*args, **kargs):
    """URL-encode POST data and return it as ASCII bytes."""
    encoded = compat_urllib_parse.urlencode(*args, **kargs)
    return encoded.encode('ascii')
def parse_xml(s):
    """Parse the XML document in text string *s*, ignoring any DOCTYPE.

    NOTE(review): the enclosing `def` line was missing from the visible
    source although the body references `s`; reconstructed here.
    """
    class TreeBuilder(xml.etree.ElementTree.TreeBuilder):
        def doctype(self, name, pubid, system):
            pass  # Ignore doctypes

    parser = xml.etree.ElementTree.XMLParser(target=TreeBuilder())
    # Python < 2.7's XML() does not accept a parser keyword
    kwargs = {'parser': parser} if sys.version_info >= (2, 7) else {}
    return xml.etree.ElementTree.XML(s.encode('utf-8'), **kwargs)
if sys.version_info < (3, 0) and sys.platform == 'win32':
    def compat_getpass(prompt, *args, **kwargs):
        # Python 2 on Windows chokes on a unicode prompt - encode it first
        if isinstance(prompt, compat_str):
            prompt = prompt.encode(preferredencoding())
        return getpass.getpass(prompt, *args, **kwargs)
else:
    # Everywhere else the stdlib function works as-is. Without this else
    # branch compat_getpass would be undefined on most platforms.
    compat_getpass = getpass.getpass
def strip_jsonp(code):
    """Strip a JSONP wrapper (`callback(...)`) and return the bare payload.

    Input without a JSONP wrapper is returned unchanged.
    """
    jsonp_pattern = r'(?s)^[a-zA-Z0-9_]+\s*\(\s*(.*)\);?\s*?\s*$'
    return re.sub(jsonp_pattern, r'\1', code)
def js_to_json(code):
    """Convert a simple JavaScript object literal into valid JSON.

    Quotes bare and single-quoted keys/values with double quotes and
    removes trailing commas before ']'. Only handles simple literals
    (asserts there are no embedded double quotes to re-escape).
    """
    def fix_kv(m):
        key = m.group(2)
        if key.startswith("'"):
            assert key.endswith("'")
            assert '"' not in key
            key = '"%s"' % key[1:-1]
        elif not key.startswith('"'):
            key = '"%s"' % key

        value = m.group(4)
        if value.startswith("'"):
            assert value.endswith("'")
            assert '"' not in value
            value = '"%s"' % value[1:-1]

        return m.group(1) + key + m.group(3) + value

    res = re.sub(r'''(?x)
            ([{,]\s*)
            ("[^"]*"|\'[^\']*\'|[a-z0-9A-Z]+)
            (:\s*)
            ([0-9.]+|true|false|"[^"]*"|\'[^\']*\'|\[|\{)
        ''', fix_kv, code)
    # Drop trailing commas before a closing bracket
    res = re.sub(r',(\s*\])', lambda m: m.group(1), res)
    return res
def qualities(quality_ids):
    """ Get a numeric quality value out of a list of possible values.

    Returns a function mapping a quality id to its index in
    `quality_ids` (higher is better), or -1 for unknown ids.
    """
    def q(qid):
        try:
            return quality_ids.index(qid)
        except ValueError:
            return -1
    return q
1521 DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
try:
    # Python >= 2.7 has check_output built in
    subprocess_check_output = subprocess.check_output
except AttributeError:
    # Fallback for Python 2.6: emulate check_output on top of Popen
    def subprocess_check_output(*args, **kwargs):
        assert 'input' not in kwargs
        p = subprocess.Popen(*args, stdout=subprocess.PIPE, **kwargs)
        output, _ = p.communicate()
        ret = p.wait()
        if ret:
            raise subprocess.CalledProcessError(ret, p.args, output=output)
        return output