2 # -*- coding: utf-8 -*-
29 import xml.etree.ElementTree
33 import urllib.request as compat_urllib_request
34 except ImportError: # Python 2
35 import urllib2 as compat_urllib_request
38 import urllib.error as compat_urllib_error
39 except ImportError: # Python 2
40 import urllib2 as compat_urllib_error
43 import urllib.parse as compat_urllib_parse
44 except ImportError: # Python 2
45 import urllib as compat_urllib_parse
48 from urllib.parse import urlparse as compat_urllib_parse_urlparse
49 except ImportError: # Python 2
50 from urlparse import urlparse as compat_urllib_parse_urlparse
53 import urllib.parse as compat_urlparse
54 except ImportError: # Python 2
55 import urlparse as compat_urlparse
58 import http.cookiejar as compat_cookiejar
59 except ImportError: # Python 2
60 import cookielib as compat_cookiejar
63 import html.entities as compat_html_entities
64 except ImportError: # Python 2
65 import htmlentitydefs as compat_html_entities
68 import html.parser as compat_html_parser
69 except ImportError: # Python 2
70 import HTMLParser as compat_html_parser
73 import http.client as compat_http_client
74 except ImportError: # Python 2
75 import httplib as compat_http_client
78 from urllib.error import HTTPError as compat_HTTPError
79 except ImportError: # Python 2
80 from urllib2 import HTTPError as compat_HTTPError
83 from urllib.request import urlretrieve as compat_urlretrieve
84 except ImportError: # Python 2
85 from urllib import urlretrieve as compat_urlretrieve
89 from subprocess import DEVNULL
90 compat_subprocess_get_DEVNULL = lambda: DEVNULL
92 compat_subprocess_get_DEVNULL = lambda: open(os.path.devnull, 'w')
95 from urllib.parse import unquote as compat_urllib_parse_unquote
97 def compat_urllib_parse_unquote(string, encoding='utf-8', errors='replace'):
100 res = string.split('%')
107 # pct_sequence: contiguous sequence of percent-encoded bytes, decoded
114 pct_sequence += item[:2].decode('hex')
117 # This segment was just a single percent-encoded character.
118 # May be part of a sequence of code units, so delay decoding.
119 # (Stored in pct_sequence).
123 # Encountered non-percent-encoded characters. Flush the current
125 string += pct_sequence.decode(encoding, errors) + rest
128 # Flush the final pct_sequence
129 string += pct_sequence.decode(encoding, errors)
134 from urllib.parse import parse_qs as compat_parse_qs
135 except ImportError: # Python 2
136 # HACK: The following is the correct parse_qs implementation from cpython 3's stdlib.
137 # Python 2's version is apparently totally broken
139 def _parse_qsl(qs, keep_blank_values=False, strict_parsing=False,
140 encoding='utf-8', errors='replace'):
141 qs, _coerce_result = qs, unicode
142 pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')]
144 for name_value in pairs:
145 if not name_value and not strict_parsing:
147 nv = name_value.split('=', 1)
150 raise ValueError("bad query field: %r" % (name_value,))
151 # Handle case of a control-name with no equal sign
152 if keep_blank_values:
156 if len(nv[1]) or keep_blank_values:
157 name = nv[0].replace('+', ' ')
158 name = compat_urllib_parse_unquote(
159 name, encoding=encoding, errors=errors)
160 name = _coerce_result(name)
161 value = nv[1].replace('+', ' ')
162 value = compat_urllib_parse_unquote(
163 value, encoding=encoding, errors=errors)
164 value = _coerce_result(value)
165 r.append((name, value))
168 def compat_parse_qs(qs, keep_blank_values=False, strict_parsing=False,
169 encoding='utf-8', errors='replace'):
171 pairs = _parse_qsl(qs, keep_blank_values, strict_parsing,
172 encoding=encoding, errors=errors)
173 for name, value in pairs:
174 if name in parsed_result:
175 parsed_result[name].append(value)
177 parsed_result[name] = [value]
181 compat_str = unicode # Python 2
186 compat_chr = unichr # Python 2
191 from xml.etree.ElementTree import ParseError as compat_xml_parse_error
192 except ImportError: # Python 2.6
193 from xml.parsers.expat import ExpatError as compat_xml_parse_error
196 if type(c) is int: return c
199 # This is not clearly defined otherwise
200 compiled_regex_type = type(re.compile(''))
203 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0 (Chrome)',
204 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
205 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
206 'Accept-Encoding': 'gzip, deflate',
207 'Accept-Language': 'en-us,en;q=0.5',
210 def preferredencoding():
211 """Get preferred encoding.
213 Returns the best encoding scheme for the system, based on
214 locale.getpreferredencoding() and some further tweaks.
217 pref = locale.getpreferredencoding()
224 if sys.version_info < (3,0):
226 print(s.encode(preferredencoding(), 'xmlcharrefreplace'))
229 assert type(s) == type(u'')
233 def write_json_file(obj, fn):
234 """ Encode obj as JSON and write it to fn, atomically """
236 # In Python 2.x, json.dump expects a bytestream.
237 # In Python 3.x, it writes to a character stream
238 if sys.version_info < (3, 0):
244 tf = tempfile.NamedTemporaryFile(
245 suffix='.tmp', prefix=os.path.basename(fn) + '.',
246 dir=os.path.dirname(fn),
252 os.rename(tf.name, fn)
261 if sys.version_info >= (2, 7):
262 def find_xpath_attr(node, xpath, key, val):
263 """ Find the xpath xpath[@key=val] """
264 assert re.match(r'^[a-zA-Z-]+$', key)
265 assert re.match(r'^[a-zA-Z0-9@\s:._-]*$', val)
266 expr = xpath + u"[@%s='%s']" % (key, val)
267 return node.find(expr)
269 def find_xpath_attr(node, xpath, key, val):
270 for f in node.findall(xpath):
271 if f.attrib.get(key) == val:
275 # On python2.6 the xml.etree.ElementTree.Element methods don't support
276 # the namespace parameter
277 def xpath_with_ns(path, ns_map):
278 components = [c.split(':') for c in path.split('/')]
282 replaced.append(c[0])
285 replaced.append('{%s}%s' % (ns_map[ns], tag))
286 return '/'.join(replaced)
288 def htmlentity_transform(matchobj):
289 """Transforms an HTML entity to a character.
291 This function receives a match object and is intended to be used with
292 the re.sub() function.
294 entity = matchobj.group(1)
296 # Known non-numeric HTML entity
297 if entity in compat_html_entities.name2codepoint:
298 return compat_chr(compat_html_entities.name2codepoint[entity])
300 mobj = re.match(u'(?u)#(x?\\d+)', entity)
302 numstr = mobj.group(1)
303 if numstr.startswith(u'x'):
305 numstr = u'0%s' % numstr
308 return compat_chr(int(numstr, base))
310 # Unknown entity in name, return its literal representation
311 return (u'&%s;' % entity)
313 compat_html_parser.locatestarttagend = re.compile(r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re.VERBOSE) # backport bugfix
314 class BaseHTMLParser(compat_html_parser.HTMLParser):
316 compat_html_parser.HTMLParser.__init__(self)
319 def loads(self, html):
324 class AttrParser(BaseHTMLParser):
325 """Modified HTMLParser that isolates a tag with the specified attribute"""
326 def __init__(self, attribute, value):
327 self.attribute = attribute
332 self.watch_startpos = False
334 BaseHTMLParser.__init__(self)
336 def error(self, message):
337 if self.error_count > 10 or self.started:
338 raise compat_html_parser.HTMLParseError(message, self.getpos())
339 self.rawdata = '\n'.join(self.html.split('\n')[self.getpos()[0]:]) # skip one line
340 self.error_count += 1
343 def handle_starttag(self, tag, attrs):
346 self.find_startpos(None)
347 if self.attribute in attrs and attrs[self.attribute] == self.value:
350 self.watch_startpos = True
352 if not tag in self.depth: self.depth[tag] = 0
355 def handle_endtag(self, tag):
357 if tag in self.depth: self.depth[tag] -= 1
358 if self.depth[self.result[0]] == 0:
360 self.result.append(self.getpos())
362 def find_startpos(self, x):
363 """Needed to put the start position of the result (self.result[1])
364 after the opening tag with the requested id"""
365 if self.watch_startpos:
366 self.watch_startpos = False
367 self.result.append(self.getpos())
368 handle_entityref = handle_charref = handle_data = handle_comment = \
369 handle_decl = handle_pi = unknown_decl = find_startpos
371 def get_result(self):
372 if self.result is None:
374 if len(self.result) != 3:
376 lines = self.html.split('\n')
377 lines = lines[self.result[1][0]-1:self.result[2][0]]
378 lines[0] = lines[0][self.result[1][1]:]
380 lines[-1] = lines[-1][:self.result[2][1]-self.result[1][1]]
381 lines[-1] = lines[-1][:self.result[2][1]]
382 return '\n'.join(lines).strip()
383 # Hack for https://github.com/rg3/youtube-dl/issues/662
384 if sys.version_info < (2, 7, 3):
385 AttrParser.parse_endtag = (lambda self, i:
386 i + len("</scr'+'ipt>")
387 if self.rawdata[i:].startswith("</scr'+'ipt>")
388 else compat_html_parser.HTMLParser.parse_endtag(self, i))
def get_element_by_id(id, html):
    """Return the content of the tag with the specified ID in the passed HTML document"""
    # An id lookup is simply an attribute lookup on the "id" attribute.
    attr_name = "id"
    return get_element_by_attribute(attr_name, id, html)
394 def get_element_by_attribute(attribute, value, html):
395 """Return the content of the tag with the specified attribute in the passed HTML document"""
396 parser = AttrParser(attribute, value)
399 except compat_html_parser.HTMLParseError:
401 return parser.get_result()
403 class MetaParser(BaseHTMLParser):
405 Modified HTMLParser that isolates a meta tag with the specified name
408 def __init__(self, name):
409 BaseHTMLParser.__init__(self)
414 def handle_starttag(self, tag, attrs):
418 if attrs.get('name') == self.name:
419 self.result = attrs.get('content')
421 def get_result(self):
424 def get_meta_content(name, html):
426 Return the content attribute from the meta tag with the given name attribute.
428 parser = MetaParser(name)
431 except compat_html_parser.HTMLParseError:
433 return parser.get_result()
436 def clean_html(html):
437 """Clean an HTML snippet into a readable string"""
439 html = html.replace('\n', ' ')
440 html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
441 html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
443 html = re.sub('<.*?>', '', html)
444 # Replace html entities
445 html = unescapeHTML(html)
449 def sanitize_open(filename, open_mode):
450 """Try to open the given filename, and slightly tweak it if this fails.
452 Attempts to open the given filename. If this fails, it tries to change
453 the filename slightly, step by step, until it's either able to open it
454 or it fails and raises a final exception, like the standard open()
457 It returns the tuple (stream, definitive_file_name).
461 if sys.platform == 'win32':
463 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
464 return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
465 stream = open(encodeFilename(filename), open_mode)
466 return (stream, filename)
467 except (IOError, OSError) as err:
468 if err.errno in (errno.EACCES,):
471 # In case of error, try to remove win32 forbidden chars
472 alt_filename = os.path.join(
473 re.sub(u'[/<>:"\\|\\\\?\\*]', u'#', path_part)
474 for path_part in os.path.split(filename)
476 if alt_filename == filename:
479 # An exception here should be caught in the caller
480 stream = open(encodeFilename(filename), open_mode)
481 return (stream, alt_filename)
484 def timeconvert(timestr):
485 """Convert RFC 2822 defined time string into system timestamp"""
487 timetuple = email.utils.parsedate_tz(timestr)
488 if timetuple is not None:
489 timestamp = email.utils.mktime_tz(timetuple)
492 def sanitize_filename(s, restricted=False, is_id=False):
493 """Sanitizes a string so it could be used as part of a filename.
494 If restricted is set, use a stricter subset of allowed characters.
495 Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
497 def replace_insane(char):
498 if char == '?' or ord(char) < 32 or ord(char) == 127:
501 return '' if restricted else '\''
503 return '_-' if restricted else ' -'
504 elif char in '\\/|*<>':
506 if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
508 if restricted and ord(char) > 127:
512 result = u''.join(map(replace_insane, s))
514 while '__' in result:
515 result = result.replace('__', '_')
516 result = result.strip('_')
517 # Common case of "Foreign band name - English song title"
518 if restricted and result.startswith('-_'):
524 def orderedSet(iterable):
525 """ Remove all duplicates from the input iterable """
536 assert type(s) == compat_str
538 result = re.sub(r'(?u)&(.+?);', htmlentity_transform, s)
542 def encodeFilename(s, for_subprocess=False):
544 @param s The name of the file
547 assert type(s) == compat_str
549 # Python 3 has a Unicode API
550 if sys.version_info >= (3, 0):
553 if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
554 # Pass u'' directly to use Unicode APIs on Windows 2000 and up
555 # (Detecting Windows NT 4 is tricky because 'major >= 4' would
556 # match Windows 9x series as well. Besides, NT 4 is obsolete.)
557 if not for_subprocess:
560 # For subprocess calls, encode with locale encoding
561 # Refer to http://stackoverflow.com/a/9951851/35070
562 encoding = preferredencoding()
564 encoding = sys.getfilesystemencoding()
567 return s.encode(encoding, 'ignore')
def encodeArgument(s):
    """Encode *s* for use as a subprocess argument.

    Byte strings handed in by legacy callers are first decoded as ASCII
    (failing loudly on anything else); the result is then encoded exactly
    like a filename with for_subprocess=True.
    """
    if isinstance(s, compat_str):
        text = s
    else:
        # Legacy code that still uses byte strings; decode strictly as
        # ASCII rather than guessing an encoding.
        text = s.decode('ascii')
    return encodeFilename(text, True)
579 def decodeOption(optval):
582 if isinstance(optval, bytes):
583 optval = optval.decode(preferredencoding())
585 assert isinstance(optval, compat_str)
588 def formatSeconds(secs):
590 return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
592 return '%d:%02d' % (secs // 60, secs % 60)
597 def make_HTTPS_handler(opts_no_check_certificate, **kwargs):
598 if sys.version_info < (3, 2):
601 class HTTPSConnectionV3(httplib.HTTPSConnection):
602 def __init__(self, *args, **kwargs):
603 httplib.HTTPSConnection.__init__(self, *args, **kwargs)
606 sock = socket.create_connection((self.host, self.port), self.timeout)
607 if getattr(self, '_tunnel_host', False):
611 self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_SSLv3)
613 self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_SSLv23)
615 class HTTPSHandlerV3(compat_urllib_request.HTTPSHandler):
616 def https_open(self, req):
617 return self.do_open(HTTPSConnectionV3, req)
618 return HTTPSHandlerV3(**kwargs)
620 context = ssl.SSLContext(ssl.PROTOCOL_SSLv3)
621 context.verify_mode = (ssl.CERT_NONE
622 if opts_no_check_certificate
623 else ssl.CERT_REQUIRED)
624 context.set_default_verify_paths()
626 context.load_default_certs()
627 except AttributeError:
629 return compat_urllib_request.HTTPSHandler(context=context, **kwargs)
631 class ExtractorError(Exception):
632 """Error during info extraction."""
633 def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
634 """ tb, if given, is the original traceback (so that it can be printed out).
635 If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
638 if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
640 if video_id is not None:
641 msg = video_id + ': ' + msg
643 msg = msg + u'; please report this issue on https://yt-dl.org/bug . Be sure to call youtube-dl with the --verbose flag and include its complete output. Make sure you are using the latest version; type youtube-dl -U to update.'
644 super(ExtractorError, self).__init__(msg)
647 self.exc_info = sys.exc_info() # preserve original exception
649 self.video_id = video_id
651 def format_traceback(self):
652 if self.traceback is None:
654 return u''.join(traceback.format_tb(self.traceback))
657 class RegexNotFoundError(ExtractorError):
658 """Error when a regex didn't match"""
662 class DownloadError(Exception):
663 """Download Error exception.
665 This exception may be thrown by FileDownloader objects if they are not
666 configured to continue on errors. They will contain the appropriate
669 def __init__(self, msg, exc_info=None):
670 """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
671 super(DownloadError, self).__init__(msg)
672 self.exc_info = exc_info
675 class SameFileError(Exception):
676 """Same File exception.
678 This exception will be thrown by FileDownloader objects if they detect
679 multiple files would have to be downloaded to the same file on disk.
684 class PostProcessingError(Exception):
685 """Post Processing exception.
687 This exception may be raised by PostProcessor's .run() method to
688 indicate an error in the postprocessing task.
690 def __init__(self, msg):
693 class MaxDownloadsReached(Exception):
694 """ --max-downloads limit has been reached. """
698 class UnavailableVideoError(Exception):
699 """Unavailable Format exception.
701 This exception will be thrown when a video is requested
702 in a format that is not available for that video.
707 class ContentTooShortError(Exception):
708 """Content Too Short exception.
710 This exception may be raised by FileDownloader objects when a file they
711 download is too small for what the server announced first, indicating
712 the connection was probably interrupted.
718 def __init__(self, downloaded, expected):
719 self.downloaded = downloaded
720 self.expected = expected
722 class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
723 """Handler for HTTP requests and responses.
725 This class, when installed with an OpenerDirector, automatically adds
726 the standard headers to every HTTP request and handles gzipped and
727 deflated responses from web servers. If compression is to be avoided in
728 a particular request, the original request in the program code only has
729 to include the HTTP header "Youtubedl-No-Compression", which will be
730 removed before making the real request.
732 Part of this code was copied from:
734 http://techknack.net/python-urllib2-handlers/
736 Andrew Rowls, the author of that code, agreed to release it to the
743 return zlib.decompress(data, -zlib.MAX_WBITS)
745 return zlib.decompress(data)
748 def addinfourl_wrapper(stream, headers, url, code):
749 if hasattr(compat_urllib_request.addinfourl, 'getcode'):
750 return compat_urllib_request.addinfourl(stream, headers, url, code)
751 ret = compat_urllib_request.addinfourl(stream, headers, url)
755 def http_request(self, req):
756 for h,v in std_headers.items():
760 if 'Youtubedl-no-compression' in req.headers:
761 if 'Accept-encoding' in req.headers:
762 del req.headers['Accept-encoding']
763 del req.headers['Youtubedl-no-compression']
764 if 'Youtubedl-user-agent' in req.headers:
765 if 'User-agent' in req.headers:
766 del req.headers['User-agent']
767 req.headers['User-agent'] = req.headers['Youtubedl-user-agent']
768 del req.headers['Youtubedl-user-agent']
771 def http_response(self, req, resp):
774 if resp.headers.get('Content-encoding', '') == 'gzip':
775 content = resp.read()
776 gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
778 uncompressed = io.BytesIO(gz.read())
779 except IOError as original_ioerror:
780 # There may be junk add the end of the file
781 # See http://stackoverflow.com/q/4928560/35070 for details
782 for i in range(1, 1024):
784 gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
785 uncompressed = io.BytesIO(gz.read())
790 raise original_ioerror
791 resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
792 resp.msg = old_resp.msg
794 if resp.headers.get('Content-encoding', '') == 'deflate':
795 gz = io.BytesIO(self.deflate(resp.read()))
796 resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
797 resp.msg = old_resp.msg
800 https_request = http_request
801 https_response = http_response
804 def parse_iso8601(date_str, delimiter='T'):
805 """ Return a UNIX timestamp from the given date """
811 r'Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$',
814 timezone = datetime.timedelta()
816 date_str = date_str[:-len(m.group(0))]
817 if not m.group('sign'):
818 timezone = datetime.timedelta()
820 sign = 1 if m.group('sign') == '+' else -1
821 timezone = datetime.timedelta(
822 hours=sign * int(m.group('hours')),
823 minutes=sign * int(m.group('minutes')))
824 date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
825 dt = datetime.datetime.strptime(date_str, date_format) - timezone
826 return calendar.timegm(dt.timetuple())
829 def unified_strdate(date_str):
830 """Return a string with the date in the format YYYYMMDD"""
837 date_str = date_str.replace(',', ' ')
838 # %z (UTC offset) is only supported in python>=3.2
839 date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str)
840 format_expressions = [
845 '%b %dst %Y %I:%M%p',
846 '%b %dnd %Y %I:%M%p',
847 '%b %dth %Y %I:%M%p',
856 '%Y-%m-%dT%H:%M:%SZ',
857 '%Y-%m-%dT%H:%M:%S.%fZ',
858 '%Y-%m-%dT%H:%M:%S.%f0Z',
860 '%Y-%m-%dT%H:%M:%S.%f',
863 for expression in format_expressions:
865 upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
868 if upload_date is None:
869 timetuple = email.utils.parsedate_tz(date_str)
871 upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
874 def determine_ext(url, default_ext=u'unknown_video'):
877 guess = url.partition(u'?')[0].rpartition(u'.')[2]
878 if re.match(r'^[A-Za-z0-9]+$', guess):
def subtitles_filename(filename, sub_lang, sub_format):
    """Build the subtitle file name: <base>.<lang>.<format>."""
    base = filename.rsplit('.', 1)[0]
    return u'.'.join((base, sub_lang, sub_format))
886 def date_from_str(date_str):
888 Return a datetime object from a string in the format YYYYMMDD or
889 (now|today)[+-][0-9](day|week|month|year)(s)?"""
890 today = datetime.date.today()
891 if date_str == 'now'or date_str == 'today':
893 match = re.match('(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
894 if match is not None:
895 sign = match.group('sign')
896 time = int(match.group('time'))
899 unit = match.group('unit')
908 delta = datetime.timedelta(**{unit: time})
910 return datetime.datetime.strptime(date_str, "%Y%m%d").date()
912 def hyphenate_date(date_str):
914 Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
915 match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
916 if match is not None:
917 return '-'.join(match.groups())
921 class DateRange(object):
922 """Represents a time interval between two dates"""
923 def __init__(self, start=None, end=None):
924 """start and end must be strings in the format accepted by date"""
925 if start is not None:
926 self.start = date_from_str(start)
928 self.start = datetime.datetime.min.date()
930 self.end = date_from_str(end)
932 self.end = datetime.datetime.max.date()
933 if self.start > self.end:
934 raise ValueError('Date range: "%s" , the start date must be before the end date' % self)
937 """Returns a range that only contains the given day"""
939 def __contains__(self, date):
940 """Check if the date is in the range"""
941 if not isinstance(date, datetime.date):
942 date = date_from_str(date)
943 return self.start <= date <= self.end
945 return '%s - %s' % ( self.start.isoformat(), self.end.isoformat())
949 """ Returns the platform name as a compat_str """
950 res = platform.platform()
951 if isinstance(res, bytes):
952 res = res.decode(preferredencoding())
954 assert isinstance(res, compat_str)
958 def _windows_write_string(s, out):
959 """ Returns True if the string was written using special methods,
960 False if it has yet to be written out."""
961 # Adapted from http://stackoverflow.com/a/3259271/35070
964 import ctypes.wintypes
972 fileno = out.fileno()
973 except AttributeError:
974 # If the output stream doesn't have a fileno, it's virtual
976 if fileno not in WIN_OUTPUT_IDS:
979 GetStdHandle = ctypes.WINFUNCTYPE(
980 ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
981 ("GetStdHandle", ctypes.windll.kernel32))
982 h = GetStdHandle(WIN_OUTPUT_IDS[fileno])
984 WriteConsoleW = ctypes.WINFUNCTYPE(
985 ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
986 ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
987 ctypes.wintypes.LPVOID)(("WriteConsoleW", ctypes.windll.kernel32))
988 written = ctypes.wintypes.DWORD(0)
990 GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)(("GetFileType", ctypes.windll.kernel32))
991 FILE_TYPE_CHAR = 0x0002
992 FILE_TYPE_REMOTE = 0x8000
993 GetConsoleMode = ctypes.WINFUNCTYPE(
994 ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
995 ctypes.POINTER(ctypes.wintypes.DWORD))(
996 ("GetConsoleMode", ctypes.windll.kernel32))
997 INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value
999 def not_a_console(handle):
1000 if handle == INVALID_HANDLE_VALUE or handle is None:
1002 return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR
1003 or GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)
1005 if not_a_console(h):
1008 def next_nonbmp_pos(s):
1010 return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
1011 except StopIteration:
1015 count = min(next_nonbmp_pos(s), 1024)
1017 ret = WriteConsoleW(
1018 h, s, count if count else 2, ctypes.byref(written), None)
1020 raise OSError('Failed to write string')
1021 if not count: # We just wrote a non-BMP character
1022 assert written.value == 2
1025 assert written.value > 0
1026 s = s[written.value:]
1030 def write_string(s, out=None, encoding=None):
1033 assert type(s) == compat_str
1035 if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
1036 if _windows_write_string(s, out):
1039 if ('b' in getattr(out, 'mode', '') or
1040 sys.version_info[0] < 3): # Python 2 lies about mode of sys.stderr
1041 byt = s.encode(encoding or preferredencoding(), 'ignore')
1043 elif hasattr(out, 'buffer'):
1044 enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
1045 byt = s.encode(enc, 'ignore')
1046 out.buffer.write(byt)
1052 def bytes_to_intlist(bs):
1055 if isinstance(bs[0], int): # Python 3
1058 return [ord(c) for c in bs]
1061 def intlist_to_bytes(xs):
1064 if isinstance(chr(0), bytes): # Python 2
1065 return ''.join([chr(x) for x in xs])
def get_cachedir(params=None):
    """Return the cache directory youtube-dl should use.

    An explicit 'cachedir' entry in *params* wins; otherwise fall back to
    $XDG_CACHE_HOME/youtube-dl (or ~/.cache/youtube-dl when the variable
    is unset).

    @param params  optional options dict; only the 'cachedir' key is read
    """
    # The old signature used a mutable default (params={}); use None to
    # avoid the shared-mutable-default pitfall while keeping callers
    # that pass no argument working unchanged.
    if params is None:
        params = {}
    cache_root = os.environ.get('XDG_CACHE_HOME',
                                os.path.expanduser('~/.cache'))
    return params.get('cachedir', os.path.join(cache_root, 'youtube-dl'))
1076 # Cross-platform file locking
1077 if sys.platform == 'win32':
1078 import ctypes.wintypes
1081 class OVERLAPPED(ctypes.Structure):
1083 ('Internal', ctypes.wintypes.LPVOID),
1084 ('InternalHigh', ctypes.wintypes.LPVOID),
1085 ('Offset', ctypes.wintypes.DWORD),
1086 ('OffsetHigh', ctypes.wintypes.DWORD),
1087 ('hEvent', ctypes.wintypes.HANDLE),
1090 kernel32 = ctypes.windll.kernel32
1091 LockFileEx = kernel32.LockFileEx
1092 LockFileEx.argtypes = [
1093 ctypes.wintypes.HANDLE, # hFile
1094 ctypes.wintypes.DWORD, # dwFlags
1095 ctypes.wintypes.DWORD, # dwReserved
1096 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
1097 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
1098 ctypes.POINTER(OVERLAPPED) # Overlapped
1100 LockFileEx.restype = ctypes.wintypes.BOOL
1101 UnlockFileEx = kernel32.UnlockFileEx
1102 UnlockFileEx.argtypes = [
1103 ctypes.wintypes.HANDLE, # hFile
1104 ctypes.wintypes.DWORD, # dwReserved
1105 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
1106 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
1107 ctypes.POINTER(OVERLAPPED) # Overlapped
1109 UnlockFileEx.restype = ctypes.wintypes.BOOL
1110 whole_low = 0xffffffff
1111 whole_high = 0x7fffffff
def _lock_file(f, exclusive):
    """Acquire a whole-file lock on *f* via LockFileEx (Windows)."""
    ov = OVERLAPPED()
    ov.Offset = 0
    ov.OffsetHigh = 0
    ov.hEvent = 0
    # Keep the OVERLAPPED pointer alive on the file object; _unlock_file
    # reuses it when releasing the lock.
    f._lock_file_overlapped_p = ctypes.pointer(ov)
    handle = msvcrt.get_osfhandle(f.fileno())
    flags = 0x2 if exclusive else 0x0  # 0x2 = exclusive-lock flag
    ok = LockFileEx(handle, flags, 0,
                    whole_low, whole_high, f._lock_file_overlapped_p)
    if not ok:
        raise OSError('Locking file failed: %r' % ctypes.FormatError())
def _unlock_file(f):
    """Release the whole-file lock previously taken by _lock_file (Windows)."""
    # _lock_file must have stashed the OVERLAPPED pointer on the object.
    assert f._lock_file_overlapped_p
    handle = msvcrt.get_osfhandle(f.fileno())
    ok = UnlockFileEx(handle, 0, whole_low, whole_high,
                      f._lock_file_overlapped_p)
    if not ok:
        raise OSError('Unlocking file failed: %r' % ctypes.FormatError())
1134 def _lock_file(f, exclusive):
1135 fcntl.lockf(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)
1137 def _unlock_file(f):
1138 fcntl.lockf(f, fcntl.LOCK_UN)
1141 class locked_file(object):
1142 def __init__(self, filename, mode, encoding=None):
1143 assert mode in ['r', 'a', 'w']
1144 self.f = io.open(filename, mode, encoding=encoding)
1147 def __enter__(self):
1148 exclusive = self.mode != 'r'
1150 _lock_file(self.f, exclusive)
1156 def __exit__(self, etype, value, traceback):
1158 _unlock_file(self.f)
1165 def write(self, *args):
1166 return self.f.write(*args)
1168 def read(self, *args):
1169 return self.f.read(*args)
1172 def shell_quote(args):
1174 encoding = sys.getfilesystemencoding()
1175 if encoding is None:
1178 if isinstance(a, bytes):
1179 # We may get a filename encoded with 'encodeFilename'
1180 a = a.decode(encoding)
1181 quoted_args.append(pipes.quote(a))
1182 return u' '.join(quoted_args)
1185 def takewhile_inclusive(pred, seq):
1186 """ Like itertools.takewhile, but include the latest evaluated element
1187 (the first element so that Not pred(e)) """
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use. """
    # Serialize the payload as JSON and hide it in the URL fragment so
    # ordinary URL handling is unaffected.
    payload = compat_urllib_parse.urlencode(
        {u'__youtubedl_smuggle': json.dumps(data)})
    return u'#'.join([url, payload])
1202 def unsmuggle_url(smug_url, default=None):
1203 if not '#__youtubedl_smuggle' in smug_url:
1204 return smug_url, default
1205 url, _, sdata = smug_url.rpartition(u'#')
1206 jsond = compat_parse_qs(sdata)[u'__youtubedl_smuggle'][0]
1207 data = json.loads(jsond)
1211 def format_bytes(bytes):
1214 if type(bytes) is str:
1215 bytes = float(bytes)
1219 exponent = int(math.log(bytes, 1024.0))
1220 suffix = [u'B', u'KiB', u'MiB', u'GiB', u'TiB', u'PiB', u'EiB', u'ZiB', u'YiB'][exponent]
1221 converted = float(bytes) / float(1024 ** exponent)
1222 return u'%.2f%s' % (converted, suffix)
1225 def get_term_width():
1226 columns = os.environ.get('COLUMNS', None)
1231 sp = subprocess.Popen(
1233 stdout=subprocess.PIPE, stderr=subprocess.PIPE)
1234 out, err = sp.communicate()
1235 return int(out.split()[1])
1241 def month_by_name(name):
1242 """ Return the number of a month by (locale-independently) English name """
1245 u'January', u'February', u'March', u'April', u'May', u'June',
1246 u'July', u'August', u'September', u'October', u'November', u'December']
1248 return ENGLISH_NAMES.index(name) + 1
1253 def fix_xml_ampersands(xml_str):
1254 """Replace all the '&' by '&' in XML"""
1256 r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
1261 def setproctitle(title):
1262 assert isinstance(title, compat_str)
1264 libc = ctypes.cdll.LoadLibrary("libc.so.6")
1267 title_bytes = title.encode('utf-8')
1268 buf = ctypes.create_string_buffer(len(title_bytes))
1269 buf.value = title_bytes
1271 libc.prctl(15, buf, 0, 0, 0)
1272 except AttributeError:
1273 return # Strange libc, just skip this
1276 def remove_start(s, start):
1277 if s.startswith(start):
1278 return s[len(start):]
def url_basename(url):
    """Return the last path component of *url* (query/fragment excluded)."""
    parsed_path = compat_urlparse.urlparse(url).path
    segments = parsed_path.strip(u'/').split(u'/')
    return segments[-1]
1287 class HEADRequest(compat_urllib_request.Request):
1288 def get_method(self):
1292 def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
1295 v = getattr(v, get_attr, None)
1298 return default if v is None else (int(v) * invscale // scale)
def str_or_none(v, default=None):
    """Coerce *v* to compat_str, or return *default* when v is None."""
    if v is None:
        return default
    return compat_str(v)
1305 def str_to_int(int_str):
1308 int_str = re.sub(r'[,\.]', u'', int_str)
def float_or_none(v, scale=1, invscale=1, default=None):
    """Convert *v* to float rescaled by invscale/scale; *default* if v is None."""
    if v is None:
        return default
    return float(v) * invscale / scale
# Parse a duration string like '12:34', '1:02:03' or '3m45s' into a
# number of seconds.  The None guard, the re.match call head and the
# final 'return res' are outside this excerpt.
1316 def parse_duration(s):
# hours and minutes are optional; an optional trailing ':NN' (e.g. a
# frame count) is matched but discarded.
1321 r'(?:(?:(?P<hours>[0-9]+)[:h])?(?P<mins>[0-9]+)[:m])?(?P<secs>[0-9]+)s?(?::[0-9]+)?$', s)
1324 res = int(m.group('secs'))
1326 res += int(m.group('mins')) * 60
1327 if m.group('hours'):
1328 res += int(m.group('hours')) * 60 * 60
def prepend_extension(filename, ext):
    """Insert *ext* in front of the file's real extension.

    Example: prepend_extension(u'clip.mp4', u'temp') -> u'clip.temp.mp4'.
    A filename without an extension simply gets '.ext' appended.
    """
    root, real_ext = os.path.splitext(filename)
    return u'{0}.{1}{2}'.format(root, ext, real_ext)
1337 def check_executable(exe, args=[]):
1338 """ Checks if the given binary is installed somewhere in PATH, and returns its name.
1339 args can be a list of arguments for a short output (like -version) """
# NOTE(review): mutable default for *args* -- harmless as long as it is
# never mutated, but worth flagging.  The enclosing try and its OSError
# handler (returning False / the name on success) are outside this
# excerpt.
1341 subprocess.Popen([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
# Lazy list over a paged data source: pagefunc(pagenum) yields the items
# of one page, pagesize items per full page.  Several lines of this
# class (the __len__ def line, 'continue' statements, the startv/endv
# assignment heads and the final 'return res') are outside this excerpt.
1347 class PagedList(object):
1348 def __init__(self, pagefunc, pagesize):
# Callable returning the contents of page *pagenum*.
1349 self._pagefunc = pagefunc
1350 self._pagesize = pagesize
1353 # This is only useful for tests
1354 return len(self.getslice())
# Materialize items [start, end) by fetching only the pages that
# overlap the requested range.
1356 def getslice(self, start=0, end=None):
# Start iterating from the page containing *start*.
1358 for pagenum in itertools.count(start // self._pagesize):
1359 firstid = pagenum * self._pagesize
1360 nextfirstid = pagenum * self._pagesize + self._pagesize
# Page lies wholly before the requested range (the skip statement is
# not visible here).
1361 if start >= nextfirstid:
1364 page_results = list(self._pagefunc(pagenum))
# Offset of *start* within this page; 0 for all later pages.
1367 start % self._pagesize
1368 if firstid <= start < nextfirstid
# Offset just past *end* within this page; None when end lies beyond it.
1372 ((end - 1) % self._pagesize) + 1
1373 if (end is not None and firstid <= end <= nextfirstid)
1376 if startv != 0 or endv is not None:
1377 page_results = page_results[startv:endv]
1378 res.extend(page_results)
1380 # A little optimization - if current page is not "full", ie. does
1381 # not contain page_size videos then we can assume that this page
1382 # is the last one - there are no more ids on further pages -
1383 # i.e. no need to query again.
1384 if len(page_results) + startv < self._pagesize:
1387 # If we got the whole page, but the next page is not interesting,
1388 # break out early as well
1389 if end == nextfirstid:
# Decode uppercase \UXXXXXXXX escape sequences embedded in *s*.  The
# 'return re.sub(' head and the closing ', s)' of the call are outside
# this excerpt.
1394 def uppercase_escape(s):
1395 unicode_escape = codecs.getdecoder('unicode_escape')
1397 r'\\U[0-9a-fA-F]{8}',
# getdecoder returns (decoded_text, consumed_length); [0] keeps the text.
1398 lambda m: unicode_escape(m.group(0))[0],
# Compatibility shims for struct: probe once whether struct.pack accepts
# a text (unicode) format spec.  The enclosing try, its 'except
# TypeError:' and the 'else:' arm are outside this excerpt.
1402 struct.pack(u'!I', 0)
1404 # In Python 2.6 (and some 2.7 versions), struct requires a bytes argument
# Wrapper that re-encodes a text spec to ASCII bytes before packing.
1405 def struct_pack(spec, *args):
1406 if isinstance(spec, compat_str):
1407 spec = spec.encode('ascii')
1408 return struct.pack(spec, *args)
# Same treatment for the unpack direction.
1410 def struct_unpack(spec, *args):
1411 if isinstance(spec, compat_str):
1412 spec = spec.encode('ascii')
1413 return struct.unpack(spec, *args)
# Modern interpreters accept text specs directly -- use struct as-is.
1415 struct_pack = struct.pack
1416 struct_unpack = struct.unpack
# Read URLs from a batch file object, normalizing each line and skipping
# comments.  The inner 'def fixup(url):' header and its 'return False' /
# 'return url' lines are outside this excerpt.
1419 def read_batch_urls(batch_fd):
# Py2 file objects yield bytes; decode leniently to text.
1421 if not isinstance(url, compat_str):
1422 url = url.decode('utf-8', 'replace')
# NOTE(review): these are the UTF-8 BOM *bytes* as code points
# U+00EF U+00BB U+00BF (a BOM surviving a latin-1-style decode), not
# u'\ufeff' -- looks intentional upstream, but worth confirming.
1423 BOM_UTF8 = u'\xef\xbb\xbf'
1424 if url.startswith(BOM_UTF8):
1425 url = url[len(BOM_UTF8):]
# Lines starting with a comment marker are dropped.
1427 if url.startswith(('#', ';', ']')):
# closing() guarantees the caller's file object is closed even on error.
1431 with contextlib.closing(batch_fd) as fd:
# Falsy fixup results (False / empty strings) are filtered out.
1432 return [url for url in map(fixup, fd) if url]
def urlencode_postdata(*args, **kargs):
    """URL-encode form data and return it as ASCII bytes.

    urllib request bodies must be bytes rather than text, so the result
    of compat_urllib_parse.urlencode is encoded before being returned.
    """
    encoded = compat_urllib_parse.urlencode(*args, **kargs)
    return encoded.encode('ascii')
# Body of parse_xml (its 'def parse_xml(s):' header at line 1439 is
# outside this excerpt): parse an XML string while ignoring doctypes.
1440 class TreeBuilder(xml.etree.ElementTree.TreeBuilder):
# Swallow doctype declarations instead of processing them.
1441 def doctype(self, name, pubid, system):
1442 pass # Ignore doctypes
1444 parser = xml.etree.ElementTree.XMLParser(target=TreeBuilder())
# The 'parser' keyword of ET.XML is only passed on Python >= 2.7; older
# interpreters fall back to the default parser.
1445 kwargs = {'parser': parser} if sys.version_info >= (2, 7) else {}
1446 return xml.etree.ElementTree.XML(s.encode('utf-8'), **kwargs)
# On Python 2 under Windows, getpass.getpass cannot handle unicode
# prompts, so encode the prompt first; on every other platform/version
# (the 'else:' line is outside this excerpt) the stdlib function is
# used unchanged.
1449 if sys.version_info < (3, 0) and sys.platform == 'win32':
1450 def compat_getpass(prompt, *args, **kwargs):
1451 if isinstance(prompt, compat_str):
1452 prompt = prompt.encode(preferredencoding())
1453 return getpass.getpass(prompt, *args, **kwargs)
1455 compat_getpass = getpass.getpass
def strip_jsonp(code):
    """Strip a JSONP callback wrapper: u'cb({...});' -> u'{...}'.

    Input without a recognizable wrapper is returned unchanged.
    """
    jsonp_wrapper = r'(?s)^[a-zA-Z0-9_]+\s*\(\s*(.*)\);?\s*?\s*$'
    return re.sub(jsonp_wrapper, r'\1', code)
# Build a quality-ranking function.  The inner 'def q(qid):' header and
# the final 'return q' are outside this excerpt; a higher index in
# *quality_ids* means better quality.
1471 def qualities(quality_ids):
1472 """ Get a numeric quality value out of a list of possible values """
# list.index raises ValueError for an id not in the list.
1475 return quality_ids.index(qid)
# Default output filename template: video title, then id, then extension.
1481 DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
# Backport of subprocess.check_output for Python 2.6 (where it does not
# exist).  The enclosing 'try:' line, the 'ret = p.returncode' /
# 'if ret:' pair and the final 'return output' are outside this excerpt.
1484 subprocess_check_output = subprocess.check_output
1485 except AttributeError:
1486 def subprocess_check_output(*args, **kwargs):
# The 'input' kwarg of the real check_output is not supported here.
1487 assert 'input' not in kwargs
1488 p = subprocess.Popen(*args, stdout=subprocess.PIPE, **kwargs)
1489 output, _ = p.communicate()
# NOTE(review): p.args only exists on Python 3 Popen; on the 2.6 path
# this shim targets, the line below would raise AttributeError --
# confirm against upstream (later fixed there).
1492 raise subprocess.CalledProcessError(ret, p.args, output=output)