2 # -*- coding: utf-8 -*-
28 import xml.etree.ElementTree
32 import urllib.request as compat_urllib_request
33 except ImportError: # Python 2
34 import urllib2 as compat_urllib_request
37 import urllib.error as compat_urllib_error
38 except ImportError: # Python 2
39 import urllib2 as compat_urllib_error
42 import urllib.parse as compat_urllib_parse
43 except ImportError: # Python 2
44 import urllib as compat_urllib_parse
47 from urllib.parse import urlparse as compat_urllib_parse_urlparse
48 except ImportError: # Python 2
49 from urlparse import urlparse as compat_urllib_parse_urlparse
52 import urllib.parse as compat_urlparse
53 except ImportError: # Python 2
54 import urlparse as compat_urlparse
57 import http.cookiejar as compat_cookiejar
58 except ImportError: # Python 2
59 import cookielib as compat_cookiejar
62 import html.entities as compat_html_entities
63 except ImportError: # Python 2
64 import htmlentitydefs as compat_html_entities
67 import html.parser as compat_html_parser
68 except ImportError: # Python 2
69 import HTMLParser as compat_html_parser
72 import http.client as compat_http_client
73 except ImportError: # Python 2
74 import httplib as compat_http_client
77 from urllib.error import HTTPError as compat_HTTPError
78 except ImportError: # Python 2
79 from urllib2 import HTTPError as compat_HTTPError
82 from urllib.request import urlretrieve as compat_urlretrieve
83 except ImportError: # Python 2
84 from urllib import urlretrieve as compat_urlretrieve
88 from subprocess import DEVNULL
89 compat_subprocess_get_DEVNULL = lambda: DEVNULL
91 compat_subprocess_get_DEVNULL = lambda: open(os.path.devnull, 'w')
94 from urllib.parse import unquote as compat_urllib_parse_unquote
96 def compat_urllib_parse_unquote(string, encoding='utf-8', errors='replace'):
99 res = string.split('%')
106 # pct_sequence: contiguous sequence of percent-encoded bytes, decoded
113 pct_sequence += item[:2].decode('hex')
116 # This segment was just a single percent-encoded character.
117 # May be part of a sequence of code units, so delay decoding.
118 # (Stored in pct_sequence).
122 # Encountered non-percent-encoded characters. Flush the current
124 string += pct_sequence.decode(encoding, errors) + rest
127 # Flush the final pct_sequence
128 string += pct_sequence.decode(encoding, errors)
133 from urllib.parse import parse_qs as compat_parse_qs
134 except ImportError: # Python 2
135 # HACK: The following is the correct parse_qs implementation from cpython 3's stdlib.
136 # Python 2's version is apparently totally broken
138 def _parse_qsl(qs, keep_blank_values=False, strict_parsing=False,
139 encoding='utf-8', errors='replace'):
140 qs, _coerce_result = qs, unicode
141 pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')]
143 for name_value in pairs:
144 if not name_value and not strict_parsing:
146 nv = name_value.split('=', 1)
149 raise ValueError("bad query field: %r" % (name_value,))
150 # Handle case of a control-name with no equal sign
151 if keep_blank_values:
155 if len(nv[1]) or keep_blank_values:
156 name = nv[0].replace('+', ' ')
157 name = compat_urllib_parse_unquote(
158 name, encoding=encoding, errors=errors)
159 name = _coerce_result(name)
160 value = nv[1].replace('+', ' ')
161 value = compat_urllib_parse_unquote(
162 value, encoding=encoding, errors=errors)
163 value = _coerce_result(value)
164 r.append((name, value))
167 def compat_parse_qs(qs, keep_blank_values=False, strict_parsing=False,
168 encoding='utf-8', errors='replace'):
170 pairs = _parse_qsl(qs, keep_blank_values, strict_parsing,
171 encoding=encoding, errors=errors)
172 for name, value in pairs:
173 if name in parsed_result:
174 parsed_result[name].append(value)
176 parsed_result[name] = [value]
180 compat_str = unicode # Python 2
185 compat_chr = unichr # Python 2
190 from xml.etree.ElementTree import ParseError as compat_xml_parse_error
191 except ImportError: # Python 2.6
192 from xml.parsers.expat import ExpatError as compat_xml_parse_error
195 if type(c) is int: return c
198 # This is not clearly defined otherwise
199 compiled_regex_type = type(re.compile(''))
202 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0 (Chrome)',
203 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
204 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
205 'Accept-Encoding': 'gzip, deflate',
206 'Accept-Language': 'en-us,en;q=0.5',
209 def preferredencoding():
210 """Get preferred encoding.
212 Returns the best encoding scheme for the system, based on
213 locale.getpreferredencoding() and some further tweaks.
216 pref = locale.getpreferredencoding()
223 if sys.version_info < (3,0):
225 print(s.encode(preferredencoding(), 'xmlcharrefreplace'))
228 assert type(s) == type(u'')
231 # In Python 2.x, json.dump expects a bytestream.
232 # In Python 3.x, it writes to a character stream
233 if sys.version_info < (3,0):
234 def write_json_file(obj, fn):
235 with open(fn, 'wb') as f:
238 def write_json_file(obj, fn):
239 with open(fn, 'w', encoding='utf-8') as f:
242 if sys.version_info >= (2,7):
def find_xpath_attr(node, xpath, key, val):
    """Locate the first element matching xpath[@key=val] under node."""
    # Sanity-check attribute name and value before splicing them into the
    # XPath expression (ElementTree offers no quoting mechanism).
    assert re.match(r'^[a-zA-Z-]+$', key)
    assert re.match(r'^[a-zA-Z0-9@\s:._-]*$', val)
    return node.find(xpath + u"[@%s='%s']" % (key, val))
250 def find_xpath_attr(node, xpath, key, val):
251 for f in node.findall(xpath):
252 if f.attrib.get(key) == val:
256 # On python2.6 the xml.etree.ElementTree.Element methods don't support
257 # the namespace parameter
258 def xpath_with_ns(path, ns_map):
259 components = [c.split(':') for c in path.split('/')]
263 replaced.append(c[0])
266 replaced.append('{%s}%s' % (ns_map[ns], tag))
267 return '/'.join(replaced)
269 def htmlentity_transform(matchobj):
270 """Transforms an HTML entity to a character.
272 This function receives a match object and is intended to be used with
273 the re.sub() function.
275 entity = matchobj.group(1)
277 # Known non-numeric HTML entity
278 if entity in compat_html_entities.name2codepoint:
279 return compat_chr(compat_html_entities.name2codepoint[entity])
281 mobj = re.match(u'(?u)#(x?\\d+)', entity)
283 numstr = mobj.group(1)
284 if numstr.startswith(u'x'):
286 numstr = u'0%s' % numstr
289 return compat_chr(int(numstr, base))
291 # Unknown entity in name, return its literal representation
292 return (u'&%s;' % entity)
294 compat_html_parser.locatestarttagend = re.compile(r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re.VERBOSE) # backport bugfix
295 class BaseHTMLParser(compat_html_parser.HTMLParser):
297 compat_html_parser.HTMLParser.__init__(self)
300 def loads(self, html):
305 class AttrParser(BaseHTMLParser):
306 """Modified HTMLParser that isolates a tag with the specified attribute"""
307 def __init__(self, attribute, value):
308 self.attribute = attribute
313 self.watch_startpos = False
315 BaseHTMLParser.__init__(self)
317 def error(self, message):
318 if self.error_count > 10 or self.started:
319 raise compat_html_parser.HTMLParseError(message, self.getpos())
320 self.rawdata = '\n'.join(self.html.split('\n')[self.getpos()[0]:]) # skip one line
321 self.error_count += 1
324 def handle_starttag(self, tag, attrs):
327 self.find_startpos(None)
328 if self.attribute in attrs and attrs[self.attribute] == self.value:
331 self.watch_startpos = True
333 if not tag in self.depth: self.depth[tag] = 0
336 def handle_endtag(self, tag):
338 if tag in self.depth: self.depth[tag] -= 1
339 if self.depth[self.result[0]] == 0:
341 self.result.append(self.getpos())
343 def find_startpos(self, x):
344 """Needed to put the start position of the result (self.result[1])
345 after the opening tag with the requested id"""
346 if self.watch_startpos:
347 self.watch_startpos = False
348 self.result.append(self.getpos())
349 handle_entityref = handle_charref = handle_data = handle_comment = \
350 handle_decl = handle_pi = unknown_decl = find_startpos
352 def get_result(self):
353 if self.result is None:
355 if len(self.result) != 3:
357 lines = self.html.split('\n')
358 lines = lines[self.result[1][0]-1:self.result[2][0]]
359 lines[0] = lines[0][self.result[1][1]:]
361 lines[-1] = lines[-1][:self.result[2][1]-self.result[1][1]]
362 lines[-1] = lines[-1][:self.result[2][1]]
363 return '\n'.join(lines).strip()
364 # Hack for https://github.com/rg3/youtube-dl/issues/662
365 if sys.version_info < (2, 7, 3):
366 AttrParser.parse_endtag = (lambda self, i:
367 i + len("</scr'+'ipt>")
368 if self.rawdata[i:].startswith("</scr'+'ipt>")
369 else compat_html_parser.HTMLParser.parse_endtag(self, i))
def get_element_by_id(id, html):
    """Return the content of the tag with the given ID in an HTML document."""
    # Thin convenience wrapper over the generic attribute search.
    attribute_name = "id"
    return get_element_by_attribute(attribute_name, id, html)
375 def get_element_by_attribute(attribute, value, html):
376 """Return the content of the tag with the specified attribute in the passed HTML document"""
377 parser = AttrParser(attribute, value)
380 except compat_html_parser.HTMLParseError:
382 return parser.get_result()
384 class MetaParser(BaseHTMLParser):
386 Modified HTMLParser that isolates a meta tag with the specified name
389 def __init__(self, name):
390 BaseHTMLParser.__init__(self)
395 def handle_starttag(self, tag, attrs):
399 if attrs.get('name') == self.name:
400 self.result = attrs.get('content')
402 def get_result(self):
405 def get_meta_content(name, html):
407 Return the content attribute from the meta tag with the given name attribute.
409 parser = MetaParser(name)
412 except compat_html_parser.HTMLParseError:
414 return parser.get_result()
417 def clean_html(html):
418 """Clean an HTML snippet into a readable string"""
420 html = html.replace('\n', ' ')
421 html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
422 html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
424 html = re.sub('<.*?>', '', html)
425 # Replace html entities
426 html = unescapeHTML(html)
430 def sanitize_open(filename, open_mode):
431 """Try to open the given filename, and slightly tweak it if this fails.
433 Attempts to open the given filename. If this fails, it tries to change
434 the filename slightly, step by step, until it's either able to open it
435 or it fails and raises a final exception, like the standard open()
438 It returns the tuple (stream, definitive_file_name).
442 if sys.platform == 'win32':
444 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
445 return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
446 stream = open(encodeFilename(filename), open_mode)
447 return (stream, filename)
448 except (IOError, OSError) as err:
449 if err.errno in (errno.EACCES,):
452 # In case of error, try to remove win32 forbidden chars
453 alt_filename = os.path.join(
454 re.sub(u'[/<>:"\\|\\\\?\\*]', u'#', path_part)
455 for path_part in os.path.split(filename)
457 if alt_filename == filename:
460 # An exception here should be caught in the caller
461 stream = open(encodeFilename(filename), open_mode)
462 return (stream, alt_filename)
465 def timeconvert(timestr):
466 """Convert RFC 2822 defined time string into system timestamp"""
468 timetuple = email.utils.parsedate_tz(timestr)
469 if timetuple is not None:
470 timestamp = email.utils.mktime_tz(timetuple)
473 def sanitize_filename(s, restricted=False, is_id=False):
474 """Sanitizes a string so it could be used as part of a filename.
475 If restricted is set, use a stricter subset of allowed characters.
476 Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
478 def replace_insane(char):
479 if char == '?' or ord(char) < 32 or ord(char) == 127:
482 return '' if restricted else '\''
484 return '_-' if restricted else ' -'
485 elif char in '\\/|*<>':
487 if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
489 if restricted and ord(char) > 127:
493 result = u''.join(map(replace_insane, s))
495 while '__' in result:
496 result = result.replace('__', '_')
497 result = result.strip('_')
498 # Common case of "Foreign band name - English song title"
499 if restricted and result.startswith('-_'):
505 def orderedSet(iterable):
506 """ Remove all duplicates from the input iterable """
517 assert type(s) == compat_str
519 result = re.sub(r'(?u)&(.+?);', htmlentity_transform, s)
523 def encodeFilename(s, for_subprocess=False):
525 @param s The name of the file
528 assert type(s) == compat_str
530 # Python 3 has a Unicode API
531 if sys.version_info >= (3, 0):
534 if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
535 # Pass u'' directly to use Unicode APIs on Windows 2000 and up
536 # (Detecting Windows NT 4 is tricky because 'major >= 4' would
537 # match Windows 9x series as well. Besides, NT 4 is obsolete.)
538 if not for_subprocess:
541 # For subprocess calls, encode with locale encoding
542 # Refer to http://stackoverflow.com/a/9951851/35070
543 encoding = preferredencoding()
545 encoding = sys.getfilesystemencoding()
548 return s.encode(encoding, 'ignore')
def encodeArgument(s):
    """Encode a subprocess argument, accepting legacy byte strings."""
    if isinstance(s, compat_str):
        arg = s
    else:
        # Legacy callers still hand us byte strings; promote them to text.
        # TODO: once all post processors pass compat_str, turn this into
        # an assertion failure instead of silently decoding.
        arg = s.decode('ascii')
    return encodeFilename(arg, True)
560 def decodeOption(optval):
563 if isinstance(optval, bytes):
564 optval = optval.decode(preferredencoding())
566 assert isinstance(optval, compat_str)
569 def formatSeconds(secs):
571 return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
573 return '%d:%02d' % (secs // 60, secs % 60)
578 def make_HTTPS_handler(opts_no_check_certificate, **kwargs):
579 if sys.version_info < (3, 2):
582 class HTTPSConnectionV3(httplib.HTTPSConnection):
583 def __init__(self, *args, **kwargs):
584 httplib.HTTPSConnection.__init__(self, *args, **kwargs)
587 sock = socket.create_connection((self.host, self.port), self.timeout)
588 if getattr(self, '_tunnel_host', False):
592 self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_SSLv3)
594 self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_SSLv23)
596 class HTTPSHandlerV3(compat_urllib_request.HTTPSHandler):
597 def https_open(self, req):
598 return self.do_open(HTTPSConnectionV3, req)
599 return HTTPSHandlerV3(**kwargs)
601 context = ssl.SSLContext(ssl.PROTOCOL_SSLv3)
602 context.verify_mode = (ssl.CERT_NONE
603 if opts_no_check_certificate
604 else ssl.CERT_REQUIRED)
605 context.set_default_verify_paths()
607 context.load_default_certs()
608 except AttributeError:
610 return compat_urllib_request.HTTPSHandler(context=context, **kwargs)
612 class ExtractorError(Exception):
613 """Error during info extraction."""
614 def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
615 """ tb, if given, is the original traceback (so that it can be printed out).
616 If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
619 if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
621 if video_id is not None:
622 msg = video_id + ': ' + msg
624 msg = msg + u'; please report this issue on https://yt-dl.org/bug . Be sure to call youtube-dl with the --verbose flag and include its complete output. Make sure you are using the latest version; type youtube-dl -U to update.'
625 super(ExtractorError, self).__init__(msg)
628 self.exc_info = sys.exc_info() # preserve original exception
630 self.video_id = video_id
632 def format_traceback(self):
633 if self.traceback is None:
635 return u''.join(traceback.format_tb(self.traceback))
638 class RegexNotFoundError(ExtractorError):
639 """Error when a regex didn't match"""
643 class DownloadError(Exception):
644 """Download Error exception.
646 This exception may be thrown by FileDownloader objects if they are not
647 configured to continue on errors. They will contain the appropriate
650 def __init__(self, msg, exc_info=None):
651 """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
652 super(DownloadError, self).__init__(msg)
653 self.exc_info = exc_info
656 class SameFileError(Exception):
657 """Same File exception.
659 This exception will be thrown by FileDownloader objects if they detect
660 multiple files would have to be downloaded to the same file on disk.
665 class PostProcessingError(Exception):
666 """Post Processing exception.
668 This exception may be raised by PostProcessor's .run() method to
669 indicate an error in the postprocessing task.
671 def __init__(self, msg):
674 class MaxDownloadsReached(Exception):
675 """ --max-downloads limit has been reached. """
679 class UnavailableVideoError(Exception):
680 """Unavailable Format exception.
682 This exception will be thrown when a video is requested
683 in a format that is not available for that video.
688 class ContentTooShortError(Exception):
689 """Content Too Short exception.
691 This exception may be raised by FileDownloader objects when a file they
692 download is too small for what the server announced first, indicating
693 the connection was probably interrupted.
699 def __init__(self, downloaded, expected):
700 self.downloaded = downloaded
701 self.expected = expected
703 class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
704 """Handler for HTTP requests and responses.
706 This class, when installed with an OpenerDirector, automatically adds
707 the standard headers to every HTTP request and handles gzipped and
708 deflated responses from web servers. If compression is to be avoided in
709 a particular request, the original request in the program code only has
710 to include the HTTP header "Youtubedl-No-Compression", which will be
711 removed before making the real request.
713 Part of this code was copied from:
715 http://techknack.net/python-urllib2-handlers/
717 Andrew Rowls, the author of that code, agreed to release it to the
724 return zlib.decompress(data, -zlib.MAX_WBITS)
726 return zlib.decompress(data)
729 def addinfourl_wrapper(stream, headers, url, code):
730 if hasattr(compat_urllib_request.addinfourl, 'getcode'):
731 return compat_urllib_request.addinfourl(stream, headers, url, code)
732 ret = compat_urllib_request.addinfourl(stream, headers, url)
736 def http_request(self, req):
737 for h,v in std_headers.items():
741 if 'Youtubedl-no-compression' in req.headers:
742 if 'Accept-encoding' in req.headers:
743 del req.headers['Accept-encoding']
744 del req.headers['Youtubedl-no-compression']
745 if 'Youtubedl-user-agent' in req.headers:
746 if 'User-agent' in req.headers:
747 del req.headers['User-agent']
748 req.headers['User-agent'] = req.headers['Youtubedl-user-agent']
749 del req.headers['Youtubedl-user-agent']
752 def http_response(self, req, resp):
755 if resp.headers.get('Content-encoding', '') == 'gzip':
756 content = resp.read()
757 gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
759 uncompressed = io.BytesIO(gz.read())
760 except IOError as original_ioerror:
761 # There may be junk add the end of the file
762 # See http://stackoverflow.com/q/4928560/35070 for details
763 for i in range(1, 1024):
765 gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
766 uncompressed = io.BytesIO(gz.read())
771 raise original_ioerror
772 resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
773 resp.msg = old_resp.msg
775 if resp.headers.get('Content-encoding', '') == 'deflate':
776 gz = io.BytesIO(self.deflate(resp.read()))
777 resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
778 resp.msg = old_resp.msg
781 https_request = http_request
782 https_response = http_response
785 def parse_iso8601(date_str, delimiter='T'):
786 """ Return a UNIX timestamp from the given date """
792 r'Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$',
795 timezone = datetime.timedelta()
797 date_str = date_str[:-len(m.group(0))]
798 if not m.group('sign'):
799 timezone = datetime.timedelta()
801 sign = 1 if m.group('sign') == '+' else -1
802 timezone = datetime.timedelta(
803 hours=sign * int(m.group('hours')),
804 minutes=sign * int(m.group('minutes')))
805 date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
806 dt = datetime.datetime.strptime(date_str, date_format) - timezone
807 return calendar.timegm(dt.timetuple())
810 def unified_strdate(date_str):
811 """Return a string with the date in the format YYYYMMDD"""
818 date_str = date_str.replace(',', ' ')
819 # %z (UTC offset) is only supported in python>=3.2
820 date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str)
821 format_expressions = [
826 '%b %dst %Y %I:%M%p',
827 '%b %dnd %Y %I:%M%p',
828 '%b %dth %Y %I:%M%p',
836 '%Y-%m-%dT%H:%M:%SZ',
837 '%Y-%m-%dT%H:%M:%S.%fZ',
838 '%Y-%m-%dT%H:%M:%S.%f0Z',
840 '%Y-%m-%dT%H:%M:%S.%f',
843 for expression in format_expressions:
845 upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
848 if upload_date is None:
849 timetuple = email.utils.parsedate_tz(date_str)
851 upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
854 def determine_ext(url, default_ext=u'unknown_video'):
857 guess = url.partition(u'?')[0].rpartition(u'.')[2]
858 if re.match(r'^[A-Za-z0-9]+$', guess):
def subtitles_filename(filename, sub_lang, sub_format):
    """Derive the subtitle file name: <base>.<language>.<format>."""
    base = filename.rsplit('.', 1)[0]
    return u'.'.join([base, sub_lang, sub_format])
866 def date_from_str(date_str):
868 Return a datetime object from a string in the format YYYYMMDD or
869 (now|today)[+-][0-9](day|week|month|year)(s)?"""
870 today = datetime.date.today()
871 if date_str == 'now'or date_str == 'today':
873 match = re.match('(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
874 if match is not None:
875 sign = match.group('sign')
876 time = int(match.group('time'))
879 unit = match.group('unit')
888 delta = datetime.timedelta(**{unit: time})
890 return datetime.datetime.strptime(date_str, "%Y%m%d").date()
892 def hyphenate_date(date_str):
894 Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
895 match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
896 if match is not None:
897 return '-'.join(match.groups())
901 class DateRange(object):
902 """Represents a time interval between two dates"""
903 def __init__(self, start=None, end=None):
904 """start and end must be strings in the format accepted by date"""
905 if start is not None:
906 self.start = date_from_str(start)
908 self.start = datetime.datetime.min.date()
910 self.end = date_from_str(end)
912 self.end = datetime.datetime.max.date()
913 if self.start > self.end:
914 raise ValueError('Date range: "%s" , the start date must be before the end date' % self)
917 """Returns a range that only contains the given day"""
919 def __contains__(self, date):
920 """Check if the date is in the range"""
921 if not isinstance(date, datetime.date):
922 date = date_from_str(date)
923 return self.start <= date <= self.end
925 return '%s - %s' % ( self.start.isoformat(), self.end.isoformat())
929 """ Returns the platform name as a compat_str """
930 res = platform.platform()
931 if isinstance(res, bytes):
932 res = res.decode(preferredencoding())
934 assert isinstance(res, compat_str)
938 def _windows_write_string(s, out):
939 """ Returns True if the string was written using special methods,
940 False if it has yet to be written out."""
941 # Adapted from http://stackoverflow.com/a/3259271/35070
944 import ctypes.wintypes
952 fileno = out.fileno()
953 except AttributeError:
954 # If the output stream doesn't have a fileno, it's virtual
956 if fileno not in WIN_OUTPUT_IDS:
959 GetStdHandle = ctypes.WINFUNCTYPE(
960 ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
961 ("GetStdHandle", ctypes.windll.kernel32))
962 h = GetStdHandle(WIN_OUTPUT_IDS[fileno])
964 WriteConsoleW = ctypes.WINFUNCTYPE(
965 ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
966 ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
967 ctypes.wintypes.LPVOID)(("WriteConsoleW", ctypes.windll.kernel32))
968 written = ctypes.wintypes.DWORD(0)
970 GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)(("GetFileType", ctypes.windll.kernel32))
971 FILE_TYPE_CHAR = 0x0002
972 FILE_TYPE_REMOTE = 0x8000
973 GetConsoleMode = ctypes.WINFUNCTYPE(
974 ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
975 ctypes.POINTER(ctypes.wintypes.DWORD))(
976 ("GetConsoleMode", ctypes.windll.kernel32))
977 INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value
979 def not_a_console(handle):
980 if handle == INVALID_HANDLE_VALUE or handle is None:
982 return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR
983 or GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)
988 def next_nonbmp_pos(s):
990 return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
991 except StopIteration:
995 count = min(next_nonbmp_pos(s), 1024)
998 h, s, count if count else 2, ctypes.byref(written), None)
1000 raise OSError('Failed to write string')
1001 if not count: # We just wrote a non-BMP character
1002 assert written.value == 2
1005 assert written.value > 0
1006 s = s[written.value:]
1010 def write_string(s, out=None, encoding=None):
1013 assert type(s) == compat_str
1015 if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
1016 if _windows_write_string(s, out):
1019 if ('b' in getattr(out, 'mode', '') or
1020 sys.version_info[0] < 3): # Python 2 lies about mode of sys.stderr
1021 byt = s.encode(encoding or preferredencoding(), 'ignore')
1023 elif hasattr(out, 'buffer'):
1024 enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
1025 byt = s.encode(enc, 'ignore')
1026 out.buffer.write(byt)
1032 def bytes_to_intlist(bs):
1035 if isinstance(bs[0], int): # Python 3
1038 return [ord(c) for c in bs]
1041 def intlist_to_bytes(xs):
1044 if isinstance(chr(0), bytes): # Python 2
1045 return ''.join([chr(x) for x in xs])
def get_cachedir(params=None):
    """Return the directory youtube-dl should use for its cache.

    A 'cachedir' entry in params takes precedence; otherwise fall back to
    $XDG_CACHE_HOME/youtube-dl (or ~/.cache/youtube-dl when the variable
    is unset).
    """
    # Fix: the original used a mutable default argument (params={}), a
    # classic Python pitfall; None with an explicit fallback is safe and
    # backward compatible for all callers.
    if params is None:
        params = {}
    cache_root = os.environ.get('XDG_CACHE_HOME',
                                os.path.expanduser('~/.cache'))
    return params.get('cachedir', os.path.join(cache_root, 'youtube-dl'))
1056 # Cross-platform file locking
1057 if sys.platform == 'win32':
1058 import ctypes.wintypes
1061 class OVERLAPPED(ctypes.Structure):
1063 ('Internal', ctypes.wintypes.LPVOID),
1064 ('InternalHigh', ctypes.wintypes.LPVOID),
1065 ('Offset', ctypes.wintypes.DWORD),
1066 ('OffsetHigh', ctypes.wintypes.DWORD),
1067 ('hEvent', ctypes.wintypes.HANDLE),
1070 kernel32 = ctypes.windll.kernel32
1071 LockFileEx = kernel32.LockFileEx
1072 LockFileEx.argtypes = [
1073 ctypes.wintypes.HANDLE, # hFile
1074 ctypes.wintypes.DWORD, # dwFlags
1075 ctypes.wintypes.DWORD, # dwReserved
1076 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
1077 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
1078 ctypes.POINTER(OVERLAPPED) # Overlapped
1080 LockFileEx.restype = ctypes.wintypes.BOOL
1081 UnlockFileEx = kernel32.UnlockFileEx
1082 UnlockFileEx.argtypes = [
1083 ctypes.wintypes.HANDLE, # hFile
1084 ctypes.wintypes.DWORD, # dwReserved
1085 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
1086 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
1087 ctypes.POINTER(OVERLAPPED) # Overlapped
1089 UnlockFileEx.restype = ctypes.wintypes.BOOL
1090 whole_low = 0xffffffff
1091 whole_high = 0x7fffffff
def _lock_file(f, exclusive):
    # Lock the whole of file f via the win32 LockFileEx API.
    # The OVERLAPPED structure carries the lock offset; 0/0 = start of file.
    overlapped = OVERLAPPED()
    overlapped.Offset = 0
    overlapped.OffsetHigh = 0
    overlapped.hEvent = 0
    # Stash the pointer on the file object so the structure stays alive
    # until the matching _unlock_file call, which reuses it.
    f._lock_file_overlapped_p = ctypes.pointer(overlapped)
    handle = msvcrt.get_osfhandle(f.fileno())
    # 0x2 = LOCKFILE_EXCLUSIVE_LOCK; whole_low/whole_high span the maximum
    # lockable range, i.e. the entire file.
    if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
                      whole_low, whole_high, f._lock_file_overlapped_p):
        raise OSError('Locking file failed: %r' % ctypes.FormatError())
def _unlock_file(f):
    # Release the whole-file lock taken by _lock_file, reusing the
    # OVERLAPPED pointer stored on the file object there.
    assert f._lock_file_overlapped_p
    handle = msvcrt.get_osfhandle(f.fileno())
    if not UnlockFileEx(handle, 0,
                        whole_low, whole_high, f._lock_file_overlapped_p):
        raise OSError('Unlocking file failed: %r' % ctypes.FormatError())
1114 def _lock_file(f, exclusive):
1115 fcntl.lockf(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)
1117 def _unlock_file(f):
1118 fcntl.lockf(f, fcntl.LOCK_UN)
1121 class locked_file(object):
1122 def __init__(self, filename, mode, encoding=None):
1123 assert mode in ['r', 'a', 'w']
1124 self.f = io.open(filename, mode, encoding=encoding)
1127 def __enter__(self):
1128 exclusive = self.mode != 'r'
1130 _lock_file(self.f, exclusive)
1136 def __exit__(self, etype, value, traceback):
1138 _unlock_file(self.f)
1145 def write(self, *args):
1146 return self.f.write(*args)
1148 def read(self, *args):
1149 return self.f.read(*args)
1152 def shell_quote(args):
1154 encoding = sys.getfilesystemencoding()
1155 if encoding is None:
1158 if isinstance(a, bytes):
1159 # We may get a filename encoded with 'encodeFilename'
1160 a = a.decode(encoding)
1161 quoted_args.append(pipes.quote(a))
1162 return u' '.join(quoted_args)
1165 def takewhile_inclusive(pred, seq):
1166 """ Like itertools.takewhile, but include the latest evaluated element
1167 (the first element so that Not pred(e)) """
def smuggle_url(url, data):
    """Embed extra data in a URL fragment for internal hand-off."""
    # The payload is JSON-encoded and then URL-encoded under a reserved key,
    # so unsmuggle_url can recover it from the fragment later.
    payload = {u'__youtubedl_smuggle': json.dumps(data)}
    return u'%s#%s' % (url, compat_urllib_parse.urlencode(payload))
1182 def unsmuggle_url(smug_url, default=None):
1183 if not '#__youtubedl_smuggle' in smug_url:
1184 return smug_url, default
1185 url, _, sdata = smug_url.rpartition(u'#')
1186 jsond = compat_parse_qs(sdata)[u'__youtubedl_smuggle'][0]
1187 data = json.loads(jsond)
1191 def format_bytes(bytes):
1194 if type(bytes) is str:
1195 bytes = float(bytes)
1199 exponent = int(math.log(bytes, 1024.0))
1200 suffix = [u'B', u'KiB', u'MiB', u'GiB', u'TiB', u'PiB', u'EiB', u'ZiB', u'YiB'][exponent]
1201 converted = float(bytes) / float(1024 ** exponent)
1202 return u'%.2f%s' % (converted, suffix)
1205 def get_term_width():
1206 columns = os.environ.get('COLUMNS', None)
1211 sp = subprocess.Popen(
1213 stdout=subprocess.PIPE, stderr=subprocess.PIPE)
1214 out, err = sp.communicate()
1215 return int(out.split()[1])
1221 def month_by_name(name):
1222 """ Return the number of a month by (locale-independently) English name """
1225 u'January', u'February', u'March', u'April', u'May', u'June',
1226 u'July', u'August', u'September', u'October', u'November', u'December']
1228 return ENGLISH_NAMES.index(name) + 1
1233 def fix_xml_ampersands(xml_str):
1234 """Replace all the '&' by '&' in XML"""
1236 r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
1241 def setproctitle(title):
1242 assert isinstance(title, compat_str)
1244 libc = ctypes.cdll.LoadLibrary("libc.so.6")
1247 title_bytes = title.encode('utf-8')
1248 buf = ctypes.create_string_buffer(len(title_bytes))
1249 buf.value = title_bytes
1251 libc.prctl(15, buf, 0, 0, 0)
1252 except AttributeError:
1253 return # Strange libc, just skip this
1256 def remove_start(s, start):
1257 if s.startswith(start):
1258 return s[len(start):]
def url_basename(url):
    """Return the last path component of a URL (query/fragment excluded)."""
    parsed_path = compat_urlparse.urlparse(url).path
    segments = parsed_path.strip(u'/').split(u'/')
    return segments[-1]
1267 class HEADRequest(compat_urllib_request.Request):
1268 def get_method(self):
1272 def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
1275 v = getattr(v, get_attr, None)
1278 return default if v is None else (int(v) * invscale // scale)
def str_or_none(v, default=None):
    """Coerce v to compat_str, or return default when v is None."""
    if v is None:
        return default
    return compat_str(v)
1285 def str_to_int(int_str):
1288 int_str = re.sub(r'[,\.]', u'', int_str)
def float_or_none(v, scale=1, invscale=1, default=None):
    """Convert v to a float scaled by invscale/scale; default when v is None."""
    if v is None:
        return default
    return float(v) * invscale / scale
def parse_duration(s):
    """Parse a duration string like '10', '1:23', '1:02:03' or '3m10s'
    into a number of seconds.

    Returns None for None input or an unrecognized format. A trailing
    ':NN' (e.g. a frame count) is accepted and ignored.
    """
    if s is None:
        return None

    m = re.match(
        r'(?:(?:(?P<hours>[0-9]+)[:h])?(?P<mins>[0-9]+)[:m])?(?P<secs>[0-9]+)s?(?::[0-9]+)?$', s)
    if m is None:
        return None
    res = int(m.group('secs'))
    if m.group('mins'):
        res += int(m.group('mins')) * 60
        # The pattern only allows hours when minutes are present too
        if m.group('hours'):
            res += int(m.group('hours')) * 60 * 60
    return res
def prepend_extension(filename, ext):
    """Insert *ext* before the real extension: ('a.mp4', 'temp') -> 'a.temp.mp4'."""
    stem, real_ext = os.path.splitext(filename)
    return u'.'.join((stem, ext)) + real_ext
def check_executable(exe, args=None):
    """ Checks if the given binary is installed somewhere in PATH, and returns its name.
    args can be a list of arguments for a short output (like -version) """
    if args is None:
        args = []  # avoid a shared mutable default argument
    try:
        subprocess.Popen(
            [exe] + args,
            stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
    except OSError:
        # Binary not found anywhere in PATH (or not executable)
        return False
    return exe
class PagedList(object):
    """A lazily-paged sequence: pagefunc(pagenum) yields the items of one
    page, pagesize items per full page. Only pages overlapping the requested
    slice are ever queried."""

    def __init__(self, pagefunc, pagesize):
        self._pagefunc = pagefunc
        self._pagesize = pagesize

    def __len__(self):
        # This is only useful for tests
        return len(self.getslice())

    def getslice(self, start=0, end=None):
        """Return the items in [start:end) as a plain list."""
        res = []
        for pagenum in itertools.count(start // self._pagesize):
            firstid = pagenum * self._pagesize
            nextfirstid = pagenum * self._pagesize + self._pagesize
            if start >= nextfirstid:
                continue

            page_results = list(self._pagefunc(pagenum))

            # Offset of `start` inside this page (0 when the slice begins
            # on an earlier page)
            startv = (
                start % self._pagesize
                if firstid <= start < nextfirstid
                else 0)

            # One-past-the-last offset of `end` inside this page (None when
            # the slice extends beyond this page)
            endv = (
                ((end - 1) % self._pagesize) + 1
                if (end is not None and firstid <= end <= nextfirstid)
                else None)

            if startv != 0 or endv is not None:
                page_results = page_results[startv:endv]
            res.extend(page_results)

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            if len(page_results) + startv < self._pagesize:
                break

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == nextfirstid:
                break
        return res
def uppercase_escape(s):
    """Decode uppercase '\\UXXXXXXXX' escape sequences found in s.

    json.loads only handles lowercase '\\uXXXX' escapes; some sites emit
    the 8-digit uppercase form for astral-plane characters.
    """
    unicode_escape = codecs.getdecoder('unicode_escape')
    return re.sub(
        r'\\U[0-9a-fA-F]{8}',
        lambda m: unicode_escape(m.group(0))[0],
        s)
# struct compatibility shim: Python 2.6 (and some 2.7 builds) reject a
# unicode format string, so probe once and wrap if necessary.
try:
    struct.pack(u'!I', 0)
except TypeError:
    # In Python 2.6 (and some 2.7 versions), struct requires a bytes argument
    def struct_pack(spec, *args):
        if isinstance(spec, compat_str):
            spec = spec.encode('ascii')
        return struct.pack(spec, *args)

    def struct_unpack(spec, *args):
        if isinstance(spec, compat_str):
            spec = spec.encode('ascii')
        return struct.unpack(spec, *args)
else:
    # Native struct already accepts text format strings - use it directly
    struct_pack = struct.pack
    struct_unpack = struct.unpack
def read_batch_urls(batch_fd):
    """Read URLs from a batch file object, one per line, and return a list.

    Bytes lines are decoded as UTF-8 (with replacement), a leading UTF-8
    BOM is stripped, lines are trimmed, and blank lines or comments
    starting with '#', ';' or ']' are skipped. batch_fd is closed when done.
    """
    def fixup(url):
        if not isinstance(url, compat_str):
            url = url.decode('utf-8', 'replace')
        BOM_UTF8 = u'\xef\xbb\xbf'
        if url.startswith(BOM_UTF8):
            url = url[len(BOM_UTF8):]
        url = url.strip()
        if url.startswith(('#', ';', ']')):
            return False
        return url

    with contextlib.closing(batch_fd) as fd:
        # `if url` also drops empty lines (fixup returns '' for them)
        return [url for url in map(fixup, fd) if url]
def urlencode_postdata(*args, **kargs):
    """urlencode the given data and return it as ASCII bytes for a POST body."""
    encoded = compat_urllib_parse.urlencode(*args, **kargs)
    return encoded.encode('ascii')
def parse_xml(s):
    """Parse the XML document in string s, silently ignoring any doctype.

    The enclosing function definition was missing around the visible body;
    restored so TreeBuilder/parser are scoped per call as intended.
    """
    class TreeBuilder(xml.etree.ElementTree.TreeBuilder):
        def doctype(self, name, pubid, system):
            pass  # Ignore doctypes

    parser = xml.etree.ElementTree.XMLParser(target=TreeBuilder())
    # Python 2.6's ElementTree.XML does not accept a parser argument
    kwargs = {'parser': parser} if sys.version_info >= (2, 7) else {}
    return xml.etree.ElementTree.XML(s.encode('utf-8'), **kwargs)
# getpass compatibility: Python 2 on Windows cannot handle a unicode
# prompt, so encode it first; everywhere else the stdlib function is fine.
if sys.version_info < (3, 0) and sys.platform == 'win32':
    def compat_getpass(prompt, *args, **kwargs):
        if isinstance(prompt, compat_str):
            prompt = prompt.encode(preferredencoding())
        return getpass.getpass(prompt, *args, **kwargs)
else:
    compat_getpass = getpass.getpass
def strip_jsonp(code):
    """Strip a JSONP callback wrapper ('cb({...});') and return the payload.

    Input without a recognizable wrapper is returned unchanged.
    """
    jsonp_re = r'(?s)^[a-zA-Z0-9_]+\s*\(\s*(.*)\);?\s*?\s*$'
    return re.sub(jsonp_re, r'\1', code)
def qualities(quality_ids):
    """ Get a numeric quality value out of a list of possible values """
    def q(qid):
        try:
            # Position in the list is the quality rank (higher == better)
            return quality_ids.index(qid)
        except ValueError:
            # Unknown quality id ranks below everything known
            return -1
    return q
# Default output filename template: "<title>-<id>.<ext>".
DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
# subprocess.check_output appeared in Python 2.7; provide a minimal
# backport for 2.6 with the same success/failure behavior.
try:
    subprocess_check_output = subprocess.check_output
except AttributeError:
    def subprocess_check_output(*args, **kwargs):
        assert 'input' not in kwargs
        p = subprocess.Popen(*args, stdout=subprocess.PIPE, **kwargs)
        output, _ = p.communicate()
        ret = p.returncode
        if ret:
            raise subprocess.CalledProcessError(ret, p.args, output=output)
        return output