2 # -*- coding: utf-8 -*-
28 import xml.etree.ElementTree
32 import urllib.request as compat_urllib_request
33 except ImportError: # Python 2
34 import urllib2 as compat_urllib_request
37 import urllib.error as compat_urllib_error
38 except ImportError: # Python 2
39 import urllib2 as compat_urllib_error
42 import urllib.parse as compat_urllib_parse
43 except ImportError: # Python 2
44 import urllib as compat_urllib_parse
47 from urllib.parse import urlparse as compat_urllib_parse_urlparse
48 except ImportError: # Python 2
49 from urlparse import urlparse as compat_urllib_parse_urlparse
52 import urllib.parse as compat_urlparse
53 except ImportError: # Python 2
54 import urlparse as compat_urlparse
57 import http.cookiejar as compat_cookiejar
58 except ImportError: # Python 2
59 import cookielib as compat_cookiejar
62 import html.entities as compat_html_entities
63 except ImportError: # Python 2
64 import htmlentitydefs as compat_html_entities
67 import html.parser as compat_html_parser
68 except ImportError: # Python 2
69 import HTMLParser as compat_html_parser
72 import http.client as compat_http_client
73 except ImportError: # Python 2
74 import httplib as compat_http_client
77 from urllib.error import HTTPError as compat_HTTPError
78 except ImportError: # Python 2
79 from urllib2 import HTTPError as compat_HTTPError
82 from urllib.request import urlretrieve as compat_urlretrieve
83 except ImportError: # Python 2
84 from urllib import urlretrieve as compat_urlretrieve
# Polyfill for subprocess.DEVNULL (Python 3.3+); the fallback opens
# os.devnull manually. NOTE(review): the try/except lines around these
# two assignments are elided in this excerpt.
from subprocess import DEVNULL
compat_subprocess_get_DEVNULL = lambda: DEVNULL
compat_subprocess_get_DEVNULL = lambda: open(os.path.devnull, 'w')
# Percent-decoding shim: use urllib.parse.unquote on Python 3, otherwise
# a hand-rolled decoder that collects runs of %XX bytes and decodes them
# with the requested encoding. NOTE(review): intervening lines (loop
# structure, initialization) are elided in this excerpt.
from urllib.parse import unquote as compat_urllib_parse_unquote
def compat_urllib_parse_unquote(string, encoding='utf-8', errors='replace'):
# Split on '%' so every element after the first starts with two hex digits.
res = string.split('%')
# pct_sequence: contiguous sequence of percent-encoded bytes, decoded
pct_sequence += item[:2].decode('hex')
# This segment was just a single percent-encoded character.
# May be part of a sequence of code units, so delay decoding.
# (Stored in pct_sequence).
string += pct_sequence.decode(encoding, errors) + rest
# Flush the final pct_sequence
string += pct_sequence.decode(encoding, errors)
# Query-string parsing shim: Python 3's parse_qs when available, otherwise
# a backport of cpython 3's implementation built on _parse_qsl below.
# NOTE(review): intervening lines are elided in this excerpt.
from urllib.parse import parse_qs as compat_parse_qs
except ImportError: # Python 2
# HACK: The following is the correct parse_qs implementation from cpython 3's stdlib.
# Python 2's version is apparently totally broken
def _parse_qsl(qs, keep_blank_values=False, strict_parsing=False,
encoding='utf-8', errors='replace'):
qs, _coerce_result = qs, unicode
# Pairs may be separated by either '&' or ';'.
pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')]
for name_value in pairs:
if not name_value and not strict_parsing:
nv = name_value.split('=', 1)
raise ValueError("bad query field: %r" % (name_value,))
# Handle case of a control-name with no equal sign
if keep_blank_values:
if len(nv[1]) or keep_blank_values:
# Both name and value are '+'-decoded then percent-decoded.
name = nv[0].replace('+', ' ')
name = compat_urllib_parse_unquote(
name, encoding=encoding, errors=errors)
name = _coerce_result(name)
value = nv[1].replace('+', ' ')
value = compat_urllib_parse_unquote(
value, encoding=encoding, errors=errors)
value = _coerce_result(value)
r.append((name, value))
def compat_parse_qs(qs, keep_blank_values=False, strict_parsing=False,
encoding='utf-8', errors='replace'):
pairs = _parse_qsl(qs, keep_blank_values, strict_parsing,
encoding=encoding, errors=errors)
# Group repeated names into lists, like urllib.parse.parse_qs does.
for name, value in pairs:
if name in parsed_result:
parsed_result[name].append(value)
parsed_result[name] = [value]
# Remaining Python 2/3 compat aliases and module-level constants.
# NOTE(review): the Python 3 branches and the 'std_headers = {' line are
# elided in this excerpt.
compat_str = unicode # Python 2
compat_chr = unichr # Python 2
from xml.etree.ElementTree import ParseError as compat_xml_parse_error
except ImportError: # Python 2.6
from xml.parsers.expat import ExpatError as compat_xml_parse_error
# (fragment of compat_ord) ints pass through unchanged.
if type(c) is int: return c
# This is not clearly defined otherwise
compiled_regex_type = type(re.compile(''))
# Default HTTP headers sent with every request (browser-like UA).
'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0 (Chrome)',
'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
'Accept-Encoding': 'gzip, deflate',
'Accept-Language': 'en-us,en;q=0.5',
# NOTE(review): lines elided in this excerpt; the tail lines belong to a
# separate compat_print helper whose def line is not visible here.
def preferredencoding():
"""Get preferred encoding.
Returns the best encoding scheme for the system, based on
locale.getpreferredencoding() and some further tweaks.
pref = locale.getpreferredencoding()
# (fragment of compat_print) on Python 2, encode before printing.
if sys.version_info < (3,0):
print(s.encode(preferredencoding(), 'xmlcharrefreplace'))
assert type(s) == type(u'')
# Version-dependent JSON writer: binary mode on Python 2, text mode with
# explicit UTF-8 on Python 3. NOTE(review): the json.dump calls and the
# 'else:' line are elided in this excerpt.
# In Python 2.x, json.dump expects a bytestream.
# In Python 3.x, it writes to a character stream
if sys.version_info < (3,0):
def write_json_file(obj, fn):
with open(fn, 'wb') as f:
def write_json_file(obj, fn):
with open(fn, 'w', encoding='utf-8') as f:
# Find a node matching xpath[@key=val]. Python 2.7+ can express the
# attribute predicate directly in the XPath; the fallback scans findall()
# results manually. NOTE(review): the 'else:' and the fallback's return
# lines are elided in this excerpt.
if sys.version_info >= (2,7):
def find_xpath_attr(node, xpath, key, val):
""" Find the xpath xpath[@key=val] """
# key/val are interpolated unescaped, hence the restricted charsets.
assert re.match(r'^[a-zA-Z]+$', key)
assert re.match(r'^[a-zA-Z0-9@\s:._]*$', val)
expr = xpath + u"[@%s='%s']" % (key, val)
return node.find(expr)
def find_xpath_attr(node, xpath, key, val):
for f in node.findall(xpath):
if f.attrib.get(key) == val:
# Expand 'ns:tag' path components to '{uri}tag' using ns_map.
# NOTE(review): the loop over components is elided in this excerpt.
# On python2.6 the xml.etree.ElementTree.Element methods don't support
# the namespace parameter
def xpath_with_ns(path, ns_map):
components = [c.split(':') for c in path.split('/')]
replaced.append(c[0])
replaced.append('{%s}%s' % (ns_map[ns], tag))
return '/'.join(replaced)
# NOTE(review): some lines (the mobj check, base selection) are elided in
# this excerpt.
def htmlentity_transform(matchobj):
"""Transforms an HTML entity to a character.
This function receives a match object and is intended to be used with
the re.sub() function.
entity = matchobj.group(1)
# Known non-numeric HTML entity
if entity in compat_html_entities.name2codepoint:
return compat_chr(compat_html_entities.name2codepoint[entity])
# Numeric references: decimal (#NNN) or hexadecimal (#xNNN).
mobj = re.match(u'(?u)#(x?\\d+)', entity)
numstr = mobj.group(1)
if numstr.startswith(u'x'):
# Prefix '0' so int(numstr, 16) accepts the '0x...' form.
numstr = u'0%s' % numstr
return compat_chr(int(numstr, base))
# Unknown entity in name, return its literal representation
return (u'&%s;' % entity)
# HTML-scraping parser machinery. BaseHTMLParser wraps the stdlib
# HTMLParser; AttrParser isolates the element carrying a given
# attribute=value pair and records its source span via getpos().
# NOTE(review): many class-body lines are elided in this excerpt; the
# logic below depends on exact statement order, so it is documented
# in place rather than rewritten.
compat_html_parser.locatestarttagend = re.compile(r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re.VERBOSE) # backport bugfix
class BaseHTMLParser(compat_html_parser.HTMLParser):
compat_html_parser.HTMLParser.__init__(self)
def loads(self, html):
class AttrParser(BaseHTMLParser):
"""Modified HTMLParser that isolates a tag with the specified attribute"""
def __init__(self, attribute, value):
self.attribute = attribute
self.watch_startpos = False
BaseHTMLParser.__init__(self)
def error(self, message):
# Tolerate up to 10 parse errors by skipping the offending line.
if self.error_count > 10 or self.started:
raise compat_html_parser.HTMLParseError(message, self.getpos())
self.rawdata = '\n'.join(self.html.split('\n')[self.getpos()[0]:]) # skip one line
self.error_count += 1
def handle_starttag(self, tag, attrs):
self.find_startpos(None)
if self.attribute in attrs and attrs[self.attribute] == self.value:
self.watch_startpos = True
# Track nesting depth per tag so the matching end tag can be found.
if not tag in self.depth: self.depth[tag] = 0
def handle_endtag(self, tag):
if tag in self.depth: self.depth[tag] -= 1
if self.depth[self.result[0]] == 0:
self.result.append(self.getpos())
def find_startpos(self, x):
"""Needed to put the start position of the result (self.result[1])
after the opening tag with the requested id"""
if self.watch_startpos:
self.watch_startpos = False
self.result.append(self.getpos())
# Any event after the opening tag fixes the content start position.
handle_entityref = handle_charref = handle_data = handle_comment = \
handle_decl = handle_pi = unknown_decl = find_startpos
def get_result(self):
if self.result is None:
if len(self.result) != 3:
# Slice the original HTML between the recorded (line, col) positions.
lines = self.html.split('\n')
lines = lines[self.result[1][0]-1:self.result[2][0]]
lines[0] = lines[0][self.result[1][1]:]
lines[-1] = lines[-1][:self.result[2][1]-self.result[1][1]]
lines[-1] = lines[-1][:self.result[2][1]]
return '\n'.join(lines).strip()
# Hack for https://github.com/rg3/youtube-dl/issues/662
if sys.version_info < (2, 7, 3):
AttrParser.parse_endtag = (lambda self, i:
i + len("</scr'+'ipt>")
if self.rawdata[i:].startswith("</scr'+'ipt>")
else compat_html_parser.HTMLParser.parse_endtag(self, i))
def get_element_by_id(id, html):
    """Extract the inner content of the element whose id attribute equals *id*.

    Thin convenience wrapper around get_element_by_attribute().
    """
    return get_element_by_attribute("id", id, html)
# NOTE(review): the try/parser.loads(html) lines are elided in this excerpt;
# parse errors are tolerated and whatever was captured is returned.
def get_element_by_attribute(attribute, value, html):
"""Return the content of the tag with the specified attribute in the passed HTML document"""
parser = AttrParser(attribute, value)
except compat_html_parser.HTMLParseError:
return parser.get_result()
# NOTE(review): several class-body lines are elided in this excerpt.
class MetaParser(BaseHTMLParser):
Modified HTMLParser that isolates a meta tag with the specified name
def __init__(self, name):
BaseHTMLParser.__init__(self)
def handle_starttag(self, tag, attrs):
# Capture the 'content' attribute of the first matching <meta name=...>.
if attrs.get('name') == self.name:
self.result = attrs.get('content')
def get_result(self):
def get_meta_content(name, html):
Return the content attribute from the meta tag with the given name attribute.
parser = MetaParser(name)
except compat_html_parser.HTMLParseError:
return parser.get_result()
# NOTE(review): the leading newline-normalization and trailing strip/return
# lines are elided in this excerpt.
def clean_html(html):
"""Clean an HTML snippet into a readable string"""
html = html.replace('\n', ' ')
# <br> and </p><p> boundaries become real newlines before tags are stripped.
html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
# Strip all remaining tags (non-greedy, so nested text survives).
html = re.sub('<.*?>', '', html)
# Replace html entities
html = unescapeHTML(html)
# NOTE(review): the try:, '-' (stdout) special-case and raise lines are
# elided in this excerpt.
def sanitize_open(filename, open_mode):
"""Try to open the given filename, and slightly tweak it if this fails.
Attempts to open the given filename. If this fails, it tries to change
the filename slightly, step by step, until it's either able to open it
or it fails and raises a final exception, like the standard open()
It returns the tuple (stream, definitive_file_name).
if sys.platform == 'win32':
# Binary stdout avoids CRLF mangling of downloaded data on Windows.
msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
stream = open(encodeFilename(filename), open_mode)
return (stream, filename)
except (IOError, OSError) as err:
# Permission errors are not fixable by renaming; re-raise elsewhere.
if err.errno in (errno.EACCES,):
# In case of error, try to remove win32 forbidden chars
alt_filename = os.path.join(
re.sub(u'[/<>:"\\|\\\\?\\*]', u'#', path_part)
for path_part in os.path.split(filename)
if alt_filename == filename:
# An exception here should be caught in the caller
stream = open(encodeFilename(filename), open_mode)
return (stream, alt_filename)
# NOTE(review): the timestamp initialization and final return are elided
# in this excerpt; unparsable strings presumably yield None — verify.
def timeconvert(timestr):
"""Convert RFC 2822 defined time string into system timestamp"""
timetuple = email.utils.parsedate_tz(timestr)
if timetuple is not None:
timestamp = email.utils.mktime_tz(timetuple)
# NOTE(review): several replace_insane branches and the trailing cleanup
# lines are elided in this excerpt.
def sanitize_filename(s, restricted=False, is_id=False):
"""Sanitizes a string so it could be used as part of a filename.
If restricted is set, use a stricter subset of allowed characters.
Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
def replace_insane(char):
# Control characters and '?' are never allowed.
if char == '?' or ord(char) < 32 or ord(char) == 127:
return '' if restricted else '\''
return '_-' if restricted else ' -'
elif char in '\\/|*<>':
if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
# Restricted mode also rejects all non-ASCII characters.
if restricted and ord(char) > 127:
result = u''.join(map(replace_insane, s))
# Collapse runs of '_' produced by the substitutions above.
while '__' in result:
result = result.replace('__', '_')
result = result.strip('_')
# Common case of "Foreign band name - English song title"
if restricted and result.startswith('-_'):
# NOTE(review): orderedSet's body and unescapeHTML's def line are elided
# in this excerpt.
def orderedSet(iterable):
""" Remove all duplicates from the input iterable """
# (fragment of unescapeHTML) entity references are expanded via re.sub.
assert type(s) == compat_str
result = re.sub(r'(?u)&(.+?);', htmlentity_transform, s)
# NOTE(review): the return statements for the Unicode paths are elided in
# this excerpt.
def encodeFilename(s, for_subprocess=False):
@param s The name of the file
assert type(s) == compat_str
# Python 3 has a Unicode API
if sys.version_info >= (3, 0):
if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
# Pass u'' directly to use Unicode APIs on Windows 2000 and up
# (Detecting Windows NT 4 is tricky because 'major >= 4' would
# match Windows 9x series as well. Besides, NT 4 is obsolete.)
if not for_subprocess:
# For subprocess calls, encode with locale encoding
# Refer to http://stackoverflow.com/a/9951851/35070
encoding = preferredencoding()
encoding = sys.getfilesystemencoding()
# 'ignore' drops characters the target encoding cannot represent.
return s.encode(encoding, 'ignore')
def encodeArgument(s):
# Encode a subprocess argument like a filename (for_subprocess=True).
# Byte-string inputs from legacy callers are first decoded as ASCII.
if not isinstance(s, compat_str):
# Legacy code that uses byte strings
# Uncomment the following line after fixing all post processors
#assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
s = s.decode('ascii')
return encodeFilename(s, True)
# NOTE(review): decodeOption's None guard/return and formatSeconds'
# branch conditions are elided in this excerpt.
def decodeOption(optval):
# Command-line option values may arrive as bytes; normalize to text.
if isinstance(optval, bytes):
optval = optval.decode(preferredencoding())
assert isinstance(optval, compat_str)
def formatSeconds(secs):
# H:MM:SS for durations of an hour or more, M:SS otherwise.
return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
return '%d:%02d' % (secs // 60, secs % 60)
# NOTE(review): try/except scaffolding and the _tunnel() call are elided
# in this excerpt. The SSLv3 constants reflect the pre-POODLE era this
# code was written in.
def make_HTTPS_handler(opts_no_check_certificate, **kwargs):
if sys.version_info < (3, 2):
class HTTPSConnectionV3(httplib.HTTPSConnection):
def __init__(self, *args, **kwargs):
httplib.HTTPSConnection.__init__(self, *args, **kwargs)
sock = socket.create_connection((self.host, self.port), self.timeout)
if getattr(self, '_tunnel_host', False):
# Try SSLv3 first, falling back to auto-negotiation (SSLv23).
self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_SSLv3)
self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_SSLv23)
class HTTPSHandlerV3(compat_urllib_request.HTTPSHandler):
def https_open(self, req):
return self.do_open(HTTPSConnectionV3, req)
return HTTPSHandlerV3(**kwargs)
context = ssl.SSLContext(ssl.PROTOCOL_SSLv3)
context.verify_mode = (ssl.CERT_NONE
if opts_no_check_certificate
else ssl.CERT_REQUIRED)
context.set_default_verify_paths()
# load_default_certs is only available on Python 3.4+.
context.load_default_certs()
except AttributeError:
return compat_urllib_request.HTTPSHandler(context=context, **kwargs)
# Exception hierarchy used throughout the downloader. NOTE(review): many
# class-body lines (pass statements, attribute assignments) are elided in
# this excerpt.
class ExtractorError(Exception):
"""Error during info extraction."""
def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
""" tb, if given, is the original traceback (so that it can be printed out).
If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
# Network-level failures are always treated as "expected" (not a bug).
if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
if video_id is not None:
msg = video_id + ': ' + msg
msg = msg + u'; please report this issue on https://yt-dl.org/bug . Be sure to call youtube-dl with the --verbose flag and include its complete output. Make sure you are using the latest version; type youtube-dl -U to update.'
super(ExtractorError, self).__init__(msg)
self.exc_info = sys.exc_info() # preserve original exception
self.video_id = video_id
def format_traceback(self):
if self.traceback is None:
return u''.join(traceback.format_tb(self.traceback))
class RegexNotFoundError(ExtractorError):
"""Error when a regex didn't match"""
class DownloadError(Exception):
"""Download Error exception.
This exception may be thrown by FileDownloader objects if they are not
configured to continue on errors. They will contain the appropriate
def __init__(self, msg, exc_info=None):
""" exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
super(DownloadError, self).__init__(msg)
self.exc_info = exc_info
class SameFileError(Exception):
"""Same File exception.
This exception will be thrown by FileDownloader objects if they detect
multiple files would have to be downloaded to the same file on disk.
class PostProcessingError(Exception):
"""Post Processing exception.
This exception may be raised by PostProcessor's .run() method to
indicate an error in the postprocessing task.
def __init__(self, msg):
class MaxDownloadsReached(Exception):
""" --max-downloads limit has been reached. """
class UnavailableVideoError(Exception):
"""Unavailable Format exception.
This exception will be thrown when a video is requested
in a format that is not available for that video.
class ContentTooShortError(Exception):
"""Content Too Short exception.
This exception may be raised by FileDownloader objects when a file they
download is too small for what the server announced first, indicating
the connection was probably interrupted.
def __init__(self, downloaded, expected):
self.downloaded = downloaded
self.expected = expected
# NOTE(review): decorator/staticmethod lines, try/except scaffolding and
# several assignments are elided in this excerpt.
class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
"""Handler for HTTP requests and responses.
This class, when installed with an OpenerDirector, automatically adds
the standard headers to every HTTP request and handles gzipped and
deflated responses from web servers. If compression is to be avoided in
a particular request, the original request in the program code only has
to include the HTTP header "Youtubedl-No-Compression", which will be
removed before making the real request.
Part of this code was copied from:
http://techknack.net/python-urllib2-handlers/
Andrew Rowls, the author of that code, agreed to release it to the
# deflate: try raw-deflate first, then zlib-wrapped deflate.
return zlib.decompress(data, -zlib.MAX_WBITS)
return zlib.decompress(data)
def addinfourl_wrapper(stream, headers, url, code):
# Older Pythons lack addinfourl.getcode; attach the code manually.
if hasattr(compat_urllib_request.addinfourl, 'getcode'):
return compat_urllib_request.addinfourl(stream, headers, url, code)
ret = compat_urllib_request.addinfourl(stream, headers, url)
def http_request(self, req):
# Add the standard headers unless the request already sets them.
for h,v in std_headers.items():
if 'Youtubedl-no-compression' in req.headers:
if 'Accept-encoding' in req.headers:
del req.headers['Accept-encoding']
del req.headers['Youtubedl-no-compression']
if 'Youtubedl-user-agent' in req.headers:
if 'User-agent' in req.headers:
del req.headers['User-agent']
req.headers['User-agent'] = req.headers['Youtubedl-user-agent']
del req.headers['Youtubedl-user-agent']
def http_response(self, req, resp):
if resp.headers.get('Content-encoding', '') == 'gzip':
content = resp.read()
gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
uncompressed = io.BytesIO(gz.read())
except IOError as original_ioerror:
# There may be junk add the end of the file
# See http://stackoverflow.com/q/4928560/35070 for details
for i in range(1, 1024):
gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
uncompressed = io.BytesIO(gz.read())
raise original_ioerror
resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
resp.msg = old_resp.msg
if resp.headers.get('Content-encoding', '') == 'deflate':
gz = io.BytesIO(self.deflate(resp.read()))
resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
resp.msg = old_resp.msg
# HTTPS requests/responses get the same treatment as HTTP.
https_request = http_request
https_response = http_response
# NOTE(review): the None guard and the re.search call binding m are elided
# in this excerpt.
def parse_iso8601(date_str, delimiter='T'):
""" Return a UNIX timestamp from the given date """
# Matches a trailing 'Z' (UTC) or a +HH:MM / -HHMM style offset.
r'Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$',
timezone = datetime.timedelta()
date_str = date_str[:-len(m.group(0))]
if not m.group('sign'):
timezone = datetime.timedelta()
sign = 1 if m.group('sign') == '+' else -1
timezone = datetime.timedelta(
hours=sign * int(m.group('hours')),
minutes=sign * int(m.group('minutes')))
date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
# Subtracting the offset converts local time to UTC before timegm().
dt = datetime.datetime.strptime(date_str, date_format) - timezone
return calendar.timegm(dt.timetuple())
# NOTE(review): the upload_date initialization, several format strings and
# the try/except around strptime are elided in this excerpt.
def unified_strdate(date_str):
"""Return a string with the date in the format YYYYMMDD"""
date_str = date_str.replace(',', ' ')
# %z (UTC offset) is only supported in python>=3.2
date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str)
format_expressions = [
'%b %dst %Y %I:%M%p',
'%b %dnd %Y %I:%M%p',
'%b %dth %Y %I:%M%p',
'%Y-%m-%dT%H:%M:%SZ',
'%Y-%m-%dT%H:%M:%S.%fZ',
'%Y-%m-%dT%H:%M:%S.%f0Z',
'%Y-%m-%dT%H:%M:%S.%f',
# Try each known format until one parses.
for expression in format_expressions:
upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
# Last resort: RFC 2822 parsing via the email package.
if upload_date is None:
timetuple = email.utils.parsedate_tz(date_str)
upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
# NOTE(review): the return statements (guess vs. default_ext) are elided
# in this excerpt.
def determine_ext(url, default_ext=u'unknown_video'):
# Take the text after the last '.' of the path (query string stripped).
guess = url.partition(u'?')[0].rpartition(u'.')[2]
if re.match(r'^[A-Za-z0-9]+$', guess):
def subtitles_filename(filename, sub_lang, sub_format):
    """Build the subtitle file name: <video stem>.<language>.<format>."""
    stem = filename.rsplit('.', 1)[0]
    return u'.'.join((stem, sub_lang, sub_format))
# NOTE(review): the 'return today', unit pluralization handling and the
# month/year special-casing lines are elided in this excerpt.
def date_from_str(date_str):
Return a datetime object from a string in the format YYYYMMDD or
(now|today)[+-][0-9](day|week|month|year)(s)?"""
today = datetime.date.today()
if date_str == 'now'or date_str == 'today':
match = re.match('(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
if match is not None:
sign = match.group('sign')
time = int(match.group('time'))
unit = match.group('unit')
delta = datetime.timedelta(**{unit: time})
# Plain YYYYMMDD strings are parsed directly.
return datetime.datetime.strptime(date_str, "%Y%m%d").date()
def hyphenate_date(date_str):
Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
if match is not None:
return '-'.join(match.groups())
# NOTE(review): else branches, the day() classmethod body and __str__'s
# def line are elided in this excerpt.
class DateRange(object):
"""Represents a time interval between two dates"""
def __init__(self, start=None, end=None):
"""start and end must be strings in the format accepted by date"""
# Missing bounds default to the widest representable dates.
if start is not None:
self.start = date_from_str(start)
self.start = datetime.datetime.min.date()
self.end = date_from_str(end)
self.end = datetime.datetime.max.date()
if self.start > self.end:
raise ValueError('Date range: "%s" , the start date must be before the end date' % self)
"""Returns a range that only contains the given day"""
def __contains__(self, date):
"""Check if the date is in the range"""
if not isinstance(date, datetime.date):
date = date_from_str(date)
# Both endpoints are inclusive.
return self.start <= date <= self.end
return '%s - %s' % ( self.start.isoformat(), self.end.isoformat())
# (fragment of platform_name; the def line and final return are elided in
# this excerpt)
""" Returns the platform name as a compat_str """
res = platform.platform()
# platform.platform() may return bytes on some Python 2 setups.
if isinstance(res, bytes):
res = res.decode(preferredencoding())
assert isinstance(res, compat_str)
# NOTE(review): numerous lines (WIN_OUTPUT_IDS, return statements, the
# WriteConsoleW call site and loop structure) are elided in this excerpt.
def _windows_write_string(s, out):
""" Returns True if the string was written using special methods,
False if it has yet to be written out."""
# Adapted from http://stackoverflow.com/a/3259271/35070
import ctypes.wintypes
fileno = out.fileno()
except AttributeError:
# If the output stream doesn't have a fileno, it's virtual
if fileno not in WIN_OUTPUT_IDS:
GetStdHandle = ctypes.WINFUNCTYPE(
ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
("GetStdHandle", ctypes.windll.kernel32))
h = GetStdHandle(WIN_OUTPUT_IDS[fileno])
WriteConsoleW = ctypes.WINFUNCTYPE(
ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
ctypes.wintypes.LPVOID)(("WriteConsoleW", ctypes.windll.kernel32))
written = ctypes.wintypes.DWORD(0)
GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)(("GetFileType", ctypes.windll.kernel32))
FILE_TYPE_CHAR = 0x0002
FILE_TYPE_REMOTE = 0x8000
GetConsoleMode = ctypes.WINFUNCTYPE(
ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
ctypes.POINTER(ctypes.wintypes.DWORD))(
("GetConsoleMode", ctypes.windll.kernel32))
INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value
def not_a_console(handle):
if handle == INVALID_HANDLE_VALUE or handle is None:
# A handle is a console iff GetFileType says CHAR and
# GetConsoleMode succeeds on it.
return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR
or GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)
def next_nonbmp_pos(s):
# WriteConsoleW counts UTF-16 code units; non-BMP characters need
# to be written as a surrogate pair (count 2) on their own.
return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
except StopIteration:
count = min(next_nonbmp_pos(s), 1024)
h, s, count if count else 2, ctypes.byref(written), None)
raise OSError('Failed to write string')
if not count: # We just wrote a non-BMP character
assert written.value == 2
assert written.value > 0
s = s[written.value:]
# NOTE(review): the out-defaulting, returns and the final plain out.write
# fallback are elided in this excerpt.
def write_string(s, out=None, encoding=None):
assert type(s) == compat_str
# On Windows consoles, prefer the WriteConsoleW path for Unicode.
if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
if _windows_write_string(s, out):
if ('b' in getattr(out, 'mode', '') or
sys.version_info[0] < 3): # Python 2 lies about mode of sys.stderr
byt = s.encode(encoding or preferredencoding(), 'ignore')
elif hasattr(out, 'buffer'):
# Text streams with an underlying binary buffer get encoded bytes.
enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
byt = s.encode(enc, 'ignore')
out.buffer.write(byt)
# NOTE(review): the empty-input guard and Python 3 branches are elided in
# this excerpt.
def bytes_to_intlist(bs):
# On Python 3, indexing bytes already yields ints.
if isinstance(bs[0], int): # Python 3
return [ord(c) for c in bs]
def intlist_to_bytes(xs):
# chr() returns bytes only on Python 2.
if isinstance(chr(0), bytes): # Python 2
return ''.join([chr(x) for x in xs])
def get_cachedir(params=None):
    """Return the directory used for youtube-dl's cache files.

    Honors params['cachedir'] when present; otherwise falls back to
    $XDG_CACHE_HOME/youtube-dl (or ~/.cache/youtube-dl).
    """
    # Fix: avoid a mutable default argument; None stands in for "no options".
    if params is None:
        params = {}
    cache_root = os.environ.get('XDG_CACHE_HOME',
                                os.path.expanduser('~/.cache'))
    return params.get('cachedir', os.path.join(cache_root, 'youtube-dl'))
# NOTE(review): try/except scaffolding, _fields_ assignment and the
# fcntl import for the POSIX branch are elided in this excerpt.
# Cross-platform file locking
if sys.platform == 'win32':
import ctypes.wintypes
class OVERLAPPED(ctypes.Structure):
('Internal', ctypes.wintypes.LPVOID),
('InternalHigh', ctypes.wintypes.LPVOID),
('Offset', ctypes.wintypes.DWORD),
('OffsetHigh', ctypes.wintypes.DWORD),
('hEvent', ctypes.wintypes.HANDLE),
kernel32 = ctypes.windll.kernel32
LockFileEx = kernel32.LockFileEx
LockFileEx.argtypes = [
ctypes.wintypes.HANDLE, # hFile
ctypes.wintypes.DWORD, # dwFlags
ctypes.wintypes.DWORD, # dwReserved
ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
ctypes.POINTER(OVERLAPPED) # Overlapped
LockFileEx.restype = ctypes.wintypes.BOOL
UnlockFileEx = kernel32.UnlockFileEx
UnlockFileEx.argtypes = [
ctypes.wintypes.HANDLE, # hFile
ctypes.wintypes.DWORD, # dwReserved
ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
ctypes.POINTER(OVERLAPPED) # Overlapped
UnlockFileEx.restype = ctypes.wintypes.BOOL
# Lock the entire file (low/high 32-bit halves of the byte count).
whole_low = 0xffffffff
whole_high = 0x7fffffff
def _lock_file(f, exclusive):
overlapped = OVERLAPPED()
overlapped.Offset = 0
overlapped.OffsetHigh = 0
overlapped.hEvent = 0
# Keep the OVERLAPPED alive on the file object for the unlock call.
f._lock_file_overlapped_p = ctypes.pointer(overlapped)
handle = msvcrt.get_osfhandle(f.fileno())
if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
whole_low, whole_high, f._lock_file_overlapped_p):
raise OSError('Locking file failed: %r' % ctypes.FormatError())
def _unlock_file(f):
assert f._lock_file_overlapped_p
handle = msvcrt.get_osfhandle(f.fileno())
if not UnlockFileEx(handle, 0,
whole_low, whole_high, f._lock_file_overlapped_p):
raise OSError('Unlocking file failed: %r' % ctypes.FormatError())
# POSIX fallback: advisory locks via fcntl.lockf.
def _lock_file(f, exclusive):
fcntl.lockf(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)
def _unlock_file(f):
fcntl.lockf(f, fcntl.LOCK_UN)
# NOTE(review): self.mode assignment, try/except cleanup around lock and
# unlock, and __iter__ are elided in this excerpt.
class locked_file(object):
def __init__(self, filename, mode, encoding=None):
# Only whole-file read/append/write modes are supported.
assert mode in ['r', 'a', 'w']
self.f = io.open(filename, mode, encoding=encoding)
def __enter__(self):
# Writers take an exclusive lock; readers a shared one.
exclusive = self.mode != 'r'
_lock_file(self.f, exclusive)
def __exit__(self, etype, value, traceback):
_unlock_file(self.f)
def write(self, *args):
return self.f.write(*args)
def read(self, *args):
return self.f.read(*args)
# NOTE(review): the quoted_args initialization, encoding fallback and the
# loop header are elided in this excerpt; takewhile_inclusive's body is
# also elided.
def shell_quote(args):
encoding = sys.getfilesystemencoding()
if encoding is None:
if isinstance(a, bytes):
# We may get a filename encoded with 'encodeFilename'
a = a.decode(encoding)
quoted_args.append(pipes.quote(a))
return u' '.join(quoted_args)
def takewhile_inclusive(pred, seq):
""" Like itertools.takewhile, but include the latest evaluated element
(the first element so that Not pred(e)) """
def smuggle_url(url, data):
    """Append *data* (JSON-encoded) to *url*'s fragment so it can be
    round-tripped through extractors for internal use."""
    payload = {u'__youtubedl_smuggle': json.dumps(data)}
    fragment = compat_urllib_parse.urlencode(payload)
    return u'#'.join((url, fragment))
# NOTE(review): the final 'return url, data' line is elided in this
# excerpt. Inverse of smuggle_url().
def unsmuggle_url(smug_url, default=None):
# URLs without the marker are returned untouched with the default data.
if not '#__youtubedl_smuggle' in smug_url:
return smug_url, default
url, _, sdata = smug_url.rpartition(u'#')
jsond = compat_parse_qs(sdata)[u'__youtubedl_smuggle'][0]
data = json.loads(jsond)
# NOTE(review): format_bytes' None/zero guards and get_term_width's
# try/except + command list are elided in this excerpt.
def format_bytes(bytes):
if type(bytes) is str:
bytes = float(bytes)
# Pick the largest binary unit that keeps the value >= 1.
exponent = int(math.log(bytes, 1024.0))
suffix = [u'B', u'KiB', u'MiB', u'GiB', u'TiB', u'PiB', u'EiB', u'ZiB', u'YiB'][exponent]
converted = float(bytes) / float(1024 ** exponent)
return u'%.2f%s' % (converted, suffix)
def get_term_width():
# Prefer the COLUMNS environment variable, then fall back to stty.
columns = os.environ.get('COLUMNS', None)
sp = subprocess.Popen(
stdout=subprocess.PIPE, stderr=subprocess.PIPE)
out, err = sp.communicate()
return int(out.split()[1])
# NOTE(review): the ENGLISH_NAMES list header, try/except around index()
# and fix_xml_ampersands' return wrapper are elided in this excerpt.
def month_by_name(name):
""" Return the number of a month by (locale-independently) English name """
u'January', u'February', u'March', u'April', u'May', u'June',
u'July', u'August', u'September', u'October', u'November', u'December']
# index() is 0-based; months are 1-based.
return ENGLISH_NAMES.index(name) + 1
def fix_xml_ampersands(xml_str):
"""Replace all the '&' by '&amp;' in XML"""
# Negative lookahead keeps already-valid entity references intact.
r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
# NOTE(review): try/except around LoadLibrary and remove_start's final
# 'return s' are elided in this excerpt.
def setproctitle(title):
assert isinstance(title, compat_str)
libc = ctypes.cdll.LoadLibrary("libc.so.6")
title_bytes = title.encode('utf-8')
buf = ctypes.create_string_buffer(len(title_bytes))
buf.value = title_bytes
# 15 is PR_SET_NAME on Linux.
libc.prctl(15, buf, 0, 0, 0)
except AttributeError:
return # Strange libc, just skip this
def remove_start(s, start):
# Strip the prefix only when present.
if s.startswith(start):
return s[len(start):]
def url_basename(url):
    """Return the last component of the URL's path ('' when there is none)."""
    segments = compat_urlparse.urlparse(url).path.strip(u'/').split(u'/')
    return segments[-1]
# NOTE(review): HEADRequest's return value, int_or_none's get_attr guard
# and str_to_int's guard/return are elided in this excerpt.
class HEADRequest(compat_urllib_request.Request):
def get_method(self):
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
# Optionally read the value off an attribute of v first.
v = getattr(v, get_attr, None)
return default if v is None else (int(v) * invscale // scale)
def str_to_int(int_str):
# Drop thousands separators ('1,000' / '1.000') before converting.
int_str = re.sub(r'[,\.]', u'', int_str)
def float_or_none(v, scale=1, invscale=1, default=None):
    """Convert *v* to float scaled by invscale/scale; *default* when v is None."""
    if v is None:
        return default
    return float(v) * invscale / scale
# NOTE(review): the None guards, the re.match binding m and the final
# return are elided in this excerpt.
def parse_duration(s):
# Accepts '1:02:03', '2m30s', '45', etc.; trailing ':NN' is ignored.
r'(?:(?:(?P<hours>[0-9]+)[:h])?(?P<mins>[0-9]+)[:m])?(?P<secs>[0-9]+)s?(?::[0-9]+)?$', s)
res = int(m.group('secs'))
res += int(m.group('mins')) * 60
if m.group('hours'):
res += int(m.group('hours')) * 60 * 60
def prepend_extension(filename, ext):
    """Insert `ext` between the base name and the real extension.

    prepend_extension('video.mp4', 'temp') -> 'video.temp.mp4'
    """
    base, real_ext = os.path.splitext(filename)
    return u'{0}.{1}{2}'.format(base, ext, real_ext)
def check_executable(exe, args=None):
    """ Checks if the given binary is installed somewhere in PATH, and returns its name.
    args can be a list of arguments for a short output (like -version) """
    # `args` defaults to None instead of a mutable [] (shared-default pitfall).
    try:
        subprocess.Popen(
            [exe] + (args or []),
            stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
    except OSError:
        # Binary not found / not executable.
        return False
    return exe
class PagedList(object):
    """Lazily evaluated list backed by a page-fetching callback.

    `pagefunc(pagenum)` must yield the entries of page `pagenum`; every
    page except possibly the last contains exactly `pagesize` entries.
    """

    def __init__(self, pagefunc, pagesize):
        self._pagefunc = pagefunc
        self._pagesize = pagesize

    def __len__(self):
        # This is only useful for tests
        return len(self.getslice())

    def getslice(self, start=0, end=None):
        """Return entries [start:end) as a list, fetching as few pages as possible."""
        res = []
        for pagenum in itertools.count(start // self._pagesize):
            firstid = pagenum * self._pagesize
            nextfirstid = firstid + self._pagesize
            if start >= nextfirstid:
                continue  # requested window begins on a later page

            page_results = list(self._pagefunc(pagenum))

            startv = (
                start % self._pagesize
                if firstid <= start < nextfirstid
                else 0)

            endv = (
                ((end - 1) % self._pagesize) + 1
                if (end is not None and firstid <= end <= nextfirstid)
                else None)

            if startv != 0 or endv is not None:
                page_results = page_results[startv:endv]
            res.extend(page_results)

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            if len(page_results) + startv < self._pagesize:
                break

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == nextfirstid:
                break
        return res
def uppercase_escape(s):
    """Decode literal '\\UXXXXXXXX' escape sequences occurring in s."""
    decoder = codecs.getdecoder('unicode_escape')

    def _expand(match):
        # getdecoder returns (decoded_string, length_consumed); keep the string.
        return decoder(match.group(0))[0]

    return re.sub(r'\\U[0-9a-fA-F]{8}', _expand, s)
# Python 2.6 (and some 2.7 builds) rejects unicode struct format strings;
# probe for that and fall back to byte-encoding the spec when needed.
try:
    struct.pack(u'!I', 0)
except TypeError:
    # In Python 2.6 (and some 2.7 versions), struct requires a bytes argument
    def _encode_spec(spec):
        return spec.encode('ascii') if isinstance(spec, compat_str) else spec

    def struct_pack(spec, *args):
        return struct.pack(_encode_spec(spec), *args)

    def struct_unpack(spec, *args):
        return struct.unpack(_encode_spec(spec), *args)
else:
    struct_pack = struct.pack
    struct_unpack = struct.unpack
def read_batch_urls(batch_fd):
    """Read a batch file object and return its URLs as a list.

    Closes `batch_fd` when done. A UTF-8 BOM is stripped, and lines
    starting with '#', ';' or ']' (comments) or blank lines are skipped.
    """
    # The BOM as it appears when a UTF-8 file is mis-decoded as latin-1.
    BOM_UTF8 = u'\xef\xbb\xbf'

    def fixup(url):
        if not isinstance(url, compat_str):
            url = url.decode('utf-8', 'replace')
        if url.startswith(BOM_UTF8):
            url = url[len(BOM_UTF8):]
        url = url.strip()
        if url.startswith(('#', ';', ']')):
            return False
        return url

    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]
def urlencode_postdata(*args, **kargs):
    """URL-encode POST data and return it as ASCII bytes."""
    encoded = compat_urllib_parse.urlencode(*args, **kargs)
    return encoded.encode('ascii')
def parse_xml(s):
    """Parse an XML string into an Element, silently ignoring doctypes."""
    class TreeBuilder(xml.etree.ElementTree.TreeBuilder):
        def doctype(self, name, pubid, system):
            pass  # Ignore doctypes

    parser = xml.etree.ElementTree.XMLParser(target=TreeBuilder())
    # Python 2.6's ElementTree.XML does not accept a `parser` argument.
    kwargs = {'parser': parser} if sys.version_info >= (2, 7) else {}
    return xml.etree.ElementTree.XML(s.encode('utf-8'), **kwargs)
if sys.version_info < (3, 0) and sys.platform == 'win32':
    # Python 2 on Windows cannot display unicode prompts; pre-encode them.
    def compat_getpass(prompt, *args, **kwargs):
        if isinstance(prompt, compat_str):
            prompt = prompt.encode(preferredencoding())
        return getpass.getpass(prompt, *args, **kwargs)
else:
    # Everywhere else the stock implementation is fine.
    compat_getpass = getpass.getpass
def strip_jsonp(code):
    """Strip a JSONP wrapper (`callback(...);`) and return the raw payload."""
    jsonp_re = r'(?s)^[a-zA-Z0-9_]+\s*\(\s*(.*)\);?\s*?\s*$'
    return re.sub(jsonp_re, r'\1', code)
def qualities(quality_ids):
    """ Get a numeric quality value out of a list of possible values """
    def q(qid):
        try:
            return quality_ids.index(qid)
        except ValueError:
            # Unknown quality id: rank it below every known one instead
            # of raising in the middle of format sorting.
            return -1
    return q
1453 DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
1456 subprocess_check_output = subprocess.check_output
1457 except AttributeError:
1458 def subprocess_check_output(*args, **kwargs):
1459 assert 'input' not in kwargs
1460 p = subprocess.Popen(*args, stdout=subprocess.PIPE, **kwargs)
1461 output, _ = p.communicate()
1464 raise subprocess.CalledProcessError(ret, p.args, output=output)