2 # -*- coding: utf-8 -*-
29 import xml.etree.ElementTree
41 compat_urllib_parse_urlparse,
42 compat_urllib_request,
47 # This is not clearly defined otherwise
48 compiled_regex_type = type(re.compile(''))
51 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0 (Chrome)',
52 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
53 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
54 'Accept-Encoding': 'gzip, deflate',
55 'Accept-Language': 'en-us,en;q=0.5',
58 def preferredencoding():
59 """Get preferred encoding.
61 Returns the best encoding scheme for the system, based on
62 locale.getpreferredencoding() and some further tweaks.
65 pref = locale.getpreferredencoding()
73 def write_json_file(obj, fn):
74 """ Encode obj as JSON and write it to fn, atomically """
78 'prefix': os.path.basename(fn) + '.',
79 'dir': os.path.dirname(fn),
83 # In Python 2.x, json.dump expects a bytestream.
84 # In Python 3.x, it writes to a character stream
85 if sys.version_info < (3, 0):
93 tf = tempfile.NamedTemporaryFile(**args)
98 os.rename(tf.name, fn)
107 if sys.version_info >= (2, 7):
108 def find_xpath_attr(node, xpath, key, val):
109 """ Find the xpath xpath[@key=val] """
110 assert re.match(r'^[a-zA-Z-]+$', key)
111 assert re.match(r'^[a-zA-Z0-9@\s:._-]*$', val)
112 expr = xpath + u"[@%s='%s']" % (key, val)
113 return node.find(expr)
115 def find_xpath_attr(node, xpath, key, val):
116 # Here comes the crazy part: In 2.6, if the xpath is a unicode,
117 # .//node does not match if a node is a direct child of . !
118 if isinstance(xpath, unicode):
119 xpath = xpath.encode('ascii')
121 for f in node.findall(xpath):
122 if f.attrib.get(key) == val:
126 # On python2.6 the xml.etree.ElementTree.Element methods don't support
127 # the namespace parameter
128 def xpath_with_ns(path, ns_map):
129 components = [c.split(':') for c in path.split('/')]
133 replaced.append(c[0])
136 replaced.append('{%s}%s' % (ns_map[ns], tag))
137 return '/'.join(replaced)
140 def xpath_text(node, xpath, name=None, fatal=False):
141 if sys.version_info < (2, 7): # Crazy 2.6
142 xpath = xpath.encode('ascii')
147 name = xpath if name is None else name
148 raise ExtractorError('Could not find XML element %s' % name)
154 compat_html_parser.locatestarttagend = re.compile(r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re.VERBOSE) # backport bugfix
155 class BaseHTMLParser(compat_html_parser.HTMLParser):
157 compat_html_parser.HTMLParser.__init__(self)
160 def loads(self, html):
165 class AttrParser(BaseHTMLParser):
166 """Modified HTMLParser that isolates a tag with the specified attribute"""
167 def __init__(self, attribute, value):
168 self.attribute = attribute
173 self.watch_startpos = False
175 BaseHTMLParser.__init__(self)
177 def error(self, message):
178 if self.error_count > 10 or self.started:
179 raise compat_html_parser.HTMLParseError(message, self.getpos())
180 self.rawdata = '\n'.join(self.html.split('\n')[self.getpos()[0]:]) # skip one line
181 self.error_count += 1
184 def handle_starttag(self, tag, attrs):
187 self.find_startpos(None)
188 if self.attribute in attrs and attrs[self.attribute] == self.value:
191 self.watch_startpos = True
193 if not tag in self.depth: self.depth[tag] = 0
196 def handle_endtag(self, tag):
198 if tag in self.depth: self.depth[tag] -= 1
199 if self.depth[self.result[0]] == 0:
201 self.result.append(self.getpos())
203 def find_startpos(self, x):
204 """Needed to put the start position of the result (self.result[1])
205 after the opening tag with the requested id"""
206 if self.watch_startpos:
207 self.watch_startpos = False
208 self.result.append(self.getpos())
209 handle_entityref = handle_charref = handle_data = handle_comment = \
210 handle_decl = handle_pi = unknown_decl = find_startpos
212 def get_result(self):
213 if self.result is None:
215 if len(self.result) != 3:
217 lines = self.html.split('\n')
218 lines = lines[self.result[1][0]-1:self.result[2][0]]
219 lines[0] = lines[0][self.result[1][1]:]
221 lines[-1] = lines[-1][:self.result[2][1]-self.result[1][1]]
222 lines[-1] = lines[-1][:self.result[2][1]]
223 return '\n'.join(lines).strip()
224 # Hack for https://github.com/rg3/youtube-dl/issues/662
225 if sys.version_info < (2, 7, 3):
226 AttrParser.parse_endtag = (lambda self, i:
227 i + len("</scr'+'ipt>")
228 if self.rawdata[i:].startswith("</scr'+'ipt>")
229 else compat_html_parser.HTMLParser.parse_endtag(self, i))
def get_element_by_id(id, html):
    """Return the content of the element whose id attribute equals *id*."""
    # An id lookup is just an attribute lookup with the fixed key "id".
    return get_element_by_attribute("id", id, html)
235 def get_element_by_attribute(attribute, value, html):
236 """Return the content of the tag with the specified attribute in the passed HTML document"""
237 parser = AttrParser(attribute, value)
240 except compat_html_parser.HTMLParseError:
242 return parser.get_result()
244 class MetaParser(BaseHTMLParser):
246 Modified HTMLParser that isolates a meta tag with the specified name
249 def __init__(self, name):
250 BaseHTMLParser.__init__(self)
255 def handle_starttag(self, tag, attrs):
259 if attrs.get('name') == self.name:
260 self.result = attrs.get('content')
262 def get_result(self):
265 def get_meta_content(name, html):
267 Return the content attribute from the meta tag with the given name attribute.
269 parser = MetaParser(name)
272 except compat_html_parser.HTMLParseError:
274 return parser.get_result()
277 def clean_html(html):
278 """Clean an HTML snippet into a readable string"""
280 html = html.replace('\n', ' ')
281 html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
282 html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
284 html = re.sub('<.*?>', '', html)
285 # Replace html entities
286 html = unescapeHTML(html)
290 def sanitize_open(filename, open_mode):
291 """Try to open the given filename, and slightly tweak it if this fails.
293 Attempts to open the given filename. If this fails, it tries to change
294 the filename slightly, step by step, until it's either able to open it
295 or it fails and raises a final exception, like the standard open()
298 It returns the tuple (stream, definitive_file_name).
302 if sys.platform == 'win32':
304 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
305 return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
306 stream = open(encodeFilename(filename), open_mode)
307 return (stream, filename)
308 except (IOError, OSError) as err:
309 if err.errno in (errno.EACCES,):
312 # In case of error, try to remove win32 forbidden chars
313 alt_filename = os.path.join(
314 re.sub(u'[/<>:"\\|\\\\?\\*]', u'#', path_part)
315 for path_part in os.path.split(filename)
317 if alt_filename == filename:
320 # An exception here should be caught in the caller
321 stream = open(encodeFilename(filename), open_mode)
322 return (stream, alt_filename)
325 def timeconvert(timestr):
326 """Convert RFC 2822 defined time string into system timestamp"""
328 timetuple = email.utils.parsedate_tz(timestr)
329 if timetuple is not None:
330 timestamp = email.utils.mktime_tz(timetuple)
333 def sanitize_filename(s, restricted=False, is_id=False):
334 """Sanitizes a string so it could be used as part of a filename.
335 If restricted is set, use a stricter subset of allowed characters.
336 Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
338 def replace_insane(char):
339 if char == '?' or ord(char) < 32 or ord(char) == 127:
342 return '' if restricted else '\''
344 return '_-' if restricted else ' -'
345 elif char in '\\/|*<>':
347 if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
349 if restricted and ord(char) > 127:
353 result = u''.join(map(replace_insane, s))
355 while '__' in result:
356 result = result.replace('__', '_')
357 result = result.strip('_')
358 # Common case of "Foreign band name - English song title"
359 if restricted and result.startswith('-_'):
365 def orderedSet(iterable):
366 """ Remove all duplicates from the input iterable """
374 def _htmlentity_transform(entity):
375 """Transforms an HTML entity to a character."""
376 # Known non-numeric HTML entity
377 if entity in compat_html_entities.name2codepoint:
378 return compat_chr(compat_html_entities.name2codepoint[entity])
380 mobj = re.match(r'#(x?[0-9]+)', entity)
382 numstr = mobj.group(1)
383 if numstr.startswith(u'x'):
385 numstr = u'0%s' % numstr
388 return compat_chr(int(numstr, base))
390 # Unknown entity in name, return its literal representation
391 return (u'&%s;' % entity)
397 assert type(s) == compat_str
400 r'&([^;]+);', lambda m: _htmlentity_transform(m.group(1)), s)
403 def encodeFilename(s, for_subprocess=False):
405 @param s The name of the file
408 assert type(s) == compat_str
410 # Python 3 has a Unicode API
411 if sys.version_info >= (3, 0):
414 if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
415 # Pass u'' directly to use Unicode APIs on Windows 2000 and up
416 # (Detecting Windows NT 4 is tricky because 'major >= 4' would
417 # match Windows 9x series as well. Besides, NT 4 is obsolete.)
418 if not for_subprocess:
421 # For subprocess calls, encode with locale encoding
422 # Refer to http://stackoverflow.com/a/9951851/35070
423 encoding = preferredencoding()
425 encoding = sys.getfilesystemencoding()
428 return s.encode(encoding, 'ignore')
431 def encodeArgument(s):
432 if not isinstance(s, compat_str):
433 # Legacy code that uses byte strings
434 # Uncomment the following line after fixing all post processors
435 #assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
436 s = s.decode('ascii')
437 return encodeFilename(s, True)
440 def decodeOption(optval):
443 if isinstance(optval, bytes):
444 optval = optval.decode(preferredencoding())
446 assert isinstance(optval, compat_str)
449 def formatSeconds(secs):
451 return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
453 return '%d:%02d' % (secs // 60, secs % 60)
458 def make_HTTPS_handler(opts_no_check_certificate, **kwargs):
459 if sys.version_info < (3, 2):
462 class HTTPSConnectionV3(httplib.HTTPSConnection):
463 def __init__(self, *args, **kwargs):
464 httplib.HTTPSConnection.__init__(self, *args, **kwargs)
467 sock = socket.create_connection((self.host, self.port), self.timeout)
468 if getattr(self, '_tunnel_host', False):
472 self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_TLSv1)
474 self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_SSLv23)
476 class HTTPSHandlerV3(compat_urllib_request.HTTPSHandler):
477 def https_open(self, req):
478 return self.do_open(HTTPSConnectionV3, req)
479 return HTTPSHandlerV3(**kwargs)
480 elif hasattr(ssl, 'create_default_context'): # Python >= 3.4
481 context = ssl.create_default_context(ssl.Purpose.CLIENT_AUTH)
482 context.options &= ~ssl.OP_NO_SSLv3 # Allow older, not-as-secure SSLv3
483 if opts_no_check_certificate:
484 context.verify_mode = ssl.CERT_NONE
485 return compat_urllib_request.HTTPSHandler(context=context, **kwargs)
487 context = ssl.SSLContext(ssl.PROTOCOL_SSLv23)
488 context.verify_mode = (ssl.CERT_NONE
489 if opts_no_check_certificate
490 else ssl.CERT_REQUIRED)
491 context.set_default_verify_paths()
493 context.load_default_certs()
494 except AttributeError:
496 return compat_urllib_request.HTTPSHandler(context=context, **kwargs)
498 class ExtractorError(Exception):
499 """Error during info extraction."""
500 def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
501 """ tb, if given, is the original traceback (so that it can be printed out).
502 If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
505 if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
507 if video_id is not None:
508 msg = video_id + ': ' + msg
510 msg += u' (caused by %r)' % cause
512 msg = msg + u'; please report this issue on https://yt-dl.org/bug . Be sure to call youtube-dl with the --verbose flag and include its complete output. Make sure you are using the latest version; type youtube-dl -U to update.'
513 super(ExtractorError, self).__init__(msg)
516 self.exc_info = sys.exc_info() # preserve original exception
518 self.video_id = video_id
520 def format_traceback(self):
521 if self.traceback is None:
523 return u''.join(traceback.format_tb(self.traceback))
526 class RegexNotFoundError(ExtractorError):
527 """Error when a regex didn't match"""
531 class DownloadError(Exception):
532 """Download Error exception.
534 This exception may be thrown by FileDownloader objects if they are not
535 configured to continue on errors. They will contain the appropriate
538 def __init__(self, msg, exc_info=None):
539 """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
540 super(DownloadError, self).__init__(msg)
541 self.exc_info = exc_info
544 class SameFileError(Exception):
545 """Same File exception.
547 This exception will be thrown by FileDownloader objects if they detect
548 multiple files would have to be downloaded to the same file on disk.
553 class PostProcessingError(Exception):
554 """Post Processing exception.
556 This exception may be raised by PostProcessor's .run() method to
557 indicate an error in the postprocessing task.
559 def __init__(self, msg):
562 class MaxDownloadsReached(Exception):
563 """ --max-downloads limit has been reached. """
567 class UnavailableVideoError(Exception):
568 """Unavailable Format exception.
570 This exception will be thrown when a video is requested
571 in a format that is not available for that video.
576 class ContentTooShortError(Exception):
577 """Content Too Short exception.
579 This exception may be raised by FileDownloader objects when a file they
580 download is too small for what the server announced first, indicating
581 the connection was probably interrupted.
587 def __init__(self, downloaded, expected):
588 self.downloaded = downloaded
589 self.expected = expected
591 class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
592 """Handler for HTTP requests and responses.
594 This class, when installed with an OpenerDirector, automatically adds
595 the standard headers to every HTTP request and handles gzipped and
596 deflated responses from web servers. If compression is to be avoided in
597 a particular request, the original request in the program code only has
598 to include the HTTP header "Youtubedl-No-Compression", which will be
599 removed before making the real request.
601 Part of this code was copied from:
603 http://techknack.net/python-urllib2-handlers/
605 Andrew Rowls, the author of that code, agreed to release it to the
612 return zlib.decompress(data, -zlib.MAX_WBITS)
614 return zlib.decompress(data)
617 def addinfourl_wrapper(stream, headers, url, code):
618 if hasattr(compat_urllib_request.addinfourl, 'getcode'):
619 return compat_urllib_request.addinfourl(stream, headers, url, code)
620 ret = compat_urllib_request.addinfourl(stream, headers, url)
624 def http_request(self, req):
625 for h, v in std_headers.items():
626 if h not in req.headers:
628 if 'Youtubedl-no-compression' in req.headers:
629 if 'Accept-encoding' in req.headers:
630 del req.headers['Accept-encoding']
631 del req.headers['Youtubedl-no-compression']
632 if 'Youtubedl-user-agent' in req.headers:
633 if 'User-agent' in req.headers:
634 del req.headers['User-agent']
635 req.headers['User-agent'] = req.headers['Youtubedl-user-agent']
636 del req.headers['Youtubedl-user-agent']
638 if sys.version_info < (2, 7) and '#' in req.get_full_url():
639 # Python 2.6 is brain-dead when it comes to fragments
640 req._Request__original = req._Request__original.partition('#')[0]
641 req._Request__r_type = req._Request__r_type.partition('#')[0]
645 def http_response(self, req, resp):
648 if resp.headers.get('Content-encoding', '') == 'gzip':
649 content = resp.read()
650 gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
652 uncompressed = io.BytesIO(gz.read())
653 except IOError as original_ioerror:
654 # There may be junk add the end of the file
655 # See http://stackoverflow.com/q/4928560/35070 for details
656 for i in range(1, 1024):
658 gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
659 uncompressed = io.BytesIO(gz.read())
664 raise original_ioerror
665 resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
666 resp.msg = old_resp.msg
668 if resp.headers.get('Content-encoding', '') == 'deflate':
669 gz = io.BytesIO(self.deflate(resp.read()))
670 resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
671 resp.msg = old_resp.msg
674 https_request = http_request
675 https_response = http_response
678 def parse_iso8601(date_str, delimiter='T'):
679 """ Return a UNIX timestamp from the given date """
685 r'(\.[0-9]+)?(?:Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
688 timezone = datetime.timedelta()
690 date_str = date_str[:-len(m.group(0))]
691 if not m.group('sign'):
692 timezone = datetime.timedelta()
694 sign = 1 if m.group('sign') == '+' else -1
695 timezone = datetime.timedelta(
696 hours=sign * int(m.group('hours')),
697 minutes=sign * int(m.group('minutes')))
698 date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
699 dt = datetime.datetime.strptime(date_str, date_format) - timezone
700 return calendar.timegm(dt.timetuple())
703 def unified_strdate(date_str):
704 """Return a string with the date in the format YYYYMMDD"""
711 date_str = date_str.replace(',', ' ')
712 # %z (UTC offset) is only supported in python>=3.2
713 date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str)
714 format_expressions = [
719 '%b %dst %Y %I:%M%p',
720 '%b %dnd %Y %I:%M%p',
721 '%b %dth %Y %I:%M%p',
730 '%Y-%m-%d %H:%M:%S.%f',
733 '%Y-%m-%dT%H:%M:%SZ',
734 '%Y-%m-%dT%H:%M:%S.%fZ',
735 '%Y-%m-%dT%H:%M:%S.%f0Z',
737 '%Y-%m-%dT%H:%M:%S.%f',
740 for expression in format_expressions:
742 upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
745 if upload_date is None:
746 timetuple = email.utils.parsedate_tz(date_str)
748 upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
751 def determine_ext(url, default_ext=u'unknown_video'):
754 guess = url.partition(u'?')[0].rpartition(u'.')[2]
755 if re.match(r'^[A-Za-z0-9]+$', guess):
def subtitles_filename(filename, sub_lang, sub_format):
    """Build the subtitle file name: base name + language code + subtitle format."""
    base = filename.rsplit('.', 1)[0]
    return u'.'.join((base, sub_lang, sub_format))
763 def date_from_str(date_str):
765 Return a datetime object from a string in the format YYYYMMDD or
766 (now|today)[+-][0-9](day|week|month|year)(s)?"""
767 today = datetime.date.today()
768 if date_str == 'now'or date_str == 'today':
770 match = re.match('(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
771 if match is not None:
772 sign = match.group('sign')
773 time = int(match.group('time'))
776 unit = match.group('unit')
785 delta = datetime.timedelta(**{unit: time})
787 return datetime.datetime.strptime(date_str, "%Y%m%d").date()
789 def hyphenate_date(date_str):
791 Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
792 match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
793 if match is not None:
794 return '-'.join(match.groups())
798 class DateRange(object):
799 """Represents a time interval between two dates"""
800 def __init__(self, start=None, end=None):
801 """start and end must be strings in the format accepted by date"""
802 if start is not None:
803 self.start = date_from_str(start)
805 self.start = datetime.datetime.min.date()
807 self.end = date_from_str(end)
809 self.end = datetime.datetime.max.date()
810 if self.start > self.end:
811 raise ValueError('Date range: "%s" , the start date must be before the end date' % self)
814 """Returns a range that only contains the given day"""
816 def __contains__(self, date):
817 """Check if the date is in the range"""
818 if not isinstance(date, datetime.date):
819 date = date_from_str(date)
820 return self.start <= date <= self.end
822 return '%s - %s' % ( self.start.isoformat(), self.end.isoformat())
826 """ Returns the platform name as a compat_str """
827 res = platform.platform()
828 if isinstance(res, bytes):
829 res = res.decode(preferredencoding())
831 assert isinstance(res, compat_str)
835 def _windows_write_string(s, out):
836 """ Returns True if the string was written using special methods,
837 False if it has yet to be written out."""
838 # Adapted from http://stackoverflow.com/a/3259271/35070
841 import ctypes.wintypes
849 fileno = out.fileno()
850 except AttributeError:
851 # If the output stream doesn't have a fileno, it's virtual
853 if fileno not in WIN_OUTPUT_IDS:
856 GetStdHandle = ctypes.WINFUNCTYPE(
857 ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
858 ("GetStdHandle", ctypes.windll.kernel32))
859 h = GetStdHandle(WIN_OUTPUT_IDS[fileno])
861 WriteConsoleW = ctypes.WINFUNCTYPE(
862 ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
863 ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
864 ctypes.wintypes.LPVOID)(("WriteConsoleW", ctypes.windll.kernel32))
865 written = ctypes.wintypes.DWORD(0)
867 GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)(("GetFileType", ctypes.windll.kernel32))
868 FILE_TYPE_CHAR = 0x0002
869 FILE_TYPE_REMOTE = 0x8000
870 GetConsoleMode = ctypes.WINFUNCTYPE(
871 ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
872 ctypes.POINTER(ctypes.wintypes.DWORD))(
873 ("GetConsoleMode", ctypes.windll.kernel32))
874 INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value
876 def not_a_console(handle):
877 if handle == INVALID_HANDLE_VALUE or handle is None:
879 return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR
880 or GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)
885 def next_nonbmp_pos(s):
887 return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
888 except StopIteration:
892 count = min(next_nonbmp_pos(s), 1024)
895 h, s, count if count else 2, ctypes.byref(written), None)
897 raise OSError('Failed to write string')
898 if not count: # We just wrote a non-BMP character
899 assert written.value == 2
902 assert written.value > 0
903 s = s[written.value:]
907 def write_string(s, out=None, encoding=None):
910 assert type(s) == compat_str
912 if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
913 if _windows_write_string(s, out):
916 if ('b' in getattr(out, 'mode', '') or
917 sys.version_info[0] < 3): # Python 2 lies about mode of sys.stderr
918 byt = s.encode(encoding or preferredencoding(), 'ignore')
920 elif hasattr(out, 'buffer'):
921 enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
922 byt = s.encode(enc, 'ignore')
923 out.buffer.write(byt)
929 def bytes_to_intlist(bs):
932 if isinstance(bs[0], int): # Python 3
935 return [ord(c) for c in bs]
938 def intlist_to_bytes(xs):
941 if isinstance(chr(0), bytes): # Python 2
942 return ''.join([chr(x) for x in xs])
947 # Cross-platform file locking
948 if sys.platform == 'win32':
949 import ctypes.wintypes
952 class OVERLAPPED(ctypes.Structure):
954 ('Internal', ctypes.wintypes.LPVOID),
955 ('InternalHigh', ctypes.wintypes.LPVOID),
956 ('Offset', ctypes.wintypes.DWORD),
957 ('OffsetHigh', ctypes.wintypes.DWORD),
958 ('hEvent', ctypes.wintypes.HANDLE),
961 kernel32 = ctypes.windll.kernel32
962 LockFileEx = kernel32.LockFileEx
963 LockFileEx.argtypes = [
964 ctypes.wintypes.HANDLE, # hFile
965 ctypes.wintypes.DWORD, # dwFlags
966 ctypes.wintypes.DWORD, # dwReserved
967 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
968 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
969 ctypes.POINTER(OVERLAPPED) # Overlapped
971 LockFileEx.restype = ctypes.wintypes.BOOL
972 UnlockFileEx = kernel32.UnlockFileEx
973 UnlockFileEx.argtypes = [
974 ctypes.wintypes.HANDLE, # hFile
975 ctypes.wintypes.DWORD, # dwReserved
976 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
977 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
978 ctypes.POINTER(OVERLAPPED) # Overlapped
980 UnlockFileEx.restype = ctypes.wintypes.BOOL
981 whole_low = 0xffffffff
982 whole_high = 0x7fffffff
984 def _lock_file(f, exclusive):
985 overlapped = OVERLAPPED()
986 overlapped.Offset = 0
987 overlapped.OffsetHigh = 0
988 overlapped.hEvent = 0
989 f._lock_file_overlapped_p = ctypes.pointer(overlapped)
990 handle = msvcrt.get_osfhandle(f.fileno())
991 if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
992 whole_low, whole_high, f._lock_file_overlapped_p):
993 raise OSError('Locking file failed: %r' % ctypes.FormatError())
996 assert f._lock_file_overlapped_p
997 handle = msvcrt.get_osfhandle(f.fileno())
998 if not UnlockFileEx(handle, 0,
999 whole_low, whole_high, f._lock_file_overlapped_p):
1000 raise OSError('Unlocking file failed: %r' % ctypes.FormatError())
1005 def _lock_file(f, exclusive):
1006 fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)
1008 def _unlock_file(f):
1009 fcntl.flock(f, fcntl.LOCK_UN)
1012 class locked_file(object):
1013 def __init__(self, filename, mode, encoding=None):
1014 assert mode in ['r', 'a', 'w']
1015 self.f = io.open(filename, mode, encoding=encoding)
1018 def __enter__(self):
1019 exclusive = self.mode != 'r'
1021 _lock_file(self.f, exclusive)
1027 def __exit__(self, etype, value, traceback):
1029 _unlock_file(self.f)
1036 def write(self, *args):
1037 return self.f.write(*args)
1039 def read(self, *args):
1040 return self.f.read(*args)
def get_filesystem_encoding():
    """Return the filesystem encoding, falling back to UTF-8 when unknown."""
    enc = sys.getfilesystemencoding()
    if enc is None:
        # Some platforms/interpreters report no encoding at all.
        return 'utf-8'
    return enc
1048 def shell_quote(args):
1050 encoding = get_filesystem_encoding()
1052 if isinstance(a, bytes):
1053 # We may get a filename encoded with 'encodeFilename'
1054 a = a.decode(encoding)
1055 quoted_args.append(pipes.quote(a))
1056 return u' '.join(quoted_args)
1059 def takewhile_inclusive(pred, seq):
1060 """ Like itertools.takewhile, but include the latest evaluated element
1061 (the first element so that Not pred(e)) """
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use. """
    # Serialize the payload and stash it in the fragment, which servers ignore.
    payload = json.dumps(data)
    smuggled = compat_urllib_parse.urlencode({u'__youtubedl_smuggle': payload})
    return url + u'#' + smuggled
1076 def unsmuggle_url(smug_url, default=None):
1077 if not '#__youtubedl_smuggle' in smug_url:
1078 return smug_url, default
1079 url, _, sdata = smug_url.rpartition(u'#')
1080 jsond = compat_parse_qs(sdata)[u'__youtubedl_smuggle'][0]
1081 data = json.loads(jsond)
1085 def format_bytes(bytes):
1088 if type(bytes) is str:
1089 bytes = float(bytes)
1093 exponent = int(math.log(bytes, 1024.0))
1094 suffix = [u'B', u'KiB', u'MiB', u'GiB', u'TiB', u'PiB', u'EiB', u'ZiB', u'YiB'][exponent]
1095 converted = float(bytes) / float(1024 ** exponent)
1096 return u'%.2f%s' % (converted, suffix)
1099 def get_term_width():
1100 columns = compat_getenv('COLUMNS', None)
1105 sp = subprocess.Popen(
1107 stdout=subprocess.PIPE, stderr=subprocess.PIPE)
1108 out, err = sp.communicate()
1109 return int(out.split()[1])
1115 def month_by_name(name):
1116 """ Return the number of a month by (locale-independently) English name """
1119 u'January', u'February', u'March', u'April', u'May', u'June',
1120 u'July', u'August', u'September', u'October', u'November', u'December']
1122 return ENGLISH_NAMES.index(name) + 1
1127 def fix_xml_ampersands(xml_str):
1128 """Replace all the '&' by '&' in XML"""
1130 r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
1135 def setproctitle(title):
1136 assert isinstance(title, compat_str)
1138 libc = ctypes.cdll.LoadLibrary("libc.so.6")
1141 title_bytes = title.encode('utf-8')
1142 buf = ctypes.create_string_buffer(len(title_bytes))
1143 buf.value = title_bytes
1145 libc.prctl(15, buf, 0, 0, 0)
1146 except AttributeError:
1147 return # Strange libc, just skip this
1150 def remove_start(s, start):
1151 if s.startswith(start):
1152 return s[len(start):]
1156 def remove_end(s, end):
1158 return s[:-len(end)]
def url_basename(url):
    """Return the last path segment of *url*, ignoring query and fragment."""
    url_path = compat_urlparse.urlparse(url).path
    segments = url_path.strip(u'/').split(u'/')
    return segments[-1]
1167 class HEADRequest(compat_urllib_request.Request):
1168 def get_method(self):
1172 def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
1175 v = getattr(v, get_attr, None)
1178 return default if v is None else (int(v) * invscale // scale)
def str_or_none(v, default=None):
    """Convert *v* to a string, or return *default* when *v* is None."""
    if v is None:
        return default
    return compat_str(v)
1185 def str_to_int(int_str):
1186 """ A more relaxed version of int_or_none """
1189 int_str = re.sub(r'[,\.\+]', u'', int_str)
def float_or_none(v, scale=1, invscale=1, default=None):
    """Convert *v* to a float scaled by invscale/scale; *default* when v is None."""
    if v is None:
        return default
    return float(v) * invscale / scale
1197 def parse_duration(s):
1204 r'(?i)(?:(?:(?P<hours>[0-9]+)\s*(?:[:h]|hours?)\s*)?(?P<mins>[0-9]+)\s*(?:[:m]|mins?|minutes?)\s*)?(?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*(?:s|secs?|seconds?)?$', s)
1207 res = int(m.group('secs'))
1209 res += int(m.group('mins')) * 60
1210 if m.group('hours'):
1211 res += int(m.group('hours')) * 60 * 60
1213 res += float(m.group('ms'))
1217 def prepend_extension(filename, ext):
1218 name, real_ext = os.path.splitext(filename)
1219 return u'{0}.{1}{2}'.format(name, ext, real_ext)
1222 def check_executable(exe, args=[]):
1223 """ Checks if the given binary is installed somewhere in PATH, and returns its name.
1224 args can be a list of arguments for a short output (like -version) """
1226 subprocess.Popen([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
1232 def get_exe_version(exe, args=['--version'],
1233 version_re=r'version\s+([0-9._-a-zA-Z]+)',
1234 unrecognized=u'present'):
1235 """ Returns the version of the specified executable,
1236 or False if the executable is not present """
1238 out, err = subprocess.Popen(
1240 stdout=subprocess.PIPE, stderr=subprocess.STDOUT).communicate()
1243 firstline = out.partition(b'\n')[0].decode('ascii', 'ignore')
1244 m = re.search(version_re, firstline)
1251 class PagedList(object):
1253 # This is only useful for tests
1254 return len(self.getslice())
1257 class OnDemandPagedList(PagedList):
1258 def __init__(self, pagefunc, pagesize):
1259 self._pagefunc = pagefunc
1260 self._pagesize = pagesize
1262 def getslice(self, start=0, end=None):
1264 for pagenum in itertools.count(start // self._pagesize):
1265 firstid = pagenum * self._pagesize
1266 nextfirstid = pagenum * self._pagesize + self._pagesize
1267 if start >= nextfirstid:
1270 page_results = list(self._pagefunc(pagenum))
1273 start % self._pagesize
1274 if firstid <= start < nextfirstid
1278 ((end - 1) % self._pagesize) + 1
1279 if (end is not None and firstid <= end <= nextfirstid)
1282 if startv != 0 or endv is not None:
1283 page_results = page_results[startv:endv]
1284 res.extend(page_results)
1286 # A little optimization - if current page is not "full", ie. does
1287 # not contain page_size videos then we can assume that this page
1288 # is the last one - there are no more ids on further pages -
1289 # i.e. no need to query again.
1290 if len(page_results) + startv < self._pagesize:
1293 # If we got the whole page, but the next page is not interesting,
1294 # break out early as well
1295 if end == nextfirstid:
1300 class InAdvancePagedList(PagedList):
1301 def __init__(self, pagefunc, pagecount, pagesize):
1302 self._pagefunc = pagefunc
1303 self._pagecount = pagecount
1304 self._pagesize = pagesize
1306 def getslice(self, start=0, end=None):
1308 start_page = start // self._pagesize
1310 self._pagecount if end is None else (end // self._pagesize + 1))
1311 skip_elems = start - start_page * self._pagesize
1312 only_more = None if end is None else end - start
1313 for pagenum in range(start_page, end_page):
1314 page = list(self._pagefunc(pagenum))
1316 page = page[skip_elems:]
1318 if only_more is not None:
1319 if len(page) < only_more:
1320 only_more -= len(page)
1322 page = page[:only_more]
1329 def uppercase_escape(s):
1330 unicode_escape = codecs.getdecoder('unicode_escape')
1332 r'\\U[0-9a-fA-F]{8}',
1333 lambda m: unicode_escape(m.group(0))[0],
1337 def escape_rfc3986(s):
1338 """Escape non-ASCII characters as suggested by RFC 3986"""
1339 if sys.version_info < (3, 0) and isinstance(s, unicode):
1340 s = s.encode('utf-8')
1341 return compat_urllib_parse.quote(s, "%/;:@&=+$,!~*'()?#[]")
1344 def escape_url(url):
1345 """Escape URL as suggested by RFC 3986"""
1346 url_parsed = compat_urllib_parse_urlparse(url)
1347 return url_parsed._replace(
1348 path=escape_rfc3986(url_parsed.path),
1349 params=escape_rfc3986(url_parsed.params),
1350 query=escape_rfc3986(url_parsed.query),
1351 fragment=escape_rfc3986(url_parsed.fragment)
1355 struct.pack(u'!I', 0)
1357 # In Python 2.6 (and some 2.7 versions), struct requires a bytes argument
1358 def struct_pack(spec, *args):
1359 if isinstance(spec, compat_str):
1360 spec = spec.encode('ascii')
1361 return struct.pack(spec, *args)
1363 def struct_unpack(spec, *args):
1364 if isinstance(spec, compat_str):
1365 spec = spec.encode('ascii')
1366 return struct.unpack(spec, *args)
1368 struct_pack = struct.pack
1369 struct_unpack = struct.unpack
1372 def read_batch_urls(batch_fd):
1374 if not isinstance(url, compat_str):
1375 url = url.decode('utf-8', 'replace')
1376 BOM_UTF8 = u'\xef\xbb\xbf'
1377 if url.startswith(BOM_UTF8):
1378 url = url[len(BOM_UTF8):]
1380 if url.startswith(('#', ';', ']')):
1384 with contextlib.closing(batch_fd) as fd:
1385 return [url for url in map(fixup, fd) if url]
def urlencode_postdata(*args, **kargs):
    """URL-encode POST data and return it as ASCII bytes, ready for urlopen."""
    encoded = compat_urllib_parse.urlencode(*args, **kargs)
    return encoded.encode('ascii')
1393 etree_iter = xml.etree.ElementTree.Element.iter
1394 except AttributeError: # Python <=2.6
1395 etree_iter = lambda n: n.findall('.//*')
1399 class TreeBuilder(xml.etree.ElementTree.TreeBuilder):
1400 def doctype(self, name, pubid, system):
1401 pass # Ignore doctypes
1403 parser = xml.etree.ElementTree.XMLParser(target=TreeBuilder())
1404 kwargs = {'parser': parser} if sys.version_info >= (2, 7) else {}
1405 tree = xml.etree.ElementTree.XML(s.encode('utf-8'), **kwargs)
1406 # Fix up XML parser in Python 2.x
1407 if sys.version_info < (3, 0):
1408 for n in etree_iter(tree):
1409 if n.text is not None:
1410 if not isinstance(n.text, compat_str):
1411 n.text = n.text.decode('utf-8')
1424 def parse_age_limit(s):
1427 m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
1428 return int(m.group('age')) if m else US_RATINGS.get(s, None)
def strip_jsonp(code):
    """Strip a JSONP wrapper (callback name, parentheses, trailing ';')."""
    # If the pattern does not match, re.sub leaves the input unchanged.
    return re.sub(
        r'(?s)^[a-zA-Z0-9_]+\s*\(\s*(.*)\);?\s*?\s*$', r'\1', code)
1435 def js_to_json(code):
1438 if v in ('true', 'false', 'null'):
1440 if v.startswith('"'):
1442 if v.startswith("'"):
1444 v = re.sub(r"\\\\|\\'|\"", lambda m: {
1451 res = re.sub(r'''(?x)
1452 "(?:[^"\\]*(?:\\\\|\\")?)*"|
1453 '(?:[^'\\]*(?:\\\\|\\')?)*'|
1454 [a-zA-Z_][a-zA-Z_0-9]*
1456 res = re.sub(r',(\s*\])', lambda m: m.group(1), res)
1460 def qualities(quality_ids):
1461 """ Get a numeric quality value out of a list of possible values """
1464 return quality_ids.index(qid)
1470 DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
1473 def limit_length(s, length):
1474 """ Add ellipses to overly long strings """
1479 return s[:length - len(ELLIPSES)] + ELLIPSES
def version_tuple(v):
    """Split a dotted version string into a list of ints for ordering."""
    # NOTE: despite the name this returns a list; callers compare element-wise.
    return [int(part) for part in v.split('.')]
1487 def is_outdated_version(version, limit, assume_new=True):
1489 return not assume_new
1491 return version_tuple(version) < version_tuple(limit)
1493 return not assume_new