2 # -*- coding: utf-8 -*-
4 from __future__ import unicode_literals
30 import xml.etree.ElementTree
42 compat_urllib_parse_urlparse,
43 compat_urllib_request,
48 # This is not clearly defined otherwise
49 compiled_regex_type = type(re.compile(''))
52 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0 (Chrome)',
53 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
54 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
55 'Accept-Encoding': 'gzip, deflate',
56 'Accept-Language': 'en-us,en;q=0.5',
59 def preferredencoding():
60 """Get preferred encoding.
62 Returns the best encoding scheme for the system, based on
63 locale.getpreferredencoding() and some further tweaks.
66 pref = locale.getpreferredencoding()
74 def write_json_file(obj, fn):
75 """ Encode obj as JSON and write it to fn, atomically """
79 'prefix': os.path.basename(fn) + '.',
80 'dir': os.path.dirname(fn),
84 # In Python 2.x, json.dump expects a bytestream.
85 # In Python 3.x, it writes to a character stream
86 if sys.version_info < (3, 0):
94 tf = tempfile.NamedTemporaryFile(**args)
99 os.rename(tf.name, fn)
108 if sys.version_info >= (2, 7):
109 def find_xpath_attr(node, xpath, key, val):
110 """ Find the xpath xpath[@key=val] """
111 assert re.match(r'^[a-zA-Z-]+$', key)
112 assert re.match(r'^[a-zA-Z0-9@\s:._-]*$', val)
113 expr = xpath + u"[@%s='%s']" % (key, val)
114 return node.find(expr)
116 def find_xpath_attr(node, xpath, key, val):
117 # Here comes the crazy part: In 2.6, if the xpath is a unicode,
118 # .//node does not match if a node is a direct child of . !
119 if isinstance(xpath, unicode):
120 xpath = xpath.encode('ascii')
122 for f in node.findall(xpath):
123 if f.attrib.get(key) == val:
127 # On python2.6 the xml.etree.ElementTree.Element methods don't support
128 # the namespace parameter
129 def xpath_with_ns(path, ns_map):
130 components = [c.split(':') for c in path.split('/')]
134 replaced.append(c[0])
137 replaced.append('{%s}%s' % (ns_map[ns], tag))
138 return '/'.join(replaced)
141 def xpath_text(node, xpath, name=None, fatal=False):
142 if sys.version_info < (2, 7): # Crazy 2.6
143 xpath = xpath.encode('ascii')
148 name = xpath if name is None else name
149 raise ExtractorError('Could not find XML element %s' % name)
155 if sys.version_info < (2, 7):
156 compat_html_parser.locatestarttagend = re.compile(r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re.VERBOSE) # backport bugfix
158 class BaseHTMLParser(compat_html_parser.HTMLParser):
160 compat_html_parser.HTMLParser.__init__(self)
163 def loads(self, html):
168 class AttrParser(BaseHTMLParser):
169 """Modified HTMLParser that isolates a tag with the specified attribute"""
170 def __init__(self, attribute, value):
171 self.attribute = attribute
176 self.watch_startpos = False
178 BaseHTMLParser.__init__(self)
180 def error(self, message):
181 if self.error_count > 10 or self.started:
182 raise compat_html_parser.HTMLParseError(message, self.getpos())
183 self.rawdata = '\n'.join(self.html.split('\n')[self.getpos()[0]:]) # skip one line
184 self.error_count += 1
187 def handle_starttag(self, tag, attrs):
190 self.find_startpos(None)
191 if self.attribute in attrs and attrs[self.attribute] == self.value:
194 self.watch_startpos = True
196 if not tag in self.depth: self.depth[tag] = 0
199 def handle_endtag(self, tag):
201 if tag in self.depth: self.depth[tag] -= 1
202 if self.depth[self.result[0]] == 0:
204 self.result.append(self.getpos())
206 def find_startpos(self, x):
207 """Needed to put the start position of the result (self.result[1])
208 after the opening tag with the requested id"""
209 if self.watch_startpos:
210 self.watch_startpos = False
211 self.result.append(self.getpos())
212 handle_entityref = handle_charref = handle_data = handle_comment = \
213 handle_decl = handle_pi = unknown_decl = find_startpos
215 def get_result(self):
216 if self.result is None:
218 if len(self.result) != 3:
220 lines = self.html.split('\n')
221 lines = lines[self.result[1][0]-1:self.result[2][0]]
222 lines[0] = lines[0][self.result[1][1]:]
224 lines[-1] = lines[-1][:self.result[2][1]-self.result[1][1]]
225 lines[-1] = lines[-1][:self.result[2][1]]
226 return '\n'.join(lines).strip()
227 # Hack for https://github.com/rg3/youtube-dl/issues/662
228 if sys.version_info < (2, 7, 3):
229 AttrParser.parse_endtag = (lambda self, i:
230 i + len("</scr'+'ipt>")
231 if self.rawdata[i:].startswith("</scr'+'ipt>")
232 else compat_html_parser.HTMLParser.parse_endtag(self, i))
def get_element_by_id(id, html):
    """Look up the tag whose ``id`` attribute equals *id* in *html*.

    Thin convenience wrapper: the lookup itself is delegated to
    get_element_by_attribute with the attribute name fixed to "id".
    """
    return get_element_by_attribute(attribute="id", value=id, html=html)
240 def get_element_by_attribute(attribute, value, html):
241 """Return the content of the tag with the specified attribute in the passed HTML document"""
242 parser = AttrParser(attribute, value)
245 except compat_html_parser.HTMLParseError:
247 return parser.get_result()
249 class MetaParser(BaseHTMLParser):
251 Modified HTMLParser that isolates a meta tag with the specified name
254 def __init__(self, name):
255 BaseHTMLParser.__init__(self)
260 def handle_starttag(self, tag, attrs):
264 if attrs.get('name') == self.name:
265 self.result = attrs.get('content')
267 def get_result(self):
272 def clean_html(html):
273 """Clean an HTML snippet into a readable string"""
275 html = html.replace('\n', ' ')
276 html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
277 html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
279 html = re.sub('<.*?>', '', html)
280 # Replace html entities
281 html = unescapeHTML(html)
285 def sanitize_open(filename, open_mode):
286 """Try to open the given filename, and slightly tweak it if this fails.
288 Attempts to open the given filename. If this fails, it tries to change
289 the filename slightly, step by step, until it's either able to open it
290 or it fails and raises a final exception, like the standard open()
293 It returns the tuple (stream, definitive_file_name).
297 if sys.platform == 'win32':
299 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
300 return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
301 stream = open(encodeFilename(filename), open_mode)
302 return (stream, filename)
303 except (IOError, OSError) as err:
304 if err.errno in (errno.EACCES,):
307 # In case of error, try to remove win32 forbidden chars
308 alt_filename = os.path.join(
309 re.sub(u'[/<>:"\\|\\\\?\\*]', u'#', path_part)
310 for path_part in os.path.split(filename)
312 if alt_filename == filename:
315 # An exception here should be caught in the caller
316 stream = open(encodeFilename(filename), open_mode)
317 return (stream, alt_filename)
320 def timeconvert(timestr):
321 """Convert RFC 2822 defined time string into system timestamp"""
323 timetuple = email.utils.parsedate_tz(timestr)
324 if timetuple is not None:
325 timestamp = email.utils.mktime_tz(timetuple)
328 def sanitize_filename(s, restricted=False, is_id=False):
329 """Sanitizes a string so it could be used as part of a filename.
330 If restricted is set, use a stricter subset of allowed characters.
331 Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
333 def replace_insane(char):
334 if char == '?' or ord(char) < 32 or ord(char) == 127:
337 return '' if restricted else '\''
339 return '_-' if restricted else ' -'
340 elif char in '\\/|*<>':
342 if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
344 if restricted and ord(char) > 127:
348 result = u''.join(map(replace_insane, s))
350 while '__' in result:
351 result = result.replace('__', '_')
352 result = result.strip('_')
353 # Common case of "Foreign band name - English song title"
354 if restricted and result.startswith('-_'):
360 def orderedSet(iterable):
361 """ Remove all duplicates from the input iterable """
369 def _htmlentity_transform(entity):
370 """Transforms an HTML entity to a character."""
371 # Known non-numeric HTML entity
372 if entity in compat_html_entities.name2codepoint:
373 return compat_chr(compat_html_entities.name2codepoint[entity])
375 mobj = re.match(r'#(x?[0-9]+)', entity)
377 numstr = mobj.group(1)
378 if numstr.startswith(u'x'):
380 numstr = u'0%s' % numstr
383 return compat_chr(int(numstr, base))
385 # Unknown entity in name, return its literal representation
386 return (u'&%s;' % entity)
392 assert type(s) == compat_str
395 r'&([^;]+);', lambda m: _htmlentity_transform(m.group(1)), s)
398 def encodeFilename(s, for_subprocess=False):
400 @param s The name of the file
403 assert type(s) == compat_str
405 # Python 3 has a Unicode API
406 if sys.version_info >= (3, 0):
409 if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
410 # Pass u'' directly to use Unicode APIs on Windows 2000 and up
411 # (Detecting Windows NT 4 is tricky because 'major >= 4' would
412 # match Windows 9x series as well. Besides, NT 4 is obsolete.)
413 if not for_subprocess:
416 # For subprocess calls, encode with locale encoding
417 # Refer to http://stackoverflow.com/a/9951851/35070
418 encoding = preferredencoding()
420 encoding = sys.getfilesystemencoding()
423 return s.encode(encoding, 'ignore')
def encodeArgument(s):
    """Encode a single argument by calling encodeFilename(s, True).

    The second positional argument of encodeFilename is ``for_subprocess``.
    Input that is not a compat_str is decoded as ASCII before encoding.
    """
    if isinstance(s, compat_str):
        return encodeFilename(s, True)

    # Legacy callers still hand in byte strings.
    # TODO: once all post processors pass text, replace this fallback with
    # an assertion that s is a compat_str.
    return encodeFilename(s.decode('ascii'), True)
435 def decodeOption(optval):
438 if isinstance(optval, bytes):
439 optval = optval.decode(preferredencoding())
441 assert isinstance(optval, compat_str)
444 def formatSeconds(secs):
446 return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
448 return '%d:%02d' % (secs // 60, secs % 60)
453 def make_HTTPS_handler(opts_no_check_certificate, **kwargs):
454 if sys.version_info < (3, 2):
457 class HTTPSConnectionV3(httplib.HTTPSConnection):
458 def __init__(self, *args, **kwargs):
459 httplib.HTTPSConnection.__init__(self, *args, **kwargs)
462 sock = socket.create_connection((self.host, self.port), self.timeout)
463 if getattr(self, '_tunnel_host', False):
467 self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_TLSv1)
469 self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_SSLv23)
471 class HTTPSHandlerV3(compat_urllib_request.HTTPSHandler):
472 def https_open(self, req):
473 return self.do_open(HTTPSConnectionV3, req)
474 return HTTPSHandlerV3(**kwargs)
475 elif hasattr(ssl, 'create_default_context'): # Python >= 3.4
476 context = ssl.create_default_context(ssl.Purpose.CLIENT_AUTH)
477 context.options &= ~ssl.OP_NO_SSLv3 # Allow older, not-as-secure SSLv3
478 if opts_no_check_certificate:
479 context.verify_mode = ssl.CERT_NONE
480 return compat_urllib_request.HTTPSHandler(context=context, **kwargs)
482 context = ssl.SSLContext(ssl.PROTOCOL_SSLv23)
483 context.verify_mode = (ssl.CERT_NONE
484 if opts_no_check_certificate
485 else ssl.CERT_REQUIRED)
486 context.set_default_verify_paths()
488 context.load_default_certs()
489 except AttributeError:
491 return compat_urllib_request.HTTPSHandler(context=context, **kwargs)
493 class ExtractorError(Exception):
494 """Error during info extraction."""
495 def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
496 """ tb, if given, is the original traceback (so that it can be printed out).
497 If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
500 if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
502 if video_id is not None:
503 msg = video_id + ': ' + msg
505 msg += u' (caused by %r)' % cause
507 msg = msg + u'; please report this issue on https://yt-dl.org/bug . Be sure to call youtube-dl with the --verbose flag and include its complete output. Make sure you are using the latest version; type youtube-dl -U to update.'
508 super(ExtractorError, self).__init__(msg)
511 self.exc_info = sys.exc_info() # preserve original exception
513 self.video_id = video_id
515 def format_traceback(self):
516 if self.traceback is None:
518 return u''.join(traceback.format_tb(self.traceback))
521 class RegexNotFoundError(ExtractorError):
522 """Error when a regex didn't match"""
526 class DownloadError(Exception):
527 """Download Error exception.
529 This exception may be thrown by FileDownloader objects if they are not
530 configured to continue on errors. They will contain the appropriate
533 def __init__(self, msg, exc_info=None):
534 """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
535 super(DownloadError, self).__init__(msg)
536 self.exc_info = exc_info
539 class SameFileError(Exception):
540 """Same File exception.
542 This exception will be thrown by FileDownloader objects if they detect
543 multiple files would have to be downloaded to the same file on disk.
548 class PostProcessingError(Exception):
549 """Post Processing exception.
551 This exception may be raised by PostProcessor's .run() method to
552 indicate an error in the postprocessing task.
554 def __init__(self, msg):
557 class MaxDownloadsReached(Exception):
558 """ --max-downloads limit has been reached. """
562 class UnavailableVideoError(Exception):
563 """Unavailable Format exception.
565 This exception will be thrown when a video is requested
566 in a format that is not available for that video.
571 class ContentTooShortError(Exception):
572 """Content Too Short exception.
574 This exception may be raised by FileDownloader objects when a file they
575 download is too small for what the server announced first, indicating
576 the connection was probably interrupted.
582 def __init__(self, downloaded, expected):
583 self.downloaded = downloaded
584 self.expected = expected
586 class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
587 """Handler for HTTP requests and responses.
589 This class, when installed with an OpenerDirector, automatically adds
590 the standard headers to every HTTP request and handles gzipped and
591 deflated responses from web servers. If compression is to be avoided in
592 a particular request, the original request in the program code only has
593 to include the HTTP header "Youtubedl-No-Compression", which will be
594 removed before making the real request.
596 Part of this code was copied from:
598 http://techknack.net/python-urllib2-handlers/
600 Andrew Rowls, the author of that code, agreed to release it to the
607 return zlib.decompress(data, -zlib.MAX_WBITS)
609 return zlib.decompress(data)
612 def addinfourl_wrapper(stream, headers, url, code):
613 if hasattr(compat_urllib_request.addinfourl, 'getcode'):
614 return compat_urllib_request.addinfourl(stream, headers, url, code)
615 ret = compat_urllib_request.addinfourl(stream, headers, url)
619 def http_request(self, req):
620 for h, v in std_headers.items():
621 if h not in req.headers:
623 if 'Youtubedl-no-compression' in req.headers:
624 if 'Accept-encoding' in req.headers:
625 del req.headers['Accept-encoding']
626 del req.headers['Youtubedl-no-compression']
627 if 'Youtubedl-user-agent' in req.headers:
628 if 'User-agent' in req.headers:
629 del req.headers['User-agent']
630 req.headers['User-agent'] = req.headers['Youtubedl-user-agent']
631 del req.headers['Youtubedl-user-agent']
633 if sys.version_info < (2, 7) and '#' in req.get_full_url():
634 # Python 2.6 is brain-dead when it comes to fragments
635 req._Request__original = req._Request__original.partition('#')[0]
636 req._Request__r_type = req._Request__r_type.partition('#')[0]
640 def http_response(self, req, resp):
643 if resp.headers.get('Content-encoding', '') == 'gzip':
644 content = resp.read()
645 gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
647 uncompressed = io.BytesIO(gz.read())
648 except IOError as original_ioerror:
649 # There may be junk at the end of the file
650 # See http://stackoverflow.com/q/4928560/35070 for details
651 for i in range(1, 1024):
653 gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
654 uncompressed = io.BytesIO(gz.read())
659 raise original_ioerror
660 resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
661 resp.msg = old_resp.msg
663 if resp.headers.get('Content-encoding', '') == 'deflate':
664 gz = io.BytesIO(self.deflate(resp.read()))
665 resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
666 resp.msg = old_resp.msg
669 https_request = http_request
670 https_response = http_response
673 def parse_iso8601(date_str, delimiter='T'):
674 """ Return a UNIX timestamp from the given date """
680 r'(\.[0-9]+)?(?:Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
683 timezone = datetime.timedelta()
685 date_str = date_str[:-len(m.group(0))]
686 if not m.group('sign'):
687 timezone = datetime.timedelta()
689 sign = 1 if m.group('sign') == '+' else -1
690 timezone = datetime.timedelta(
691 hours=sign * int(m.group('hours')),
692 minutes=sign * int(m.group('minutes')))
693 date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
694 dt = datetime.datetime.strptime(date_str, date_format) - timezone
695 return calendar.timegm(dt.timetuple())
698 def unified_strdate(date_str):
699 """Return a string with the date in the format YYYYMMDD"""
706 date_str = date_str.replace(',', ' ')
707 # %z (UTC offset) is only supported in python>=3.2
708 date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str)
709 format_expressions = [
714 '%b %dst %Y %I:%M%p',
715 '%b %dnd %Y %I:%M%p',
716 '%b %dth %Y %I:%M%p',
725 '%Y-%m-%d %H:%M:%S.%f',
728 '%Y-%m-%dT%H:%M:%SZ',
729 '%Y-%m-%dT%H:%M:%S.%fZ',
730 '%Y-%m-%dT%H:%M:%S.%f0Z',
732 '%Y-%m-%dT%H:%M:%S.%f',
735 for expression in format_expressions:
737 upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
740 if upload_date is None:
741 timetuple = email.utils.parsedate_tz(date_str)
743 upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
746 def determine_ext(url, default_ext=u'unknown_video'):
749 guess = url.partition(u'?')[0].rpartition(u'.')[2]
750 if re.match(r'^[A-Za-z0-9]+$', guess):
def subtitles_filename(filename, sub_lang, sub_format):
    """Return the subtitle file name derived from a media file name.

    The extension of *filename* (if any) is replaced by
    ``.<sub_lang>.<sub_format>``.

    filename   -- name/path of the media file
    sub_lang   -- language code inserted before the subtitle extension
    sub_format -- subtitle extension without the leading dot
    """
    # os.path.splitext only looks at the final path component and treats a
    # leading dot as part of the name, so a dot in a directory name
    # ("dir.v1/video") or a dot-file (".video") is no longer mistaken for an
    # extension separator, unlike a plain rsplit('.', 1).
    stem = os.path.splitext(filename)[0]
    return stem + u'.' + sub_lang + u'.' + sub_format
758 def date_from_str(date_str):
760 Return a datetime object from a string in the format YYYYMMDD or
761 (now|today)[+-][0-9](day|week|month|year)(s)?"""
762 today = datetime.date.today()
763 if date_str == 'now'or date_str == 'today':
765 match = re.match('(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
766 if match is not None:
767 sign = match.group('sign')
768 time = int(match.group('time'))
771 unit = match.group('unit')
780 delta = datetime.timedelta(**{unit: time})
782 return datetime.datetime.strptime(date_str, "%Y%m%d").date()
784 def hyphenate_date(date_str):
786 Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
787 match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
788 if match is not None:
789 return '-'.join(match.groups())
793 class DateRange(object):
794 """Represents a time interval between two dates"""
795 def __init__(self, start=None, end=None):
796 """start and end must be strings in the format accepted by date_from_str"""
797 if start is not None:
798 self.start = date_from_str(start)
800 self.start = datetime.datetime.min.date()
802 self.end = date_from_str(end)
804 self.end = datetime.datetime.max.date()
805 if self.start > self.end:
806 raise ValueError('Date range: "%s" , the start date must be before the end date' % self)
809 """Returns a range that only contains the given day"""
811 def __contains__(self, date):
812 """Check if the date is in the range"""
813 if not isinstance(date, datetime.date):
814 date = date_from_str(date)
815 return self.start <= date <= self.end
817 return '%s - %s' % ( self.start.isoformat(), self.end.isoformat())
821 """ Returns the platform name as a compat_str """
822 res = platform.platform()
823 if isinstance(res, bytes):
824 res = res.decode(preferredencoding())
826 assert isinstance(res, compat_str)
830 def _windows_write_string(s, out):
831 """ Returns True if the string was written using special methods,
832 False if it has yet to be written out."""
833 # Adapted from http://stackoverflow.com/a/3259271/35070
836 import ctypes.wintypes
844 fileno = out.fileno()
845 except AttributeError:
846 # If the output stream doesn't have a fileno, it's virtual
848 if fileno not in WIN_OUTPUT_IDS:
851 GetStdHandle = ctypes.WINFUNCTYPE(
852 ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
853 ("GetStdHandle", ctypes.windll.kernel32))
854 h = GetStdHandle(WIN_OUTPUT_IDS[fileno])
856 WriteConsoleW = ctypes.WINFUNCTYPE(
857 ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
858 ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
859 ctypes.wintypes.LPVOID)(("WriteConsoleW", ctypes.windll.kernel32))
860 written = ctypes.wintypes.DWORD(0)
862 GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)(("GetFileType", ctypes.windll.kernel32))
863 FILE_TYPE_CHAR = 0x0002
864 FILE_TYPE_REMOTE = 0x8000
865 GetConsoleMode = ctypes.WINFUNCTYPE(
866 ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
867 ctypes.POINTER(ctypes.wintypes.DWORD))(
868 ("GetConsoleMode", ctypes.windll.kernel32))
869 INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value
871 def not_a_console(handle):
872 if handle == INVALID_HANDLE_VALUE or handle is None:
874 return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR
875 or GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)
880 def next_nonbmp_pos(s):
882 return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
883 except StopIteration:
887 count = min(next_nonbmp_pos(s), 1024)
890 h, s, count if count else 2, ctypes.byref(written), None)
892 raise OSError('Failed to write string')
893 if not count: # We just wrote a non-BMP character
894 assert written.value == 2
897 assert written.value > 0
898 s = s[written.value:]
902 def write_string(s, out=None, encoding=None):
905 assert type(s) == compat_str
907 if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
908 if _windows_write_string(s, out):
911 if ('b' in getattr(out, 'mode', '') or
912 sys.version_info[0] < 3): # Python 2 lies about mode of sys.stderr
913 byt = s.encode(encoding or preferredencoding(), 'ignore')
915 elif hasattr(out, 'buffer'):
916 enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
917 byt = s.encode(enc, 'ignore')
918 out.buffer.write(byt)
924 def bytes_to_intlist(bs):
927 if isinstance(bs[0], int): # Python 3
930 return [ord(c) for c in bs]
933 def intlist_to_bytes(xs):
936 if isinstance(chr(0), bytes): # Python 2
937 return ''.join([chr(x) for x in xs])
942 # Cross-platform file locking
943 if sys.platform == 'win32':
944 import ctypes.wintypes
947 class OVERLAPPED(ctypes.Structure):
949 ('Internal', ctypes.wintypes.LPVOID),
950 ('InternalHigh', ctypes.wintypes.LPVOID),
951 ('Offset', ctypes.wintypes.DWORD),
952 ('OffsetHigh', ctypes.wintypes.DWORD),
953 ('hEvent', ctypes.wintypes.HANDLE),
956 kernel32 = ctypes.windll.kernel32
957 LockFileEx = kernel32.LockFileEx
958 LockFileEx.argtypes = [
959 ctypes.wintypes.HANDLE, # hFile
960 ctypes.wintypes.DWORD, # dwFlags
961 ctypes.wintypes.DWORD, # dwReserved
962 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
963 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
964 ctypes.POINTER(OVERLAPPED) # Overlapped
966 LockFileEx.restype = ctypes.wintypes.BOOL
967 UnlockFileEx = kernel32.UnlockFileEx
968 UnlockFileEx.argtypes = [
969 ctypes.wintypes.HANDLE, # hFile
970 ctypes.wintypes.DWORD, # dwReserved
971 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
972 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
973 ctypes.POINTER(OVERLAPPED) # Overlapped
975 UnlockFileEx.restype = ctypes.wintypes.BOOL
976 whole_low = 0xffffffff
977 whole_high = 0x7fffffff
979 def _lock_file(f, exclusive):
980 overlapped = OVERLAPPED()
981 overlapped.Offset = 0
982 overlapped.OffsetHigh = 0
983 overlapped.hEvent = 0
984 f._lock_file_overlapped_p = ctypes.pointer(overlapped)
985 handle = msvcrt.get_osfhandle(f.fileno())
986 if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
987 whole_low, whole_high, f._lock_file_overlapped_p):
988 raise OSError('Locking file failed: %r' % ctypes.FormatError())
991 assert f._lock_file_overlapped_p
992 handle = msvcrt.get_osfhandle(f.fileno())
993 if not UnlockFileEx(handle, 0,
994 whole_low, whole_high, f._lock_file_overlapped_p):
995 raise OSError('Unlocking file failed: %r' % ctypes.FormatError())
1000 def _lock_file(f, exclusive):
1001 fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)
1003 def _unlock_file(f):
1004 fcntl.flock(f, fcntl.LOCK_UN)
1007 class locked_file(object):
1008 def __init__(self, filename, mode, encoding=None):
1009 assert mode in ['r', 'a', 'w']
1010 self.f = io.open(filename, mode, encoding=encoding)
1013 def __enter__(self):
1014 exclusive = self.mode != 'r'
1016 _lock_file(self.f, exclusive)
1022 def __exit__(self, etype, value, traceback):
1024 _unlock_file(self.f)
1031 def write(self, *args):
1032 return self.f.write(*args)
1034 def read(self, *args):
1035 return self.f.read(*args)
def get_filesystem_encoding():
    """Return sys.getfilesystemencoding(), or 'utf-8' when it reports None."""
    fs_encoding = sys.getfilesystemencoding()
    if fs_encoding is None:
        return 'utf-8'
    return fs_encoding
1043 def shell_quote(args):
1045 encoding = get_filesystem_encoding()
1047 if isinstance(a, bytes):
1048 # We may get a filename encoded with 'encodeFilename'
1049 a = a.decode(encoding)
1050 quoted_args.append(pipes.quote(a))
1051 return u' '.join(quoted_args)
1054 def takewhile_inclusive(pred, seq):
1055 """ Like itertools.takewhile, but include the latest evaluated element
1056 (the first element so that Not pred(e)) """
1063 def smuggle_url(url, data):
1064 """ Pass additional data in a URL for internal use. """
1066 sdata = compat_urllib_parse.urlencode(
1067 {u'__youtubedl_smuggle': json.dumps(data)})
1068 return url + u'#' + sdata
1071 def unsmuggle_url(smug_url, default=None):
1072 if not '#__youtubedl_smuggle' in smug_url:
1073 return smug_url, default
1074 url, _, sdata = smug_url.rpartition(u'#')
1075 jsond = compat_parse_qs(sdata)[u'__youtubedl_smuggle'][0]
1076 data = json.loads(jsond)
1080 def format_bytes(bytes):
1083 if type(bytes) is str:
1084 bytes = float(bytes)
1088 exponent = int(math.log(bytes, 1024.0))
1089 suffix = [u'B', u'KiB', u'MiB', u'GiB', u'TiB', u'PiB', u'EiB', u'ZiB', u'YiB'][exponent]
1090 converted = float(bytes) / float(1024 ** exponent)
1091 return u'%.2f%s' % (converted, suffix)
1094 def get_term_width():
1095 columns = compat_getenv('COLUMNS', None)
1100 sp = subprocess.Popen(
1102 stdout=subprocess.PIPE, stderr=subprocess.PIPE)
1103 out, err = sp.communicate()
1104 return int(out.split()[1])
1110 def month_by_name(name):
1111 """ Return the number of a month by (locale-independently) English name """
1114 u'January', u'February', u'March', u'April', u'May', u'June',
1115 u'July', u'August', u'September', u'October', u'November', u'December']
1117 return ENGLISH_NAMES.index(name) + 1
1122 def fix_xml_ampersands(xml_str):
1123 """Replace all the '&' by '&amp;' in XML"""
1125 r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
1130 def setproctitle(title):
1131 assert isinstance(title, compat_str)
1133 libc = ctypes.cdll.LoadLibrary("libc.so.6")
1136 title_bytes = title.encode('utf-8')
1137 buf = ctypes.create_string_buffer(len(title_bytes))
1138 buf.value = title_bytes
1140 libc.prctl(15, buf, 0, 0, 0)
1141 except AttributeError:
1142 return # Strange libc, just skip this
1145 def remove_start(s, start):
1146 if s.startswith(start):
1147 return s[len(start):]
1151 def remove_end(s, end):
1153 return s[:-len(end)]
def url_basename(url):
    """Return the last segment of the URL's path component.

    Leading and trailing slashes of the path are stripped first, so a
    trailing '/' does not produce an empty result for a non-empty path.
    """
    trimmed_path = compat_urlparse.urlparse(url).path.strip(u'/')
    # Everything after the final slash (the whole string if there is none).
    return trimmed_path.rpartition(u'/')[2]
1162 class HEADRequest(compat_urllib_request.Request):
1163 def get_method(self):
1167 def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
1170 v = getattr(v, get_attr, None)
1173 return default if v is None else (int(v) * invscale // scale)
def str_or_none(v, default=None):
    """Return compat_str(v), or *default* when v is None."""
    if v is None:
        return default
    return compat_str(v)
1180 def str_to_int(int_str):
1181 """ A more relaxed version of int_or_none """
1184 int_str = re.sub(r'[,\.\+]', u'', int_str)
def float_or_none(v, scale=1, invscale=1, default=None):
    """Convert v to a float and rescale it, or return *default* for None.

    The result is ``float(v) * invscale / scale``.
    """
    if v is None:
        return default
    scaled_up = float(v) * invscale
    return scaled_up / scale
1192 def parse_duration(s):
1199 r'(?i)(?:(?:(?P<hours>[0-9]+)\s*(?:[:h]|hours?)\s*)?(?P<mins>[0-9]+)\s*(?:[:m]|mins?|minutes?)\s*)?(?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*(?:s|secs?|seconds?)?$', s)
1202 res = int(m.group('secs'))
1204 res += int(m.group('mins')) * 60
1205 if m.group('hours'):
1206 res += int(m.group('hours')) * 60 * 60
1208 res += float(m.group('ms'))
def prepend_extension(filename, ext):
    """Insert *ext* in front of the file's existing extension.

    The name is split with os.path.splitext and rebuilt as
    ``<root>.<ext><original extension>``.
    """
    pieces = os.path.splitext(filename)
    return u'{0}.{1}{2}'.format(pieces[0], ext, pieces[1])
1217 def check_executable(exe, args=[]):
1218 """ Checks if the given binary is installed somewhere in PATH, and returns its name.
1219 args can be a list of arguments for a short output (like -version) """
1221 subprocess.Popen([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
1227 def get_exe_version(exe, args=['--version'],
1228 version_re=r'version\s+([0-9._-a-zA-Z]+)',
1229 unrecognized=u'present'):
1230 """ Returns the version of the specified executable,
1231 or False if the executable is not present """
1233 out, err = subprocess.Popen(
1235 stdout=subprocess.PIPE, stderr=subprocess.STDOUT).communicate()
1238 firstline = out.partition(b'\n')[0].decode('ascii', 'ignore')
1239 m = re.search(version_re, firstline)
1246 class PagedList(object):
1248 # This is only useful for tests
1249 return len(self.getslice())
1252 class OnDemandPagedList(PagedList):
1253 def __init__(self, pagefunc, pagesize):
1254 self._pagefunc = pagefunc
1255 self._pagesize = pagesize
1257 def getslice(self, start=0, end=None):
1259 for pagenum in itertools.count(start // self._pagesize):
1260 firstid = pagenum * self._pagesize
1261 nextfirstid = pagenum * self._pagesize + self._pagesize
1262 if start >= nextfirstid:
1265 page_results = list(self._pagefunc(pagenum))
1268 start % self._pagesize
1269 if firstid <= start < nextfirstid
1273 ((end - 1) % self._pagesize) + 1
1274 if (end is not None and firstid <= end <= nextfirstid)
1277 if startv != 0 or endv is not None:
1278 page_results = page_results[startv:endv]
1279 res.extend(page_results)
1281 # A little optimization - if current page is not "full", ie. does
1282 # not contain page_size videos then we can assume that this page
1283 # is the last one - there are no more ids on further pages -
1284 # i.e. no need to query again.
1285 if len(page_results) + startv < self._pagesize:
1288 # If we got the whole page, but the next page is not interesting,
1289 # break out early as well
1290 if end == nextfirstid:
1295 class InAdvancePagedList(PagedList):
1296 def __init__(self, pagefunc, pagecount, pagesize):
1297 self._pagefunc = pagefunc
1298 self._pagecount = pagecount
1299 self._pagesize = pagesize
1301 def getslice(self, start=0, end=None):
1303 start_page = start // self._pagesize
1305 self._pagecount if end is None else (end // self._pagesize + 1))
1306 skip_elems = start - start_page * self._pagesize
1307 only_more = None if end is None else end - start
1308 for pagenum in range(start_page, end_page):
1309 page = list(self._pagefunc(pagenum))
1311 page = page[skip_elems:]
1313 if only_more is not None:
1314 if len(page) < only_more:
1315 only_more -= len(page)
1317 page = page[:only_more]
1324 def uppercase_escape(s):
1325 unicode_escape = codecs.getdecoder('unicode_escape')
1327 r'\\U[0-9a-fA-F]{8}',
1328 lambda m: unicode_escape(m.group(0))[0],
def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986.

    The string is passed through compat_urllib_parse.quote with the
    characters listed in ``keep_unquoted`` declared safe.
    """
    # On Python 2 a unicode object is encoded to UTF-8 bytes before quoting.
    # `unicode` is only evaluated on Python 2 thanks to short-circuiting.
    is_py2_text = sys.version_info < (3, 0) and isinstance(s, unicode)
    if is_py2_text:
        s = s.encode('utf-8')
    keep_unquoted = b"%/;:@&=+$,!~*'()?#[]"
    return compat_urllib_parse.quote(s, keep_unquoted)
1339 def escape_url(url):
1340 """Escape URL as suggested by RFC 3986"""
1341 url_parsed = compat_urllib_parse_urlparse(url)
1342 return url_parsed._replace(
1343 path=escape_rfc3986(url_parsed.path),
1344 params=escape_rfc3986(url_parsed.params),
1345 query=escape_rfc3986(url_parsed.query),
1346 fragment=escape_rfc3986(url_parsed.fragment)
1350 struct.pack(u'!I', 0)
1352 # In Python 2.6 (and some 2.7 versions), struct requires a bytes argument
1353 def struct_pack(spec, *args):
1354 if isinstance(spec, compat_str):
1355 spec = spec.encode('ascii')
1356 return struct.pack(spec, *args)
1358 def struct_unpack(spec, *args):
1359 if isinstance(spec, compat_str):
1360 spec = spec.encode('ascii')
1361 return struct.unpack(spec, *args)
1363 struct_pack = struct.pack
1364 struct_unpack = struct.unpack
1367 def read_batch_urls(batch_fd):
1369 if not isinstance(url, compat_str):
1370 url = url.decode('utf-8', 'replace')
1371 BOM_UTF8 = u'\xef\xbb\xbf'
1372 if url.startswith(BOM_UTF8):
1373 url = url[len(BOM_UTF8):]
1375 if url.startswith(('#', ';', ']')):
1379 with contextlib.closing(batch_fd) as fd:
1380 return [url for url in map(fixup, fd) if url]
def urlencode_postdata(*args, **kargs):
    """URL-encode the arguments and return the result as ASCII bytes.

    All arguments are forwarded unchanged to compat_urllib_parse.urlencode.
    """
    encoded_text = compat_urllib_parse.urlencode(*args, **kargs)
    return encoded_text.encode('ascii')
1388 etree_iter = xml.etree.ElementTree.Element.iter
1389 except AttributeError: # Python <=2.6
1390 etree_iter = lambda n: n.findall('.//*')
1394 class TreeBuilder(xml.etree.ElementTree.TreeBuilder):
1395 def doctype(self, name, pubid, system):
1396 pass # Ignore doctypes
1398 parser = xml.etree.ElementTree.XMLParser(target=TreeBuilder())
1399 kwargs = {'parser': parser} if sys.version_info >= (2, 7) else {}
1400 tree = xml.etree.ElementTree.XML(s.encode('utf-8'), **kwargs)
1401 # Fix up XML parser in Python 2.x
1402 if sys.version_info < (3, 0):
1403 for n in etree_iter(tree):
1404 if n.text is not None:
1405 if not isinstance(n.text, compat_str):
1406 n.text = n.text.decode('utf-8')
1419 def parse_age_limit(s):
1422 m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
1423 return int(m.group('age')) if m else US_RATINGS.get(s, None)
def strip_jsonp(code):
    """Remove a JSONP ``callback( ... );`` wrapper and return the payload.

    Input that does not look like ``identifier(...)`` is returned unchanged.
    """
    jsonp_wrapper = re.compile(r'(?s)^[a-zA-Z0-9_]+\s*\(\s*(.*)\);?\s*?\s*$')
    return jsonp_wrapper.sub(r'\1', code)
1430 def js_to_json(code):
1433 if v in ('true', 'false', 'null'):
1435 if v.startswith('"'):
1437 if v.startswith("'"):
1439 v = re.sub(r"\\\\|\\'|\"", lambda m: {
1446 res = re.sub(r'''(?x)
1447 "(?:[^"\\]*(?:\\\\|\\")?)*"|
1448 '(?:[^'\\]*(?:\\\\|\\')?)*'|
1449 [a-zA-Z_][a-zA-Z_0-9]*
1451 res = re.sub(r',(\s*\])', lambda m: m.group(1), res)
1455 def qualities(quality_ids):
1456 """ Get a numeric quality value out of a list of possible values """
1459 return quality_ids.index(qid)
1465 DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
1468 def limit_length(s, length):
1469 """ Add ellipses to overly long strings """
1474 return s[:length - len(ELLIPSES)] + ELLIPSES
def version_tuple(v):
    """Split a dotted version string into a list of its integer components."""
    return list(map(int, v.split('.')))
1482 def is_outdated_version(version, limit, assume_new=True):
1484 return not assume_new
1486 return version_tuple(version) < version_tuple(limit)
1488 return not assume_new