2 # -*- coding: utf-8 -*-
27 import xml.etree.ElementTree
31 import urllib.request as compat_urllib_request
32 except ImportError: # Python 2
33 import urllib2 as compat_urllib_request
36 import urllib.error as compat_urllib_error
37 except ImportError: # Python 2
38 import urllib2 as compat_urllib_error
41 import urllib.parse as compat_urllib_parse
42 except ImportError: # Python 2
43 import urllib as compat_urllib_parse
46 from urllib.parse import urlparse as compat_urllib_parse_urlparse
47 except ImportError: # Python 2
48 from urlparse import urlparse as compat_urllib_parse_urlparse
51 import urllib.parse as compat_urlparse
52 except ImportError: # Python 2
53 import urlparse as compat_urlparse
56 import http.cookiejar as compat_cookiejar
57 except ImportError: # Python 2
58 import cookielib as compat_cookiejar
61 import html.entities as compat_html_entities
62 except ImportError: # Python 2
63 import htmlentitydefs as compat_html_entities
66 import html.parser as compat_html_parser
67 except ImportError: # Python 2
68 import HTMLParser as compat_html_parser
71 import http.client as compat_http_client
72 except ImportError: # Python 2
73 import httplib as compat_http_client
76 from urllib.error import HTTPError as compat_HTTPError
77 except ImportError: # Python 2
78 from urllib2 import HTTPError as compat_HTTPError
81 from urllib.request import urlretrieve as compat_urlretrieve
82 except ImportError: # Python 2
83 from urllib import urlretrieve as compat_urlretrieve
87 from subprocess import DEVNULL
88 compat_subprocess_get_DEVNULL = lambda: DEVNULL
90 compat_subprocess_get_DEVNULL = lambda: open(os.path.devnull, 'w')
93 from urllib.parse import parse_qs as compat_parse_qs
94 except ImportError: # Python 2
95 # HACK: The following is the correct parse_qs implementation from cpython 3's stdlib.
96 # Python 2's version is apparently totally broken
97 def _unquote(string, encoding='utf-8', errors='replace'):
100 res = string.split('%')
107 # pct_sequence: contiguous sequence of percent-encoded bytes, decoded
114 pct_sequence += item[:2].decode('hex')
117 # This segment was just a single percent-encoded character.
118 # May be part of a sequence of code units, so delay decoding.
119 # (Stored in pct_sequence).
123 # Encountered non-percent-encoded characters. Flush the current
125 string += pct_sequence.decode(encoding, errors) + rest
128 # Flush the final pct_sequence
129 string += pct_sequence.decode(encoding, errors)
132 def _parse_qsl(qs, keep_blank_values=False, strict_parsing=False,
133 encoding='utf-8', errors='replace'):
134 qs, _coerce_result = qs, unicode
135 pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')]
137 for name_value in pairs:
138 if not name_value and not strict_parsing:
140 nv = name_value.split('=', 1)
143 raise ValueError("bad query field: %r" % (name_value,))
144 # Handle case of a control-name with no equal sign
145 if keep_blank_values:
149 if len(nv[1]) or keep_blank_values:
150 name = nv[0].replace('+', ' ')
151 name = _unquote(name, encoding=encoding, errors=errors)
152 name = _coerce_result(name)
153 value = nv[1].replace('+', ' ')
154 value = _unquote(value, encoding=encoding, errors=errors)
155 value = _coerce_result(value)
156 r.append((name, value))
159 def compat_parse_qs(qs, keep_blank_values=False, strict_parsing=False,
160 encoding='utf-8', errors='replace'):
162 pairs = _parse_qsl(qs, keep_blank_values, strict_parsing,
163 encoding=encoding, errors=errors)
164 for name, value in pairs:
165 if name in parsed_result:
166 parsed_result[name].append(value)
168 parsed_result[name] = [value]
172 compat_str = unicode # Python 2
177 compat_chr = unichr # Python 2
182 from xml.etree.ElementTree import ParseError as compat_xml_parse_error
183 except ImportError: # Python 2.6
184 from xml.parsers.expat import ExpatError as compat_xml_parse_error
187 if type(c) is int: return c
190 # This is not clearly defined otherwise
191 compiled_regex_type = type(re.compile(''))
194 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0 (Chrome)',
195 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
196 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
197 'Accept-Encoding': 'gzip, deflate',
198 'Accept-Language': 'en-us,en;q=0.5',
201 def preferredencoding():
202 """Get preferred encoding.
204 Returns the best encoding scheme for the system, based on
205 locale.getpreferredencoding() and some further tweaks.
208 pref = locale.getpreferredencoding()
215 if sys.version_info < (3,0):
217 print(s.encode(preferredencoding(), 'xmlcharrefreplace'))
220 assert type(s) == type(u'')
223 # In Python 2.x, json.dump expects a bytestream.
224 # In Python 3.x, it writes to a character stream
225 if sys.version_info < (3,0):
226 def write_json_file(obj, fn):
227 with open(fn, 'wb') as f:
230 def write_json_file(obj, fn):
231 with open(fn, 'w', encoding='utf-8') as f:
234 if sys.version_info >= (2,7):
def find_xpath_attr(node, xpath, key, val):
    """Shorthand for node.find(xpath + "[@key='val']").

    key and val are interpolated into the XPath expression, so they are
    restricted to characters that cannot change its structure.
    """
    assert re.match(r'^[a-zA-Z]+$', key)
    assert re.match(r'^[a-zA-Z0-9@\s:._]*$', val)
    return node.find(xpath + u"[@%s='%s']" % (key, val))
242 def find_xpath_attr(node, xpath, key, val):
243 for f in node.findall(xpath):
244 if f.attrib.get(key) == val:
248 # On python2.6 the xml.etree.ElementTree.Element methods don't support
249 # the namespace parameter
250 def xpath_with_ns(path, ns_map):
251 components = [c.split(':') for c in path.split('/')]
255 replaced.append(c[0])
258 replaced.append('{%s}%s' % (ns_map[ns], tag))
259 return '/'.join(replaced)
261 def htmlentity_transform(matchobj):
262 """Transforms an HTML entity to a character.
264 This function receives a match object and is intended to be used with
265 the re.sub() function.
267 entity = matchobj.group(1)
269 # Known non-numeric HTML entity
270 if entity in compat_html_entities.name2codepoint:
271 return compat_chr(compat_html_entities.name2codepoint[entity])
273 mobj = re.match(u'(?u)#(x?\\d+)', entity)
275 numstr = mobj.group(1)
276 if numstr.startswith(u'x'):
278 numstr = u'0%s' % numstr
281 return compat_chr(int(numstr, base))
283 # Unknown entity in name, return its literal representation
284 return (u'&%s;' % entity)
286 compat_html_parser.locatestarttagend = re.compile(r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re.VERBOSE) # backport bugfix
287 class BaseHTMLParser(compat_html_parser.HTMLParser):
289 compat_html_parser.HTMLParser.__init__(self)
292 def loads(self, html):
297 class AttrParser(BaseHTMLParser):
298 """Modified HTMLParser that isolates a tag with the specified attribute"""
299 def __init__(self, attribute, value):
300 self.attribute = attribute
305 self.watch_startpos = False
307 BaseHTMLParser.__init__(self)
309 def error(self, message):
310 if self.error_count > 10 or self.started:
311 raise compat_html_parser.HTMLParseError(message, self.getpos())
312 self.rawdata = '\n'.join(self.html.split('\n')[self.getpos()[0]:]) # skip one line
313 self.error_count += 1
316 def handle_starttag(self, tag, attrs):
319 self.find_startpos(None)
320 if self.attribute in attrs and attrs[self.attribute] == self.value:
323 self.watch_startpos = True
325 if not tag in self.depth: self.depth[tag] = 0
328 def handle_endtag(self, tag):
330 if tag in self.depth: self.depth[tag] -= 1
331 if self.depth[self.result[0]] == 0:
333 self.result.append(self.getpos())
    def find_startpos(self, x):
        """Needed to put the start position of the result (self.result[1])
        after the opening tag with the requested id"""
        if self.watch_startpos:
            self.watch_startpos = False
            self.result.append(self.getpos())
    # The first parser event of any kind after the watched start tag should
    # record the position, so every handler is aliased to find_startpos;
    # the unused argument x absorbs whatever data each handler receives.
    handle_entityref = handle_charref = handle_data = handle_comment = \
        handle_decl = handle_pi = unknown_decl = find_startpos
344 def get_result(self):
345 if self.result is None:
347 if len(self.result) != 3:
349 lines = self.html.split('\n')
350 lines = lines[self.result[1][0]-1:self.result[2][0]]
351 lines[0] = lines[0][self.result[1][1]:]
353 lines[-1] = lines[-1][:self.result[2][1]-self.result[1][1]]
354 lines[-1] = lines[-1][:self.result[2][1]]
355 return '\n'.join(lines).strip()
# Hack for https://github.com/rg3/youtube-dl/issues/662
# On Python < 2.7.3, when the raw data at position i starts with the literal
# "</scr'+'ipt>" sequence, skip over it by length instead of delegating to
# HTMLParser.parse_endtag (which apparently mishandles it — see the issue).
if sys.version_info < (2, 7, 3):
    AttrParser.parse_endtag = (lambda self, i:
        i + len("</scr'+'ipt>")
        if self.rawdata[i:].startswith("</scr'+'ipt>")
        else compat_html_parser.HTMLParser.parse_endtag(self, i))
def get_element_by_id(id, html):
    """Return the content of the element carrying the given ID attribute.

    Thin wrapper around get_element_by_attribute with the attribute
    fixed to "id".
    """
    return get_element_by_attribute("id", id, html)
367 def get_element_by_attribute(attribute, value, html):
368 """Return the content of the tag with the specified attribute in the passed HTML document"""
369 parser = AttrParser(attribute, value)
372 except compat_html_parser.HTMLParseError:
374 return parser.get_result()
376 class MetaParser(BaseHTMLParser):
378 Modified HTMLParser that isolates a meta tag with the specified name
381 def __init__(self, name):
382 BaseHTMLParser.__init__(self)
387 def handle_starttag(self, tag, attrs):
391 if attrs.get('name') == self.name:
392 self.result = attrs.get('content')
394 def get_result(self):
397 def get_meta_content(name, html):
399 Return the content attribute from the meta tag with the given name attribute.
401 parser = MetaParser(name)
404 except compat_html_parser.HTMLParseError:
406 return parser.get_result()
409 def clean_html(html):
410 """Clean an HTML snippet into a readable string"""
412 html = html.replace('\n', ' ')
413 html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
414 html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
416 html = re.sub('<.*?>', '', html)
417 # Replace html entities
418 html = unescapeHTML(html)
422 def sanitize_open(filename, open_mode):
423 """Try to open the given filename, and slightly tweak it if this fails.
425 Attempts to open the given filename. If this fails, it tries to change
426 the filename slightly, step by step, until it's either able to open it
427 or it fails and raises a final exception, like the standard open()
430 It returns the tuple (stream, definitive_file_name).
434 if sys.platform == 'win32':
436 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
437 return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
438 stream = open(encodeFilename(filename), open_mode)
439 return (stream, filename)
440 except (IOError, OSError) as err:
441 if err.errno in (errno.EACCES,):
444 # In case of error, try to remove win32 forbidden chars
445 alt_filename = os.path.join(
446 re.sub(u'[/<>:"\\|\\\\?\\*]', u'#', path_part)
447 for path_part in os.path.split(filename)
449 if alt_filename == filename:
452 # An exception here should be caught in the caller
453 stream = open(encodeFilename(filename), open_mode)
454 return (stream, alt_filename)
457 def timeconvert(timestr):
458 """Convert RFC 2822 defined time string into system timestamp"""
460 timetuple = email.utils.parsedate_tz(timestr)
461 if timetuple is not None:
462 timestamp = email.utils.mktime_tz(timetuple)
465 def sanitize_filename(s, restricted=False, is_id=False):
466 """Sanitizes a string so it could be used as part of a filename.
467 If restricted is set, use a stricter subset of allowed characters.
468 Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
470 def replace_insane(char):
471 if char == '?' or ord(char) < 32 or ord(char) == 127:
474 return '' if restricted else '\''
476 return '_-' if restricted else ' -'
477 elif char in '\\/|*<>':
479 if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
481 if restricted and ord(char) > 127:
485 result = u''.join(map(replace_insane, s))
487 while '__' in result:
488 result = result.replace('__', '_')
489 result = result.strip('_')
490 # Common case of "Foreign band name - English song title"
491 if restricted and result.startswith('-_'):
497 def orderedSet(iterable):
498 """ Remove all duplicates from the input iterable """
509 assert type(s) == compat_str
511 result = re.sub(r'(?u)&(.+?);', htmlentity_transform, s)
515 def encodeFilename(s, for_subprocess=False):
517 @param s The name of the file
520 assert type(s) == compat_str
522 # Python 3 has a Unicode API
523 if sys.version_info >= (3, 0):
526 if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
527 # Pass u'' directly to use Unicode APIs on Windows 2000 and up
528 # (Detecting Windows NT 4 is tricky because 'major >= 4' would
529 # match Windows 9x series as well. Besides, NT 4 is obsolete.)
530 if not for_subprocess:
533 # For subprocess calls, encode with locale encoding
534 # Refer to http://stackoverflow.com/a/9951851/35070
535 encoding = preferredencoding()
537 encoding = sys.getfilesystemencoding()
540 return s.encode(encoding, 'ignore')
543 def decodeOption(optval):
546 if isinstance(optval, bytes):
547 optval = optval.decode(preferredencoding())
549 assert isinstance(optval, compat_str)
552 def formatSeconds(secs):
554 return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
556 return '%d:%02d' % (secs // 60, secs % 60)
561 def make_HTTPS_handler(opts_no_check_certificate, **kwargs):
562 if sys.version_info < (3, 2):
565 class HTTPSConnectionV3(httplib.HTTPSConnection):
566 def __init__(self, *args, **kwargs):
567 httplib.HTTPSConnection.__init__(self, *args, **kwargs)
570 sock = socket.create_connection((self.host, self.port), self.timeout)
571 if getattr(self, '_tunnel_host', False):
575 self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_SSLv3)
577 self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_SSLv23)
579 class HTTPSHandlerV3(compat_urllib_request.HTTPSHandler):
580 def https_open(self, req):
581 return self.do_open(HTTPSConnectionV3, req)
582 return HTTPSHandlerV3(**kwargs)
584 context = ssl.SSLContext(ssl.PROTOCOL_SSLv3)
585 context.verify_mode = (ssl.CERT_NONE
586 if opts_no_check_certificate
587 else ssl.CERT_REQUIRED)
588 context.set_default_verify_paths()
590 context.load_default_certs()
591 except AttributeError:
593 return compat_urllib_request.HTTPSHandler(context=context, **kwargs)
595 class ExtractorError(Exception):
596 """Error during info extraction."""
597 def __init__(self, msg, tb=None, expected=False, cause=None):
598 """ tb, if given, is the original traceback (so that it can be printed out).
599 If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
602 if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
605 msg = msg + u'; please report this issue on https://yt-dl.org/bug . Be sure to call youtube-dl with the --verbose flag and include its complete output. Make sure you are using the latest version; type youtube-dl -U to update.'
606 super(ExtractorError, self).__init__(msg)
609 self.exc_info = sys.exc_info() # preserve original exception
612 def format_traceback(self):
613 if self.traceback is None:
615 return u''.join(traceback.format_tb(self.traceback))
618 class RegexNotFoundError(ExtractorError):
619 """Error when a regex didn't match"""
623 class DownloadError(Exception):
624 """Download Error exception.
626 This exception may be thrown by FileDownloader objects if they are not
627 configured to continue on errors. They will contain the appropriate
630 def __init__(self, msg, exc_info=None):
631 """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
632 super(DownloadError, self).__init__(msg)
633 self.exc_info = exc_info
636 class SameFileError(Exception):
637 """Same File exception.
639 This exception will be thrown by FileDownloader objects if they detect
640 multiple files would have to be downloaded to the same file on disk.
645 class PostProcessingError(Exception):
646 """Post Processing exception.
648 This exception may be raised by PostProcessor's .run() method to
649 indicate an error in the postprocessing task.
651 def __init__(self, msg):
654 class MaxDownloadsReached(Exception):
655 """ --max-downloads limit has been reached. """
659 class UnavailableVideoError(Exception):
660 """Unavailable Format exception.
662 This exception will be thrown when a video is requested
663 in a format that is not available for that video.
668 class ContentTooShortError(Exception):
669 """Content Too Short exception.
671 This exception may be raised by FileDownloader objects when a file they
672 download is too small for what the server announced first, indicating
673 the connection was probably interrupted.
679 def __init__(self, downloaded, expected):
680 self.downloaded = downloaded
681 self.expected = expected
683 class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
684 """Handler for HTTP requests and responses.
686 This class, when installed with an OpenerDirector, automatically adds
687 the standard headers to every HTTP request and handles gzipped and
688 deflated responses from web servers. If compression is to be avoided in
689 a particular request, the original request in the program code only has
690 to include the HTTP header "Youtubedl-No-Compression", which will be
691 removed before making the real request.
693 Part of this code was copied from:
695 http://techknack.net/python-urllib2-handlers/
697 Andrew Rowls, the author of that code, agreed to release it to the
704 return zlib.decompress(data, -zlib.MAX_WBITS)
706 return zlib.decompress(data)
709 def addinfourl_wrapper(stream, headers, url, code):
710 if hasattr(compat_urllib_request.addinfourl, 'getcode'):
711 return compat_urllib_request.addinfourl(stream, headers, url, code)
712 ret = compat_urllib_request.addinfourl(stream, headers, url)
716 def http_request(self, req):
717 for h,v in std_headers.items():
721 if 'Youtubedl-no-compression' in req.headers:
722 if 'Accept-encoding' in req.headers:
723 del req.headers['Accept-encoding']
724 del req.headers['Youtubedl-no-compression']
725 if 'Youtubedl-user-agent' in req.headers:
726 if 'User-agent' in req.headers:
727 del req.headers['User-agent']
728 req.headers['User-agent'] = req.headers['Youtubedl-user-agent']
729 del req.headers['Youtubedl-user-agent']
732 def http_response(self, req, resp):
735 if resp.headers.get('Content-encoding', '') == 'gzip':
736 content = resp.read()
737 gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
739 uncompressed = io.BytesIO(gz.read())
740 except IOError as original_ioerror:
741 # There may be junk add the end of the file
742 # See http://stackoverflow.com/q/4928560/35070 for details
743 for i in range(1, 1024):
745 gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
746 uncompressed = io.BytesIO(gz.read())
751 raise original_ioerror
752 resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
753 resp.msg = old_resp.msg
755 if resp.headers.get('Content-encoding', '') == 'deflate':
756 gz = io.BytesIO(self.deflate(resp.read()))
757 resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
758 resp.msg = old_resp.msg
761 https_request = http_request
762 https_response = http_response
765 def parse_iso8601(date_str):
766 """ Return a UNIX timestamp from the given date """
772 r'Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$',
775 timezone = datetime.timedelta()
777 date_str = date_str[:-len(m.group(0))]
778 if not m.group('sign'):
779 timezone = datetime.timedelta()
781 sign = 1 if m.group('sign') == '+' else -1
782 timezone = datetime.timedelta(
783 hours=sign * int(m.group('hours')),
784 minutes=sign * int(m.group('minutes')))
786 dt = datetime.datetime.strptime(date_str, '%Y-%m-%dT%H:%M:%S') - timezone
787 return calendar.timegm(dt.timetuple())
790 def unified_strdate(date_str):
791 """Return a string with the date in the format YYYYMMDD"""
798 date_str = date_str.replace(',', ' ')
799 # %z (UTC offset) is only supported in python>=3.2
800 date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str)
801 format_expressions = [
813 '%Y-%m-%dT%H:%M:%SZ',
814 '%Y-%m-%dT%H:%M:%S.%fZ',
815 '%Y-%m-%dT%H:%M:%S.%f0Z',
817 '%Y-%m-%dT%H:%M:%S.%f',
820 for expression in format_expressions:
822 upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
825 if upload_date is None:
826 timetuple = email.utils.parsedate_tz(date_str)
828 upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
831 def determine_ext(url, default_ext=u'unknown_video'):
832 guess = url.partition(u'?')[0].rpartition(u'.')[2]
833 if re.match(r'^[A-Za-z0-9]+$', guess):
def subtitles_filename(filename, sub_lang, sub_format):
    """Build the subtitle file name: <base>.<language>.<format>."""
    base = filename.rsplit('.', 1)[0]
    return u'.'.join([base, sub_lang, sub_format])
841 def date_from_str(date_str):
843 Return a datetime object from a string in the format YYYYMMDD or
844 (now|today)[+-][0-9](day|week|month|year)(s)?"""
845 today = datetime.date.today()
846 if date_str == 'now'or date_str == 'today':
848 match = re.match('(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
849 if match is not None:
850 sign = match.group('sign')
851 time = int(match.group('time'))
854 unit = match.group('unit')
863 delta = datetime.timedelta(**{unit: time})
865 return datetime.datetime.strptime(date_str, "%Y%m%d").date()
867 def hyphenate_date(date_str):
869 Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
870 match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
871 if match is not None:
872 return '-'.join(match.groups())
876 class DateRange(object):
877 """Represents a time interval between two dates"""
878 def __init__(self, start=None, end=None):
879 """start and end must be strings in the format accepted by date"""
880 if start is not None:
881 self.start = date_from_str(start)
883 self.start = datetime.datetime.min.date()
885 self.end = date_from_str(end)
887 self.end = datetime.datetime.max.date()
888 if self.start > self.end:
889 raise ValueError('Date range: "%s" , the start date must be before the end date' % self)
892 """Returns a range that only contains the given day"""
894 def __contains__(self, date):
895 """Check if the date is in the range"""
896 if not isinstance(date, datetime.date):
897 date = date_from_str(date)
898 return self.start <= date <= self.end
900 return '%s - %s' % ( self.start.isoformat(), self.end.isoformat())
904 """ Returns the platform name as a compat_str """
905 res = platform.platform()
906 if isinstance(res, bytes):
907 res = res.decode(preferredencoding())
909 assert isinstance(res, compat_str)
913 def write_string(s, out=None):
916 assert type(s) == compat_str
918 if ('b' in getattr(out, 'mode', '') or
919 sys.version_info[0] < 3): # Python 2 lies about mode of sys.stderr
920 s = s.encode(preferredencoding(), 'ignore')
923 except UnicodeEncodeError:
924 # In Windows shells, this can fail even when the codec is just charmap!?
925 # See https://wiki.python.org/moin/PrintFails#Issue
926 if sys.platform == 'win32' and hasattr(out, 'encoding'):
927 s = s.encode(out.encoding, 'ignore').decode(out.encoding)
935 def bytes_to_intlist(bs):
938 if isinstance(bs[0], int): # Python 3
941 return [ord(c) for c in bs]
944 def intlist_to_bytes(xs):
947 if isinstance(chr(0), bytes): # Python 2
948 return ''.join([chr(x) for x in xs])
def get_cachedir(params=None):
    """Return the cache directory for youtube-dl.

    Uses params['cachedir'] when present, otherwise
    $XDG_CACHE_HOME/youtube-dl (with ~/.cache as the XDG fallback).
    """
    # None instead of a mutable {} default (shared-dict anti-pattern);
    # behavior for existing callers is unchanged.
    if params is None:
        params = {}
    cache_root = os.environ.get('XDG_CACHE_HOME',
                                os.path.expanduser('~/.cache'))
    return params.get('cachedir', os.path.join(cache_root, 'youtube-dl'))
959 # Cross-platform file locking
960 if sys.platform == 'win32':
961 import ctypes.wintypes
964 class OVERLAPPED(ctypes.Structure):
966 ('Internal', ctypes.wintypes.LPVOID),
967 ('InternalHigh', ctypes.wintypes.LPVOID),
968 ('Offset', ctypes.wintypes.DWORD),
969 ('OffsetHigh', ctypes.wintypes.DWORD),
970 ('hEvent', ctypes.wintypes.HANDLE),
973 kernel32 = ctypes.windll.kernel32
974 LockFileEx = kernel32.LockFileEx
975 LockFileEx.argtypes = [
976 ctypes.wintypes.HANDLE, # hFile
977 ctypes.wintypes.DWORD, # dwFlags
978 ctypes.wintypes.DWORD, # dwReserved
979 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
980 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
981 ctypes.POINTER(OVERLAPPED) # Overlapped
983 LockFileEx.restype = ctypes.wintypes.BOOL
984 UnlockFileEx = kernel32.UnlockFileEx
985 UnlockFileEx.argtypes = [
986 ctypes.wintypes.HANDLE, # hFile
987 ctypes.wintypes.DWORD, # dwReserved
988 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
989 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
990 ctypes.POINTER(OVERLAPPED) # Overlapped
992 UnlockFileEx.restype = ctypes.wintypes.BOOL
993 whole_low = 0xffffffff
994 whole_high = 0x7fffffff
    def _lock_file(f, exclusive):
        # The OVERLAPPED structure carries the lock's starting file offset
        # (0 here, i.e. lock from the beginning of the file).
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        # Stash the pointer on the file object so it stays alive until
        # _unlock_file passes the same OVERLAPPED to UnlockFileEx.
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)
        handle = msvcrt.get_osfhandle(f.fileno())
        # dwFlags: 0x2 requests an exclusive lock, 0x0 a shared one;
        # whole_low/whole_high give the byte range to lock.
        if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
                          whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Locking file failed: %r' % ctypes.FormatError())
    def _unlock_file(f):
        # Requires the OVERLAPPED pointer stored by _lock_file on this
        # same file object.
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        if not UnlockFileEx(handle, 0,
                            whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())
1017 def _lock_file(f, exclusive):
1018 fcntl.lockf(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)
1020 def _unlock_file(f):
1021 fcntl.lockf(f, fcntl.LOCK_UN)
1024 class locked_file(object):
1025 def __init__(self, filename, mode, encoding=None):
1026 assert mode in ['r', 'a', 'w']
1027 self.f = io.open(filename, mode, encoding=encoding)
1030 def __enter__(self):
1031 exclusive = self.mode != 'r'
1033 _lock_file(self.f, exclusive)
1039 def __exit__(self, etype, value, traceback):
1041 _unlock_file(self.f)
    def write(self, *args):
        # Forward directly to the wrapped file object.
        return self.f.write(*args)
    def read(self, *args):
        # Forward directly to the wrapped file object.
        return self.f.read(*args)
1055 def shell_quote(args):
1057 encoding = sys.getfilesystemencoding()
1058 if encoding is None:
1061 if isinstance(a, bytes):
1062 # We may get a filename encoded with 'encodeFilename'
1063 a = a.decode(encoding)
1064 quoted_args.append(pipes.quote(a))
1065 return u' '.join(quoted_args)
1068 def takewhile_inclusive(pred, seq):
1069 """ Like itertools.takewhile, but include the latest evaluated element
1070 (the first element so that Not pred(e)) """
1077 def smuggle_url(url, data):
1078 """ Pass additional data in a URL for internal use. """
1080 sdata = compat_urllib_parse.urlencode(
1081 {u'__youtubedl_smuggle': json.dumps(data)})
1082 return url + u'#' + sdata
1085 def unsmuggle_url(smug_url, default=None):
1086 if not '#__youtubedl_smuggle' in smug_url:
1087 return smug_url, default
1088 url, _, sdata = smug_url.rpartition(u'#')
1089 jsond = compat_parse_qs(sdata)[u'__youtubedl_smuggle'][0]
1090 data = json.loads(jsond)
1094 def format_bytes(bytes):
1097 if type(bytes) is str:
1098 bytes = float(bytes)
1102 exponent = int(math.log(bytes, 1024.0))
1103 suffix = [u'B', u'KiB', u'MiB', u'GiB', u'TiB', u'PiB', u'EiB', u'ZiB', u'YiB'][exponent]
1104 converted = float(bytes) / float(1024 ** exponent)
1105 return u'%.2f%s' % (converted, suffix)
1108 def str_to_int(int_str):
1109 int_str = re.sub(r'[,\.]', u'', int_str)
1113 def get_term_width():
1114 columns = os.environ.get('COLUMNS', None)
1119 sp = subprocess.Popen(
1121 stdout=subprocess.PIPE, stderr=subprocess.PIPE)
1122 out, err = sp.communicate()
1123 return int(out.split()[1])
1129 def month_by_name(name):
1130 """ Return the number of a month by (locale-independently) English name """
1133 u'January', u'February', u'March', u'April', u'May', u'June',
1134 u'July', u'August', u'September', u'October', u'November', u'December']
1136 return ENGLISH_NAMES.index(name) + 1
1141 def fix_xml_ampersands(xml_str):
1142 """Replace all the '&' by '&' in XML"""
1144 r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
1149 def setproctitle(title):
1150 assert isinstance(title, compat_str)
1152 libc = ctypes.cdll.LoadLibrary("libc.so.6")
1155 title_bytes = title.encode('utf-8')
1156 buf = ctypes.create_string_buffer(len(title_bytes))
1157 buf.value = title_bytes
1159 libc.prctl(15, buf, 0, 0, 0)
1160 except AttributeError:
1161 return # Strange libc, just skip this
1164 def remove_start(s, start):
1165 if s.startswith(start):
1166 return s[len(start):]
def url_basename(url):
    """Return the last path component of url ('' when the path is empty)."""
    parsed_path = compat_urlparse.urlparse(url).path
    return parsed_path.strip(u'/').split(u'/')[-1]
1175 class HEADRequest(compat_urllib_request.Request):
1176 def get_method(self):
def int_or_none(v, scale=1, default=None):
    """Convert v to an int divided by scale; return default when v is None.

    The new keyword-only-by-convention `default` parameter generalizes the
    original (which always yielded None for None input) and is fully
    backward-compatible.
    """
    return default if v is None else (int(v) // scale)
1184 def parse_duration(s):
1189 r'(?:(?:(?P<hours>[0-9]+)[:h])?(?P<mins>[0-9]+)[:m])?(?P<secs>[0-9]+)s?(?::[0-9]+)?$', s)
1192 res = int(m.group('secs'))
1194 res += int(m.group('mins')) * 60
1195 if m.group('hours'):
1196 res += int(m.group('hours')) * 60 * 60
def prepend_extension(filename, ext):
    """Insert ext before the real extension: ('a.mp4', 'temp') -> 'a.temp.mp4'."""
    base, orig_ext = os.path.splitext(filename)
    return u'{0}.{1}{2}'.format(base, ext, orig_ext)
1205 def check_executable(exe, args=[]):
1206 """ Checks if the given binary is installed somewhere in PATH, and returns its name.
1207 args can be a list of arguments for a short output (like -version) """
1209 subprocess.Popen([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
1215 class PagedList(object):
1216 def __init__(self, pagefunc, pagesize):
1217 self._pagefunc = pagefunc
1218 self._pagesize = pagesize
1221 # This is only useful for tests
1222 return len(self.getslice())
1224 def getslice(self, start=0, end=None):
1226 for pagenum in itertools.count(start // self._pagesize):
1227 firstid = pagenum * self._pagesize
1228 nextfirstid = pagenum * self._pagesize + self._pagesize
1229 if start >= nextfirstid:
1232 page_results = list(self._pagefunc(pagenum))
1235 start % self._pagesize
1236 if firstid <= start < nextfirstid
1240 ((end - 1) % self._pagesize) + 1
1241 if (end is not None and firstid <= end <= nextfirstid)
1244 if startv != 0 or endv is not None:
1245 page_results = page_results[startv:endv]
1246 res.extend(page_results)
1248 # A little optimization - if current page is not "full", ie. does
1249 # not contain page_size videos then we can assume that this page
1250 # is the last one - there are no more ids on further pages -
1251 # i.e. no need to query again.
1252 if len(page_results) + startv < self._pagesize:
1255 # If we got the whole page, but the next page is not interesting,
1256 # break out early as well
1257 if end == nextfirstid:
1262 def uppercase_escape(s):
1264 r'\\U([0-9a-fA-F]{8})',
1265 lambda m: compat_chr(int(m.group(1), base=16)), s)
1268 struct.pack(u'!I', 0)
1270 # In Python 2.6 (and some 2.7 versions), struct requires a bytes argument
    def struct_pack(spec, *args):
        # struct.pack on Python 2.6 (and some 2.7 versions) requires a bytes
        # format string (see the comment above), so encode unicode specs.
        if isinstance(spec, compat_str):
            spec = spec.encode('ascii')
        return struct.pack(spec, *args)
    def struct_unpack(spec, *args):
        # Mirror of struct_pack: encode unicode format strings to bytes
        # before delegating to struct.unpack.
        if isinstance(spec, compat_str):
            spec = spec.encode('ascii')
        return struct.unpack(spec, *args)
1281 struct_pack = struct.pack
1282 struct_unpack = struct.unpack
1285 def read_batch_urls(batch_fd):
1287 if not isinstance(url, compat_str):
1288 url = url.decode('utf-8', 'replace')
1289 BOM_UTF8 = u'\xef\xbb\xbf'
1290 if url.startswith(BOM_UTF8):
1291 url = url[len(BOM_UTF8):]
1293 if url.startswith(('#', ';', ']')):
1297 with contextlib.closing(batch_fd) as fd:
1298 return [url for url in map(fixup, fd) if url]
def urlencode_postdata(*args, **kargs):
    """URL-encode the given arguments and return ASCII bytes for a POST body."""
    encoded = compat_urllib_parse.urlencode(*args, **kargs)
    return encoded.encode('ascii')
    class TreeBuilder(xml.etree.ElementTree.TreeBuilder):
        def doctype(self, name, pubid, system):
            # Accept and discard DOCTYPE declarations instead of the base
            # TreeBuilder's handling of them.
            pass # Ignore doctypes
1310 parser = xml.etree.ElementTree.XMLParser(target=TreeBuilder())
1311 kwargs = {'parser': parser} if sys.version_info >= (2, 7) else {}
1312 return xml.etree.ElementTree.XML(s.encode('utf-8'), **kwargs)
1315 if sys.version_info < (3, 0) and sys.platform == 'win32':
    def compat_getpass(prompt, *args, **kwargs):
        # This branch only runs on Python 2 under win32 (see the guard
        # above); encode unicode prompts with the locale's preferred
        # encoding before delegating to getpass.
        if isinstance(prompt, compat_str):
            prompt = prompt.encode(preferredencoding())
        return getpass.getpass(prompt, *args, **kwargs)
1321 compat_getpass = getpass.getpass