2 # -*- coding: utf-8 -*-
27 import xml.etree.ElementTree
31 import urllib.request as compat_urllib_request
32 except ImportError: # Python 2
33 import urllib2 as compat_urllib_request
36 import urllib.error as compat_urllib_error
37 except ImportError: # Python 2
38 import urllib2 as compat_urllib_error
41 import urllib.parse as compat_urllib_parse
42 except ImportError: # Python 2
43 import urllib as compat_urllib_parse
46 from urllib.parse import urlparse as compat_urllib_parse_urlparse
47 except ImportError: # Python 2
48 from urlparse import urlparse as compat_urllib_parse_urlparse
51 import urllib.parse as compat_urlparse
52 except ImportError: # Python 2
53 import urlparse as compat_urlparse
56 import http.cookiejar as compat_cookiejar
57 except ImportError: # Python 2
58 import cookielib as compat_cookiejar
61 import html.entities as compat_html_entities
62 except ImportError: # Python 2
63 import htmlentitydefs as compat_html_entities
66 import html.parser as compat_html_parser
67 except ImportError: # Python 2
68 import HTMLParser as compat_html_parser
71 import http.client as compat_http_client
72 except ImportError: # Python 2
73 import httplib as compat_http_client
76 from urllib.error import HTTPError as compat_HTTPError
77 except ImportError: # Python 2
78 from urllib2 import HTTPError as compat_HTTPError
81 from urllib.request import urlretrieve as compat_urlretrieve
82 except ImportError: # Python 2
83 from urllib import urlretrieve as compat_urlretrieve
87 from subprocess import DEVNULL
88 compat_subprocess_get_DEVNULL = lambda: DEVNULL
90 compat_subprocess_get_DEVNULL = lambda: open(os.path.devnull, 'w')
93 from urllib.parse import parse_qs as compat_parse_qs
94 except ImportError: # Python 2
95 # HACK: The following is the correct parse_qs implementation from cpython 3's stdlib.
96 # Python 2's version is apparently totally broken
97 def _unquote(string, encoding='utf-8', errors='replace'):
100 res = string.split('%')
107 # pct_sequence: contiguous sequence of percent-encoded bytes, decoded
114 pct_sequence += item[:2].decode('hex')
117 # This segment was just a single percent-encoded character.
118 # May be part of a sequence of code units, so delay decoding.
119 # (Stored in pct_sequence).
123 # Encountered non-percent-encoded characters. Flush the current
125 string += pct_sequence.decode(encoding, errors) + rest
128 # Flush the final pct_sequence
129 string += pct_sequence.decode(encoding, errors)
132 def _parse_qsl(qs, keep_blank_values=False, strict_parsing=False,
133 encoding='utf-8', errors='replace'):
134 qs, _coerce_result = qs, unicode
135 pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')]
137 for name_value in pairs:
138 if not name_value and not strict_parsing:
140 nv = name_value.split('=', 1)
143 raise ValueError("bad query field: %r" % (name_value,))
144 # Handle case of a control-name with no equal sign
145 if keep_blank_values:
149 if len(nv[1]) or keep_blank_values:
150 name = nv[0].replace('+', ' ')
151 name = _unquote(name, encoding=encoding, errors=errors)
152 name = _coerce_result(name)
153 value = nv[1].replace('+', ' ')
154 value = _unquote(value, encoding=encoding, errors=errors)
155 value = _coerce_result(value)
156 r.append((name, value))
159 def compat_parse_qs(qs, keep_blank_values=False, strict_parsing=False,
160 encoding='utf-8', errors='replace'):
162 pairs = _parse_qsl(qs, keep_blank_values, strict_parsing,
163 encoding=encoding, errors=errors)
164 for name, value in pairs:
165 if name in parsed_result:
166 parsed_result[name].append(value)
168 parsed_result[name] = [value]
172 compat_str = unicode # Python 2
177 compat_chr = unichr # Python 2
182 from xml.etree.ElementTree import ParseError as compat_xml_parse_error
183 except ImportError: # Python 2.6
184 from xml.parsers.expat import ExpatError as compat_xml_parse_error
187 if type(c) is int: return c
190 # This is not clearly defined otherwise
191 compiled_regex_type = type(re.compile(''))
194 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0 (Chrome)',
195 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
196 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
197 'Accept-Encoding': 'gzip, deflate',
198 'Accept-Language': 'en-us,en;q=0.5',
201 def preferredencoding():
202 """Get preferred encoding.
204 Returns the best encoding scheme for the system, based on
205 locale.getpreferredencoding() and some further tweaks.
208 pref = locale.getpreferredencoding()
215 if sys.version_info < (3,0):
217 print(s.encode(preferredencoding(), 'xmlcharrefreplace'))
220 assert type(s) == type(u'')
223 # In Python 2.x, json.dump expects a bytestream.
224 # In Python 3.x, it writes to a character stream
225 if sys.version_info < (3,0):
226 def write_json_file(obj, fn):
227 with open(fn, 'wb') as f:
230 def write_json_file(obj, fn):
231 with open(fn, 'w', encoding='utf-8') as f:
if sys.version_info >= (2, 7):
    def find_xpath_attr(node, xpath, key, val):
        """ Find the xpath xpath[@key=val] """
        # Only simple ASCII keys/values are allowed, so the value can be
        # interpolated into the xpath expression without escaping.
        assert re.match(r'^[a-zA-Z]+$', key)
        assert re.match(r'^[a-zA-Z0-9@\s:._]*$', val)
        expr = xpath + u"[@%s='%s']" % (key, val)
        return node.find(expr)
else:
    # Python 2.6's ElementTree does not support attribute predicates in
    # xpath expressions, so scan the matches and filter manually.
    def find_xpath_attr(node, xpath, key, val):
        for f in node.findall(xpath):
            if f.attrib.get(key) == val:
                return f
        return None
248 # On python2.6 the xml.etree.ElementTree.Element methods don't support
249 # the namespace parameter
250 def xpath_with_ns(path, ns_map):
251 components = [c.split(':') for c in path.split('/')]
255 replaced.append(c[0])
258 replaced.append('{%s}%s' % (ns_map[ns], tag))
259 return '/'.join(replaced)
261 def htmlentity_transform(matchobj):
262 """Transforms an HTML entity to a character.
264 This function receives a match object and is intended to be used with
265 the re.sub() function.
267 entity = matchobj.group(1)
269 # Known non-numeric HTML entity
270 if entity in compat_html_entities.name2codepoint:
271 return compat_chr(compat_html_entities.name2codepoint[entity])
273 mobj = re.match(u'(?u)#(x?\\d+)', entity)
275 numstr = mobj.group(1)
276 if numstr.startswith(u'x'):
278 numstr = u'0%s' % numstr
281 return compat_chr(int(numstr, base))
283 # Unknown entity in name, return its literal representation
284 return (u'&%s;' % entity)
286 compat_html_parser.locatestarttagend = re.compile(r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re.VERBOSE) # backport bugfix
287 class BaseHTMLParser(compat_html_parser.HTMLParser):
289 compat_html_parser.HTMLParser.__init__(self)
292 def loads(self, html):
297 class AttrParser(BaseHTMLParser):
298 """Modified HTMLParser that isolates a tag with the specified attribute"""
299 def __init__(self, attribute, value):
300 self.attribute = attribute
305 self.watch_startpos = False
307 BaseHTMLParser.__init__(self)
309 def error(self, message):
310 if self.error_count > 10 or self.started:
311 raise compat_html_parser.HTMLParseError(message, self.getpos())
312 self.rawdata = '\n'.join(self.html.split('\n')[self.getpos()[0]:]) # skip one line
313 self.error_count += 1
316 def handle_starttag(self, tag, attrs):
319 self.find_startpos(None)
320 if self.attribute in attrs and attrs[self.attribute] == self.value:
323 self.watch_startpos = True
325 if not tag in self.depth: self.depth[tag] = 0
328 def handle_endtag(self, tag):
330 if tag in self.depth: self.depth[tag] -= 1
331 if self.depth[self.result[0]] == 0:
333 self.result.append(self.getpos())
335 def find_startpos(self, x):
336 """Needed to put the start position of the result (self.result[1])
337 after the opening tag with the requested id"""
338 if self.watch_startpos:
339 self.watch_startpos = False
340 self.result.append(self.getpos())
341 handle_entityref = handle_charref = handle_data = handle_comment = \
342 handle_decl = handle_pi = unknown_decl = find_startpos
344 def get_result(self):
345 if self.result is None:
347 if len(self.result) != 3:
349 lines = self.html.split('\n')
350 lines = lines[self.result[1][0]-1:self.result[2][0]]
351 lines[0] = lines[0][self.result[1][1]:]
353 lines[-1] = lines[-1][:self.result[2][1]-self.result[1][1]]
354 lines[-1] = lines[-1][:self.result[2][1]]
355 return '\n'.join(lines).strip()
356 # Hack for https://github.com/rg3/youtube-dl/issues/662
357 if sys.version_info < (2, 7, 3):
358 AttrParser.parse_endtag = (lambda self, i:
359 i + len("</scr'+'ipt>")
360 if self.rawdata[i:].startswith("</scr'+'ipt>")
361 else compat_html_parser.HTMLParser.parse_endtag(self, i))
def get_element_by_id(id, html):
    """Return the content of the tag with the specified ID in the passed HTML document"""
    # Thin convenience wrapper around the generic attribute search.
    return get_element_by_attribute("id", id, html)
367 def get_element_by_attribute(attribute, value, html):
368 """Return the content of the tag with the specified attribute in the passed HTML document"""
369 parser = AttrParser(attribute, value)
372 except compat_html_parser.HTMLParseError:
374 return parser.get_result()
376 class MetaParser(BaseHTMLParser):
378 Modified HTMLParser that isolates a meta tag with the specified name
381 def __init__(self, name):
382 BaseHTMLParser.__init__(self)
387 def handle_starttag(self, tag, attrs):
391 if attrs.get('name') == self.name:
392 self.result = attrs.get('content')
394 def get_result(self):
397 def get_meta_content(name, html):
399 Return the content attribute from the meta tag with the given name attribute.
401 parser = MetaParser(name)
404 except compat_html_parser.HTMLParseError:
406 return parser.get_result()
def clean_html(html):
    """Clean an HTML snippet into a readable string"""
    # Newlines inside the markup are layout, not content.
    html = html.replace('\n', ' ')
    # <br> and paragraph boundaries become real newlines.
    html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
    html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
    # Strip remaining HTML tags.
    html = re.sub('<.*?>', '', html)
    # Replace html entities
    html = unescapeHTML(html)
    # The listing had lost the trailing return: the cleaned, stripped
    # string is the function's result.
    return html.strip()
422 def sanitize_open(filename, open_mode):
423 """Try to open the given filename, and slightly tweak it if this fails.
425 Attempts to open the given filename. If this fails, it tries to change
426 the filename slightly, step by step, until it's either able to open it
427 or it fails and raises a final exception, like the standard open()
430 It returns the tuple (stream, definitive_file_name).
434 if sys.platform == 'win32':
436 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
437 return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
438 stream = open(encodeFilename(filename), open_mode)
439 return (stream, filename)
440 except (IOError, OSError) as err:
441 if err.errno in (errno.EACCES,):
444 # In case of error, try to remove win32 forbidden chars
445 alt_filename = os.path.join(
446 re.sub(u'[/<>:"\\|\\\\?\\*]', u'#', path_part)
447 for path_part in os.path.split(filename)
449 if alt_filename == filename:
452 # An exception here should be caught in the caller
453 stream = open(encodeFilename(filename), open_mode)
454 return (stream, alt_filename)
457 def timeconvert(timestr):
458 """Convert RFC 2822 defined time string into system timestamp"""
460 timetuple = email.utils.parsedate_tz(timestr)
461 if timetuple is not None:
462 timestamp = email.utils.mktime_tz(timetuple)
465 def sanitize_filename(s, restricted=False, is_id=False):
466 """Sanitizes a string so it could be used as part of a filename.
467 If restricted is set, use a stricter subset of allowed characters.
468 Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
470 def replace_insane(char):
471 if char == '?' or ord(char) < 32 or ord(char) == 127:
474 return '' if restricted else '\''
476 return '_-' if restricted else ' -'
477 elif char in '\\/|*<>':
479 if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
481 if restricted and ord(char) > 127:
485 result = u''.join(map(replace_insane, s))
487 while '__' in result:
488 result = result.replace('__', '_')
489 result = result.strip('_')
490 # Common case of "Foreign band name - English song title"
491 if restricted and result.startswith('-_'):
497 def orderedSet(iterable):
498 """ Remove all duplicates from the input iterable """
509 assert type(s) == compat_str
511 result = re.sub(r'(?u)&(.+?);', htmlentity_transform, s)
515 def encodeFilename(s, for_subprocess=False):
517 @param s The name of the file
520 assert type(s) == compat_str
522 # Python 3 has a Unicode API
523 if sys.version_info >= (3, 0):
526 if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
527 # Pass u'' directly to use Unicode APIs on Windows 2000 and up
528 # (Detecting Windows NT 4 is tricky because 'major >= 4' would
529 # match Windows 9x series as well. Besides, NT 4 is obsolete.)
530 if not for_subprocess:
533 # For subprocess calls, encode with locale encoding
534 # Refer to http://stackoverflow.com/a/9951851/35070
535 encoding = preferredencoding()
537 encoding = sys.getfilesystemencoding()
540 return s.encode(encoding, 'ignore')
542 def decodeOption(optval):
545 if isinstance(optval, bytes):
546 optval = optval.decode(preferredencoding())
548 assert isinstance(optval, compat_str)
551 def formatSeconds(secs):
553 return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
555 return '%d:%02d' % (secs // 60, secs % 60)
560 def make_HTTPS_handler(opts_no_check_certificate, **kwargs):
561 if sys.version_info < (3, 2):
564 class HTTPSConnectionV3(httplib.HTTPSConnection):
565 def __init__(self, *args, **kwargs):
566 httplib.HTTPSConnection.__init__(self, *args, **kwargs)
569 sock = socket.create_connection((self.host, self.port), self.timeout)
570 if getattr(self, '_tunnel_host', False):
574 self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_SSLv3)
576 self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_SSLv23)
578 class HTTPSHandlerV3(compat_urllib_request.HTTPSHandler):
579 def https_open(self, req):
580 return self.do_open(HTTPSConnectionV3, req)
581 return HTTPSHandlerV3(**kwargs)
583 context = ssl.SSLContext(ssl.PROTOCOL_SSLv3)
584 context.verify_mode = (ssl.CERT_NONE
585 if opts_no_check_certificate
586 else ssl.CERT_REQUIRED)
587 context.set_default_verify_paths()
589 context.load_default_certs()
590 except AttributeError:
592 return compat_urllib_request.HTTPSHandler(context=context, **kwargs)
594 class ExtractorError(Exception):
595 """Error during info extraction."""
596 def __init__(self, msg, tb=None, expected=False, cause=None):
597 """ tb, if given, is the original traceback (so that it can be printed out).
598 If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
601 if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
604 msg = msg + u'; please report this issue on https://yt-dl.org/bug . Be sure to call youtube-dl with the --verbose flag and include its complete output. Make sure you are using the latest version; type youtube-dl -U to update.'
605 super(ExtractorError, self).__init__(msg)
608 self.exc_info = sys.exc_info() # preserve original exception
611 def format_traceback(self):
612 if self.traceback is None:
614 return u''.join(traceback.format_tb(self.traceback))
617 class RegexNotFoundError(ExtractorError):
618 """Error when a regex didn't match"""
622 class DownloadError(Exception):
623 """Download Error exception.
625 This exception may be thrown by FileDownloader objects if they are not
626 configured to continue on errors. They will contain the appropriate
629 def __init__(self, msg, exc_info=None):
630 """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
631 super(DownloadError, self).__init__(msg)
632 self.exc_info = exc_info
635 class SameFileError(Exception):
636 """Same File exception.
638 This exception will be thrown by FileDownloader objects if they detect
639 multiple files would have to be downloaded to the same file on disk.
644 class PostProcessingError(Exception):
645 """Post Processing exception.
647 This exception may be raised by PostProcessor's .run() method to
648 indicate an error in the postprocessing task.
650 def __init__(self, msg):
653 class MaxDownloadsReached(Exception):
654 """ --max-downloads limit has been reached. """
658 class UnavailableVideoError(Exception):
659 """Unavailable Format exception.
661 This exception will be thrown when a video is requested
662 in a format that is not available for that video.
667 class ContentTooShortError(Exception):
668 """Content Too Short exception.
670 This exception may be raised by FileDownloader objects when a file they
671 download is too small for what the server announced first, indicating
672 the connection was probably interrupted.
678 def __init__(self, downloaded, expected):
679 self.downloaded = downloaded
680 self.expected = expected
682 class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
683 """Handler for HTTP requests and responses.
685 This class, when installed with an OpenerDirector, automatically adds
686 the standard headers to every HTTP request and handles gzipped and
687 deflated responses from web servers. If compression is to be avoided in
688 a particular request, the original request in the program code only has
689 to include the HTTP header "Youtubedl-No-Compression", which will be
690 removed before making the real request.
692 Part of this code was copied from:
694 http://techknack.net/python-urllib2-handlers/
696 Andrew Rowls, the author of that code, agreed to release it to the
703 return zlib.decompress(data, -zlib.MAX_WBITS)
705 return zlib.decompress(data)
708 def addinfourl_wrapper(stream, headers, url, code):
709 if hasattr(compat_urllib_request.addinfourl, 'getcode'):
710 return compat_urllib_request.addinfourl(stream, headers, url, code)
711 ret = compat_urllib_request.addinfourl(stream, headers, url)
715 def http_request(self, req):
716 for h,v in std_headers.items():
720 if 'Youtubedl-no-compression' in req.headers:
721 if 'Accept-encoding' in req.headers:
722 del req.headers['Accept-encoding']
723 del req.headers['Youtubedl-no-compression']
724 if 'Youtubedl-user-agent' in req.headers:
725 if 'User-agent' in req.headers:
726 del req.headers['User-agent']
727 req.headers['User-agent'] = req.headers['Youtubedl-user-agent']
728 del req.headers['Youtubedl-user-agent']
731 def http_response(self, req, resp):
734 if resp.headers.get('Content-encoding', '') == 'gzip':
735 content = resp.read()
736 gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
738 uncompressed = io.BytesIO(gz.read())
739 except IOError as original_ioerror:
740 # There may be junk add the end of the file
741 # See http://stackoverflow.com/q/4928560/35070 for details
742 for i in range(1, 1024):
744 gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
745 uncompressed = io.BytesIO(gz.read())
750 raise original_ioerror
751 resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
752 resp.msg = old_resp.msg
754 if resp.headers.get('Content-encoding', '') == 'deflate':
755 gz = io.BytesIO(self.deflate(resp.read()))
756 resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
757 resp.msg = old_resp.msg
760 https_request = http_request
761 https_response = http_response
764 def parse_iso8601(date_str):
765 """ Return a UNIX timestamp from the given date """
771 r'Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$',
774 timezone = datetime.timedelta()
776 date_str = date_str[:-len(m.group(0))]
777 if not m.group('sign'):
778 timezone = datetime.timedelta()
780 sign = 1 if m.group('sign') == '+' else -1
781 timezone = datetime.timedelta(
782 hours=sign * int(m.group('hours')),
783 minutes=sign * int(m.group('minutes')))
785 dt = datetime.datetime.strptime(date_str, '%Y-%m-%dT%H:%M:%S') - timezone
786 return calendar.timegm(dt.timetuple())
789 def unified_strdate(date_str):
790 """Return a string with the date in the format YYYYMMDD"""
797 date_str = date_str.replace(',', ' ')
798 # %z (UTC offset) is only supported in python>=3.2
799 date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str)
800 format_expressions = [
812 '%Y-%m-%dT%H:%M:%SZ',
813 '%Y-%m-%dT%H:%M:%S.%fZ',
814 '%Y-%m-%dT%H:%M:%S.%f0Z',
816 '%Y-%m-%dT%H:%M:%S.%f',
819 for expression in format_expressions:
821 upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
824 if upload_date is None:
825 timetuple = email.utils.parsedate_tz(date_str)
827 upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
830 def determine_ext(url, default_ext=u'unknown_video'):
831 guess = url.partition(u'?')[0].rpartition(u'.')[2]
832 if re.match(r'^[A-Za-z0-9]+$', guess):
def subtitles_filename(filename, sub_lang, sub_format):
    """Build a subtitle file name of the form <base>.<lang>.<format>."""
    base = filename.rsplit('.', 1)[0]
    return u'.'.join((base, sub_lang, sub_format))
840 def date_from_str(date_str):
842 Return a datetime object from a string in the format YYYYMMDD or
843 (now|today)[+-][0-9](day|week|month|year)(s)?"""
844 today = datetime.date.today()
845 if date_str == 'now'or date_str == 'today':
847 match = re.match('(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
848 if match is not None:
849 sign = match.group('sign')
850 time = int(match.group('time'))
853 unit = match.group('unit')
862 delta = datetime.timedelta(**{unit: time})
864 return datetime.datetime.strptime(date_str, "%Y%m%d").date()
def hyphenate_date(date_str):
    """
    Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
    match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
    if match is not None:
        return '-'.join(match.groups())
    else:
        # Anything not in YYYYMMDD form passes through unchanged (the
        # visible code would otherwise fall off the end and return None).
        return date_str
875 class DateRange(object):
876 """Represents a time interval between two dates"""
877 def __init__(self, start=None, end=None):
878 """start and end must be strings in the format accepted by date"""
879 if start is not None:
880 self.start = date_from_str(start)
882 self.start = datetime.datetime.min.date()
884 self.end = date_from_str(end)
886 self.end = datetime.datetime.max.date()
887 if self.start > self.end:
888 raise ValueError('Date range: "%s" , the start date must be before the end date' % self)
891 """Returns a range that only contains the given day"""
893 def __contains__(self, date):
894 """Check if the date is in the range"""
895 if not isinstance(date, datetime.date):
896 date = date_from_str(date)
897 return self.start <= date <= self.end
899 return '%s - %s' % ( self.start.isoformat(), self.end.isoformat())
903 """ Returns the platform name as a compat_str """
904 res = platform.platform()
905 if isinstance(res, bytes):
906 res = res.decode(preferredencoding())
908 assert isinstance(res, compat_str)
912 def write_string(s, out=None):
915 assert type(s) == compat_str
917 if ('b' in getattr(out, 'mode', '') or
918 sys.version_info[0] < 3): # Python 2 lies about mode of sys.stderr
919 s = s.encode(preferredencoding(), 'ignore')
922 except UnicodeEncodeError:
923 # In Windows shells, this can fail even when the codec is just charmap!?
924 # See https://wiki.python.org/moin/PrintFails#Issue
925 if sys.platform == 'win32' and hasattr(out, 'encoding'):
926 s = s.encode(out.encoding, 'ignore').decode(out.encoding)
934 def bytes_to_intlist(bs):
937 if isinstance(bs[0], int): # Python 3
940 return [ord(c) for c in bs]
943 def intlist_to_bytes(xs):
946 if isinstance(chr(0), bytes): # Python 2
947 return ''.join([chr(x) for x in xs])
def get_cachedir(params=None):
    """Return youtube-dl's cache directory.

    Honors params['cachedir'] when provided; otherwise falls back to
    $XDG_CACHE_HOME/youtube-dl (or ~/.cache/youtube-dl).
    """
    # The old signature used a mutable default ({}); since the dict is only
    # read this was harmless, but None is the safe, idiomatic spelling and
    # is fully backward-compatible.
    if params is None:
        params = {}
    cache_root = os.environ.get('XDG_CACHE_HOME',
                                os.path.expanduser('~/.cache'))
    return params.get('cachedir', os.path.join(cache_root, 'youtube-dl'))
958 # Cross-platform file locking
959 if sys.platform == 'win32':
960 import ctypes.wintypes
963 class OVERLAPPED(ctypes.Structure):
965 ('Internal', ctypes.wintypes.LPVOID),
966 ('InternalHigh', ctypes.wintypes.LPVOID),
967 ('Offset', ctypes.wintypes.DWORD),
968 ('OffsetHigh', ctypes.wintypes.DWORD),
969 ('hEvent', ctypes.wintypes.HANDLE),
972 kernel32 = ctypes.windll.kernel32
973 LockFileEx = kernel32.LockFileEx
974 LockFileEx.argtypes = [
975 ctypes.wintypes.HANDLE, # hFile
976 ctypes.wintypes.DWORD, # dwFlags
977 ctypes.wintypes.DWORD, # dwReserved
978 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
979 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
980 ctypes.POINTER(OVERLAPPED) # Overlapped
982 LockFileEx.restype = ctypes.wintypes.BOOL
983 UnlockFileEx = kernel32.UnlockFileEx
984 UnlockFileEx.argtypes = [
985 ctypes.wintypes.HANDLE, # hFile
986 ctypes.wintypes.DWORD, # dwReserved
987 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
988 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
989 ctypes.POINTER(OVERLAPPED) # Overlapped
991 UnlockFileEx.restype = ctypes.wintypes.BOOL
992 whole_low = 0xffffffff
993 whole_high = 0x7fffffff
995 def _lock_file(f, exclusive):
996 overlapped = OVERLAPPED()
997 overlapped.Offset = 0
998 overlapped.OffsetHigh = 0
999 overlapped.hEvent = 0
1000 f._lock_file_overlapped_p = ctypes.pointer(overlapped)
1001 handle = msvcrt.get_osfhandle(f.fileno())
1002 if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
1003 whole_low, whole_high, f._lock_file_overlapped_p):
1004 raise OSError('Locking file failed: %r' % ctypes.FormatError())
1006 def _unlock_file(f):
1007 assert f._lock_file_overlapped_p
1008 handle = msvcrt.get_osfhandle(f.fileno())
1009 if not UnlockFileEx(handle, 0,
1010 whole_low, whole_high, f._lock_file_overlapped_p):
1011 raise OSError('Unlocking file failed: %r' % ctypes.FormatError())
1016 def _lock_file(f, exclusive):
1017 fcntl.lockf(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)
1019 def _unlock_file(f):
1020 fcntl.lockf(f, fcntl.LOCK_UN)
1023 class locked_file(object):
1024 def __init__(self, filename, mode, encoding=None):
1025 assert mode in ['r', 'a', 'w']
1026 self.f = io.open(filename, mode, encoding=encoding)
1029 def __enter__(self):
1030 exclusive = self.mode != 'r'
1032 _lock_file(self.f, exclusive)
1038 def __exit__(self, etype, value, traceback):
1040 _unlock_file(self.f)
1047 def write(self, *args):
1048 return self.f.write(*args)
1050 def read(self, *args):
1051 return self.f.read(*args)
1054 def shell_quote(args):
1056 encoding = sys.getfilesystemencoding()
1057 if encoding is None:
1060 if isinstance(a, bytes):
1061 # We may get a filename encoded with 'encodeFilename'
1062 a = a.decode(encoding)
1063 quoted_args.append(pipes.quote(a))
1064 return u' '.join(quoted_args)
def takewhile_inclusive(pred, seq):
    """ Like itertools.takewhile, but include the latest evaluated element
    (the first element so that Not pred(e)) """
    # Yield each element first, then stop *after* the first failing one,
    # which is exactly the "inclusive" difference from itertools.takewhile.
    for e in seq:
        yield e
        if not pred(e):
            return
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use. """

    # Serialize the payload and tack it onto the URL as a fragment so it
    # survives the trip without affecting the request itself.
    payload = json.dumps(data)
    fragment = compat_urllib_parse.urlencode({u'__youtubedl_smuggle': payload})
    return u'#'.join((url, fragment))
def unsmuggle_url(smug_url, default=None):
    """Recover data stowed in a URL by smuggle_url().

    Returns (url, data); when the URL carries no smuggled payload,
    returns (smug_url, default).
    """
    # `not ... in` rewritten as the idiomatic `not in`.
    if '#__youtubedl_smuggle' not in smug_url:
        return smug_url, default
    url, _, sdata = smug_url.rpartition(u'#')
    jsond = compat_parse_qs(sdata)[u'__youtubedl_smuggle'][0]
    data = json.loads(jsond)
    # The listing had lost the final return of the decoded pair.
    return url, data
def format_bytes(bytes):
    """Format a byte count as a human-readable string, e.g. u'1.00KiB'.

    Accepts int/float, a numeric string, or None (formatted as u'N/A').
    """
    if bytes is None:
        return u'N/A'
    if type(bytes) is str:
        bytes = float(bytes)
    # log(0) is undefined, so zero gets the 'B' bucket explicitly.
    if bytes == 0.0:
        exponent = 0
    else:
        exponent = int(math.log(bytes, 1024.0))
    suffix = [u'B', u'KiB', u'MiB', u'GiB', u'TiB', u'PiB', u'EiB', u'ZiB', u'YiB'][exponent]
    converted = float(bytes) / float(1024 ** exponent)
    return u'%.2f%s' % (converted, suffix)
def str_to_int(int_str):
    """Parse an int from a string, ignoring ',' and '.' thousands separators."""
    int_str = re.sub(r'[,\.]', u'', int_str)
    # The listing had lost the conversion/return of the cleaned string.
    return int(int_str)
1112 def get_term_width():
1113 columns = os.environ.get('COLUMNS', None)
1118 sp = subprocess.Popen(
1120 stdout=subprocess.PIPE, stderr=subprocess.PIPE)
1121 out, err = sp.communicate()
1122 return int(out.split()[1])
def month_by_name(name):
    """ Return the number of a month by (locale-independently) English name """
    ENGLISH_NAMES = [
        u'January', u'February', u'March', u'April', u'May', u'June',
        u'July', u'August', u'September', u'October', u'November', u'December']
    try:
        return ENGLISH_NAMES.index(name) + 1
    except ValueError:
        # Unknown month names yield None rather than propagating ValueError.
        return None
def fix_xml_ampersands(xml_str):
    """Replace all the '&' by '&amp;' in XML"""
    # Negative lookahead keeps already-escaped entities and numeric
    # character references untouched.
    return re.sub(
        r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
        u'&amp;',
        xml_str)
1148 def setproctitle(title):
1149 assert isinstance(title, compat_str)
1151 libc = ctypes.cdll.LoadLibrary("libc.so.6")
1154 title_bytes = title.encode('utf-8')
1155 buf = ctypes.create_string_buffer(len(title_bytes))
1156 buf.value = title_bytes
1158 libc.prctl(15, buf, 0, 0, 0)
1159 except AttributeError:
1160 return # Strange libc, just skip this
def remove_start(s, start):
    """Return s with the prefix `start` removed, or s unchanged if absent."""
    if s.startswith(start):
        return s[len(start):]
    # The listing had lost the fall-through return of the original string.
    return s
def url_basename(url):
    """Return the last path component of a URL (query/fragment excluded)."""
    parsed_path = compat_urlparse.urlparse(url).path
    components = parsed_path.strip(u'/').split(u'/')
    return components[-1]
1174 class HEADRequest(compat_urllib_request.Request):
1175 def get_method(self):
def int_or_none(v, scale=1, default=None):
    """Coerce v to an int scaled down by `scale`; None maps to `default`."""
    if v is None:
        return default
    return int(v) // scale
def float_or_none(v, scale=1, default=None):
    """Coerce v to a float divided by `scale`; None maps to `default`."""
    if v is None:
        return default
    return float(v) / scale
def parse_duration(s):
    """Parse a duration like u'1:20', u'01:10:05' or u'90s' into seconds.

    Returns an int, or None when s is None or unparseable.
    """
    if s is None:
        return None
    m = re.match(
        r'(?:(?:(?P<hours>[0-9]+)[:h])?(?P<mins>[0-9]+)[:m])?(?P<secs>[0-9]+)s?(?::[0-9]+)?$', s)
    if m is None:
        return None
    res = int(m.group('secs'))
    # hours can only appear when minutes did (see the nested regex groups),
    # hence the nested accumulation.
    if m.group('mins'):
        res += int(m.group('mins')) * 60
        if m.group('hours'):
            res += int(m.group('hours')) * 60 * 60
    return res
def prepend_extension(filename, ext):
    """Insert `ext` before the real extension: ('a.mp4', 'temp') -> 'a.temp.mp4'."""
    name, real_ext = os.path.splitext(filename)
    return u'%s.%s%s' % (name, ext, real_ext)
1208 def check_executable(exe, args=[]):
1209 """ Checks if the given binary is installed somewhere in PATH, and returns its name.
1210 args can be a list of arguments for a short output (like -version) """
1212 subprocess.Popen([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
1218 class PagedList(object):
1219 def __init__(self, pagefunc, pagesize):
1220 self._pagefunc = pagefunc
1221 self._pagesize = pagesize
1224 # This is only useful for tests
1225 return len(self.getslice())
1227 def getslice(self, start=0, end=None):
1229 for pagenum in itertools.count(start // self._pagesize):
1230 firstid = pagenum * self._pagesize
1231 nextfirstid = pagenum * self._pagesize + self._pagesize
1232 if start >= nextfirstid:
1235 page_results = list(self._pagefunc(pagenum))
1238 start % self._pagesize
1239 if firstid <= start < nextfirstid
1243 ((end - 1) % self._pagesize) + 1
1244 if (end is not None and firstid <= end <= nextfirstid)
1247 if startv != 0 or endv is not None:
1248 page_results = page_results[startv:endv]
1249 res.extend(page_results)
1251 # A little optimization - if current page is not "full", ie. does
1252 # not contain page_size videos then we can assume that this page
1253 # is the last one - there are no more ids on further pages -
1254 # i.e. no need to query again.
1255 if len(page_results) + startv < self._pagesize:
1258 # If we got the whole page, but the next page is not interesting,
1259 # break out early as well
1260 if end == nextfirstid:
def uppercase_escape(s):
    """Decode literal \\UXXXXXXXX escape sequences embedded in s."""
    # NOTE(review): .decode() on the matched text only exists on Python 2
    # str; under Python 3 the replacement callback would need
    # codecs.decode(..., 'unicode-escape') — confirm the target interpreter
    # before relying on the escape branch.
    return re.sub(
        r'\\U[0-9a-fA-F]{8}',
        lambda m: m.group(0).decode('unicode-escape'), s)
1271 struct.pack(u'!I', 0)
1273 # In Python 2.6 (and some 2.7 versions), struct requires a bytes argument
1274 def struct_pack(spec, *args):
1275 if isinstance(spec, compat_str):
1276 spec = spec.encode('ascii')
1277 return struct.pack(spec, *args)
1279 def struct_unpack(spec, *args):
1280 if isinstance(spec, compat_str):
1281 spec = spec.encode('ascii')
1282 return struct.unpack(spec, *args)
1284 struct_pack = struct.pack
1285 struct_unpack = struct.unpack
1288 def read_batch_urls(batch_fd):
1290 if not isinstance(url, compat_str):
1291 url = url.decode('utf-8', 'replace')
1292 BOM_UTF8 = u'\xef\xbb\xbf'
1293 if url.startswith(BOM_UTF8):
1294 url = url[len(BOM_UTF8):]
1296 if url.startswith(('#', ';', ']')):
1300 with contextlib.closing(batch_fd) as fd:
1301 return [url for url in map(fixup, fd) if url]
def urlencode_postdata(*args, **kargs):
    """URL-encode POST data and return it as ASCII bytes, ready to send."""
    encoded = compat_urllib_parse.urlencode(*args, **kargs)
    return encoded.encode('ascii')
1309 class TreeBuilder(xml.etree.ElementTree.TreeBuilder):
1310 def doctype(self, name, pubid, system):
1311 pass # Ignore doctypes
1313 parser = xml.etree.ElementTree.XMLParser(target=TreeBuilder())
1314 kwargs = {'parser': parser} if sys.version_info >= (2, 7) else {}
1315 return xml.etree.ElementTree.XML(s.encode('utf-8'), **kwargs)
if sys.version_info < (3, 0) and sys.platform == 'win32':
    # getpass on Python 2 + Windows chokes on unicode prompts, so encode
    # the prompt with the preferred encoding first.
    def compat_getpass(prompt, *args, **kwargs):
        if isinstance(prompt, compat_str):
            prompt = prompt.encode(preferredencoding())
        return getpass.getpass(prompt, *args, **kwargs)
else:
    # Everywhere else the stock implementation is fine (restored `else:`
    # that the listing had dropped).
    compat_getpass = getpass.getpass
def strip_jsonp(code):
    """Strip a JSONP wrapper — callback name, parentheses and trailing ';'."""
    jsonp_re = r'(?s)^[a-zA-Z_]+\s*\(\s*(.*)\);\s*?\s*$'
    return re.sub(jsonp_re, r'\1', code)