2 # -*- coding: utf-8 -*-
28 import xml.etree.ElementTree
32 import urllib.request as compat_urllib_request
33 except ImportError: # Python 2
34 import urllib2 as compat_urllib_request
37 import urllib.error as compat_urllib_error
38 except ImportError: # Python 2
39 import urllib2 as compat_urllib_error
42 import urllib.parse as compat_urllib_parse
43 except ImportError: # Python 2
44 import urllib as compat_urllib_parse
47 from urllib.parse import urlparse as compat_urllib_parse_urlparse
48 except ImportError: # Python 2
49 from urlparse import urlparse as compat_urllib_parse_urlparse
52 import urllib.parse as compat_urlparse
53 except ImportError: # Python 2
54 import urlparse as compat_urlparse
57 import http.cookiejar as compat_cookiejar
58 except ImportError: # Python 2
59 import cookielib as compat_cookiejar
62 import html.entities as compat_html_entities
63 except ImportError: # Python 2
64 import htmlentitydefs as compat_html_entities
67 import html.parser as compat_html_parser
68 except ImportError: # Python 2
69 import HTMLParser as compat_html_parser
72 import http.client as compat_http_client
73 except ImportError: # Python 2
74 import httplib as compat_http_client
77 from urllib.error import HTTPError as compat_HTTPError
78 except ImportError: # Python 2
79 from urllib2 import HTTPError as compat_HTTPError
82 from urllib.request import urlretrieve as compat_urlretrieve
83 except ImportError: # Python 2
84 from urllib import urlretrieve as compat_urlretrieve
88 from subprocess import DEVNULL
89 compat_subprocess_get_DEVNULL = lambda: DEVNULL
91 compat_subprocess_get_DEVNULL = lambda: open(os.path.devnull, 'w')
94 from urllib.parse import parse_qs as compat_parse_qs
95 except ImportError: # Python 2
96 # HACK: The following is the correct parse_qs implementation from cpython 3's stdlib.
97 # Python 2's version is apparently totally broken
98 def _unquote(string, encoding='utf-8', errors='replace'):
101 res = string.split('%')
108 # pct_sequence: contiguous sequence of percent-encoded bytes, decoded
115 pct_sequence += item[:2].decode('hex')
118 # This segment was just a single percent-encoded character.
119 # May be part of a sequence of code units, so delay decoding.
120 # (Stored in pct_sequence).
124 # Encountered non-percent-encoded characters. Flush the current
126 string += pct_sequence.decode(encoding, errors) + rest
129 # Flush the final pct_sequence
130 string += pct_sequence.decode(encoding, errors)
133 def _parse_qsl(qs, keep_blank_values=False, strict_parsing=False,
134 encoding='utf-8', errors='replace'):
135 qs, _coerce_result = qs, unicode
136 pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')]
138 for name_value in pairs:
139 if not name_value and not strict_parsing:
141 nv = name_value.split('=', 1)
144 raise ValueError("bad query field: %r" % (name_value,))
145 # Handle case of a control-name with no equal sign
146 if keep_blank_values:
150 if len(nv[1]) or keep_blank_values:
151 name = nv[0].replace('+', ' ')
152 name = _unquote(name, encoding=encoding, errors=errors)
153 name = _coerce_result(name)
154 value = nv[1].replace('+', ' ')
155 value = _unquote(value, encoding=encoding, errors=errors)
156 value = _coerce_result(value)
157 r.append((name, value))
160 def compat_parse_qs(qs, keep_blank_values=False, strict_parsing=False,
161 encoding='utf-8', errors='replace'):
163 pairs = _parse_qsl(qs, keep_blank_values, strict_parsing,
164 encoding=encoding, errors=errors)
165 for name, value in pairs:
166 if name in parsed_result:
167 parsed_result[name].append(value)
169 parsed_result[name] = [value]
173 compat_str = unicode # Python 2
178 compat_chr = unichr # Python 2
183 from xml.etree.ElementTree import ParseError as compat_xml_parse_error
184 except ImportError: # Python 2.6
185 from xml.parsers.expat import ExpatError as compat_xml_parse_error
188 if type(c) is int: return c
191 # This is not clearly defined otherwise
192 compiled_regex_type = type(re.compile(''))
195 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0 (Chrome)',
196 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
197 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
198 'Accept-Encoding': 'gzip, deflate',
199 'Accept-Language': 'en-us,en;q=0.5',
202 def preferredencoding():
203 """Get preferred encoding.
205 Returns the best encoding scheme for the system, based on
206 locale.getpreferredencoding() and some further tweaks.
209 pref = locale.getpreferredencoding()
216 if sys.version_info < (3,0):
218 print(s.encode(preferredencoding(), 'xmlcharrefreplace'))
221 assert type(s) == type(u'')
224 # In Python 2.x, json.dump expects a bytestream.
225 # In Python 3.x, it writes to a character stream
226 if sys.version_info < (3,0):
227 def write_json_file(obj, fn):
228 with open(fn, 'wb') as f:
231 def write_json_file(obj, fn):
232 with open(fn, 'w', encoding='utf-8') as f:
235 if sys.version_info >= (2,7):
236 def find_xpath_attr(node, xpath, key, val):
237 """ Find the xpath xpath[@key=val] """
238 assert re.match(r'^[a-zA-Z]+$', key)
239 assert re.match(r'^[a-zA-Z0-9@\s:._]*$', val)
240 expr = xpath + u"[@%s='%s']" % (key, val)
241 return node.find(expr)
243 def find_xpath_attr(node, xpath, key, val):
244 for f in node.findall(xpath):
245 if f.attrib.get(key) == val:
249 # On python2.6 the xml.etree.ElementTree.Element methods don't support
250 # the namespace parameter
251 def xpath_with_ns(path, ns_map):
252 components = [c.split(':') for c in path.split('/')]
256 replaced.append(c[0])
259 replaced.append('{%s}%s' % (ns_map[ns], tag))
260 return '/'.join(replaced)
262 def htmlentity_transform(matchobj):
263 """Transforms an HTML entity to a character.
265 This function receives a match object and is intended to be used with
266 the re.sub() function.
268 entity = matchobj.group(1)
270 # Known non-numeric HTML entity
271 if entity in compat_html_entities.name2codepoint:
272 return compat_chr(compat_html_entities.name2codepoint[entity])
274 mobj = re.match(u'(?u)#(x?\\d+)', entity)
276 numstr = mobj.group(1)
277 if numstr.startswith(u'x'):
279 numstr = u'0%s' % numstr
282 return compat_chr(int(numstr, base))
284 # Unknown entity in name, return its literal representation
285 return (u'&%s;' % entity)
287 compat_html_parser.locatestarttagend = re.compile(r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re.VERBOSE) # backport bugfix
288 class BaseHTMLParser(compat_html_parser.HTMLParser):
290 compat_html_parser.HTMLParser.__init__(self)
293 def loads(self, html):
298 class AttrParser(BaseHTMLParser):
299 """Modified HTMLParser that isolates a tag with the specified attribute"""
300 def __init__(self, attribute, value):
301 self.attribute = attribute
306 self.watch_startpos = False
308 BaseHTMLParser.__init__(self)
310 def error(self, message):
311 if self.error_count > 10 or self.started:
312 raise compat_html_parser.HTMLParseError(message, self.getpos())
313 self.rawdata = '\n'.join(self.html.split('\n')[self.getpos()[0]:]) # skip one line
314 self.error_count += 1
317 def handle_starttag(self, tag, attrs):
320 self.find_startpos(None)
321 if self.attribute in attrs and attrs[self.attribute] == self.value:
324 self.watch_startpos = True
326 if not tag in self.depth: self.depth[tag] = 0
329 def handle_endtag(self, tag):
331 if tag in self.depth: self.depth[tag] -= 1
332 if self.depth[self.result[0]] == 0:
334 self.result.append(self.getpos())
336 def find_startpos(self, x):
337 """Needed to put the start position of the result (self.result[1])
338 after the opening tag with the requested id"""
339 if self.watch_startpos:
340 self.watch_startpos = False
341 self.result.append(self.getpos())
342 handle_entityref = handle_charref = handle_data = handle_comment = \
343 handle_decl = handle_pi = unknown_decl = find_startpos
345 def get_result(self):
346 if self.result is None:
348 if len(self.result) != 3:
350 lines = self.html.split('\n')
351 lines = lines[self.result[1][0]-1:self.result[2][0]]
352 lines[0] = lines[0][self.result[1][1]:]
354 lines[-1] = lines[-1][:self.result[2][1]-self.result[1][1]]
355 lines[-1] = lines[-1][:self.result[2][1]]
356 return '\n'.join(lines).strip()
357 # Hack for https://github.com/rg3/youtube-dl/issues/662
358 if sys.version_info < (2, 7, 3):
359 AttrParser.parse_endtag = (lambda self, i:
360 i + len("</scr'+'ipt>")
361 if self.rawdata[i:].startswith("</scr'+'ipt>")
362 else compat_html_parser.HTMLParser.parse_endtag(self, i))
def get_element_by_id(id, html):
    """Return the inner content of the first tag in *html* whose ``id``
    attribute equals *id* (thin wrapper over get_element_by_attribute)."""
    return get_element_by_attribute("id", id, html)
368 def get_element_by_attribute(attribute, value, html):
369 """Return the content of the tag with the specified attribute in the passed HTML document"""
370 parser = AttrParser(attribute, value)
373 except compat_html_parser.HTMLParseError:
375 return parser.get_result()
377 class MetaParser(BaseHTMLParser):
379 Modified HTMLParser that isolates a meta tag with the specified name
382 def __init__(self, name):
383 BaseHTMLParser.__init__(self)
388 def handle_starttag(self, tag, attrs):
392 if attrs.get('name') == self.name:
393 self.result = attrs.get('content')
395 def get_result(self):
398 def get_meta_content(name, html):
400 Return the content attribute from the meta tag with the given name attribute.
402 parser = MetaParser(name)
405 except compat_html_parser.HTMLParseError:
407 return parser.get_result()
410 def clean_html(html):
411 """Clean an HTML snippet into a readable string"""
413 html = html.replace('\n', ' ')
414 html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
415 html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
417 html = re.sub('<.*?>', '', html)
418 # Replace html entities
419 html = unescapeHTML(html)
423 def sanitize_open(filename, open_mode):
424 """Try to open the given filename, and slightly tweak it if this fails.
426 Attempts to open the given filename. If this fails, it tries to change
427 the filename slightly, step by step, until it's either able to open it
428 or it fails and raises a final exception, like the standard open()
431 It returns the tuple (stream, definitive_file_name).
435 if sys.platform == 'win32':
437 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
438 return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
439 stream = open(encodeFilename(filename), open_mode)
440 return (stream, filename)
441 except (IOError, OSError) as err:
442 if err.errno in (errno.EACCES,):
445 # In case of error, try to remove win32 forbidden chars
446 alt_filename = os.path.join(
447 re.sub(u'[/<>:"\\|\\\\?\\*]', u'#', path_part)
448 for path_part in os.path.split(filename)
450 if alt_filename == filename:
453 # An exception here should be caught in the caller
454 stream = open(encodeFilename(filename), open_mode)
455 return (stream, alt_filename)
458 def timeconvert(timestr):
459 """Convert RFC 2822 defined time string into system timestamp"""
461 timetuple = email.utils.parsedate_tz(timestr)
462 if timetuple is not None:
463 timestamp = email.utils.mktime_tz(timetuple)
466 def sanitize_filename(s, restricted=False, is_id=False):
467 """Sanitizes a string so it could be used as part of a filename.
468 If restricted is set, use a stricter subset of allowed characters.
469 Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
471 def replace_insane(char):
472 if char == '?' or ord(char) < 32 or ord(char) == 127:
475 return '' if restricted else '\''
477 return '_-' if restricted else ' -'
478 elif char in '\\/|*<>':
480 if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
482 if restricted and ord(char) > 127:
486 result = u''.join(map(replace_insane, s))
488 while '__' in result:
489 result = result.replace('__', '_')
490 result = result.strip('_')
491 # Common case of "Foreign band name - English song title"
492 if restricted and result.startswith('-_'):
498 def orderedSet(iterable):
499 """ Remove all duplicates from the input iterable """
510 assert type(s) == compat_str
512 result = re.sub(r'(?u)&(.+?);', htmlentity_transform, s)
516 def encodeFilename(s, for_subprocess=False):
518 @param s The name of the file
521 assert type(s) == compat_str
523 # Python 3 has a Unicode API
524 if sys.version_info >= (3, 0):
527 if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
528 # Pass u'' directly to use Unicode APIs on Windows 2000 and up
529 # (Detecting Windows NT 4 is tricky because 'major >= 4' would
530 # match Windows 9x series as well. Besides, NT 4 is obsolete.)
531 if not for_subprocess:
534 # For subprocess calls, encode with locale encoding
535 # Refer to http://stackoverflow.com/a/9951851/35070
536 encoding = preferredencoding()
538 encoding = sys.getfilesystemencoding()
541 return s.encode(encoding, 'ignore')
543 def decodeOption(optval):
546 if isinstance(optval, bytes):
547 optval = optval.decode(preferredencoding())
549 assert isinstance(optval, compat_str)
552 def formatSeconds(secs):
554 return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
556 return '%d:%02d' % (secs // 60, secs % 60)
561 def make_HTTPS_handler(opts_no_check_certificate, **kwargs):
562 if sys.version_info < (3, 2):
565 class HTTPSConnectionV3(httplib.HTTPSConnection):
566 def __init__(self, *args, **kwargs):
567 httplib.HTTPSConnection.__init__(self, *args, **kwargs)
570 sock = socket.create_connection((self.host, self.port), self.timeout)
571 if getattr(self, '_tunnel_host', False):
575 self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_SSLv3)
577 self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_SSLv23)
579 class HTTPSHandlerV3(compat_urllib_request.HTTPSHandler):
580 def https_open(self, req):
581 return self.do_open(HTTPSConnectionV3, req)
582 return HTTPSHandlerV3(**kwargs)
584 context = ssl.SSLContext(ssl.PROTOCOL_SSLv3)
585 context.verify_mode = (ssl.CERT_NONE
586 if opts_no_check_certificate
587 else ssl.CERT_REQUIRED)
588 context.set_default_verify_paths()
590 context.load_default_certs()
591 except AttributeError:
593 return compat_urllib_request.HTTPSHandler(context=context, **kwargs)
595 class ExtractorError(Exception):
596 """Error during info extraction."""
597 def __init__(self, msg, tb=None, expected=False, cause=None):
598 """ tb, if given, is the original traceback (so that it can be printed out).
599 If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
602 if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
605 msg = msg + u'; please report this issue on https://yt-dl.org/bug . Be sure to call youtube-dl with the --verbose flag and include its complete output. Make sure you are using the latest version; type youtube-dl -U to update.'
606 super(ExtractorError, self).__init__(msg)
609 self.exc_info = sys.exc_info() # preserve original exception
612 def format_traceback(self):
613 if self.traceback is None:
615 return u''.join(traceback.format_tb(self.traceback))
618 class RegexNotFoundError(ExtractorError):
619 """Error when a regex didn't match"""
623 class DownloadError(Exception):
624 """Download Error exception.
626 This exception may be thrown by FileDownloader objects if they are not
627 configured to continue on errors. They will contain the appropriate
630 def __init__(self, msg, exc_info=None):
631 """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
632 super(DownloadError, self).__init__(msg)
633 self.exc_info = exc_info
636 class SameFileError(Exception):
637 """Same File exception.
639 This exception will be thrown by FileDownloader objects if they detect
640 multiple files would have to be downloaded to the same file on disk.
645 class PostProcessingError(Exception):
646 """Post Processing exception.
648 This exception may be raised by PostProcessor's .run() method to
649 indicate an error in the postprocessing task.
651 def __init__(self, msg):
654 class MaxDownloadsReached(Exception):
655 """ --max-downloads limit has been reached. """
659 class UnavailableVideoError(Exception):
660 """Unavailable Format exception.
662 This exception will be thrown when a video is requested
663 in a format that is not available for that video.
668 class ContentTooShortError(Exception):
669 """Content Too Short exception.
671 This exception may be raised by FileDownloader objects when a file they
672 download is too small for what the server announced first, indicating
673 the connection was probably interrupted.
679 def __init__(self, downloaded, expected):
680 self.downloaded = downloaded
681 self.expected = expected
683 class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
684 """Handler for HTTP requests and responses.
686 This class, when installed with an OpenerDirector, automatically adds
687 the standard headers to every HTTP request and handles gzipped and
688 deflated responses from web servers. If compression is to be avoided in
689 a particular request, the original request in the program code only has
690 to include the HTTP header "Youtubedl-No-Compression", which will be
691 removed before making the real request.
693 Part of this code was copied from:
695 http://techknack.net/python-urllib2-handlers/
697 Andrew Rowls, the author of that code, agreed to release it to the
704 return zlib.decompress(data, -zlib.MAX_WBITS)
706 return zlib.decompress(data)
709 def addinfourl_wrapper(stream, headers, url, code):
710 if hasattr(compat_urllib_request.addinfourl, 'getcode'):
711 return compat_urllib_request.addinfourl(stream, headers, url, code)
712 ret = compat_urllib_request.addinfourl(stream, headers, url)
716 def http_request(self, req):
717 for h,v in std_headers.items():
721 if 'Youtubedl-no-compression' in req.headers:
722 if 'Accept-encoding' in req.headers:
723 del req.headers['Accept-encoding']
724 del req.headers['Youtubedl-no-compression']
725 if 'Youtubedl-user-agent' in req.headers:
726 if 'User-agent' in req.headers:
727 del req.headers['User-agent']
728 req.headers['User-agent'] = req.headers['Youtubedl-user-agent']
729 del req.headers['Youtubedl-user-agent']
732 def http_response(self, req, resp):
735 if resp.headers.get('Content-encoding', '') == 'gzip':
736 content = resp.read()
737 gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
739 uncompressed = io.BytesIO(gz.read())
740 except IOError as original_ioerror:
741 # There may be junk add the end of the file
742 # See http://stackoverflow.com/q/4928560/35070 for details
743 for i in range(1, 1024):
745 gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
746 uncompressed = io.BytesIO(gz.read())
751 raise original_ioerror
752 resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
753 resp.msg = old_resp.msg
755 if resp.headers.get('Content-encoding', '') == 'deflate':
756 gz = io.BytesIO(self.deflate(resp.read()))
757 resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
758 resp.msg = old_resp.msg
761 https_request = http_request
762 https_response = http_response
765 def parse_iso8601(date_str):
766 """ Return a UNIX timestamp from the given date """
772 r'Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$',
775 timezone = datetime.timedelta()
777 date_str = date_str[:-len(m.group(0))]
778 if not m.group('sign'):
779 timezone = datetime.timedelta()
781 sign = 1 if m.group('sign') == '+' else -1
782 timezone = datetime.timedelta(
783 hours=sign * int(m.group('hours')),
784 minutes=sign * int(m.group('minutes')))
786 dt = datetime.datetime.strptime(date_str, '%Y-%m-%dT%H:%M:%S') - timezone
787 return calendar.timegm(dt.timetuple())
790 def unified_strdate(date_str):
791 """Return a string with the date in the format YYYYMMDD"""
798 date_str = date_str.replace(',', ' ')
799 # %z (UTC offset) is only supported in python>=3.2
800 date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str)
801 format_expressions = [
813 '%Y-%m-%dT%H:%M:%SZ',
814 '%Y-%m-%dT%H:%M:%S.%fZ',
815 '%Y-%m-%dT%H:%M:%S.%f0Z',
817 '%Y-%m-%dT%H:%M:%S.%f',
820 for expression in format_expressions:
822 upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
825 if upload_date is None:
826 timetuple = email.utils.parsedate_tz(date_str)
828 upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
831 def determine_ext(url, default_ext=u'unknown_video'):
832 guess = url.partition(u'?')[0].rpartition(u'.')[2]
833 if re.match(r'^[A-Za-z0-9]+$', guess):
def subtitles_filename(filename, sub_lang, sub_format):
    """Build the subtitle file name for a media file.

    The media extension is dropped and replaced by '<lang>.<format>',
    e.g. 'video.mp4' + 'en' + 'srt' -> 'video.en.srt'.
    """
    # rsplit keeps any earlier dots in the name; only the last extension goes.
    base = filename.rsplit('.', 1)[0]
    return u'.'.join([base, sub_lang, sub_format])
841 def date_from_str(date_str):
843 Return a datetime object from a string in the format YYYYMMDD or
844 (now|today)[+-][0-9](day|week|month|year)(s)?"""
845 today = datetime.date.today()
846 if date_str == 'now'or date_str == 'today':
848 match = re.match('(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
849 if match is not None:
850 sign = match.group('sign')
851 time = int(match.group('time'))
854 unit = match.group('unit')
863 delta = datetime.timedelta(**{unit: time})
865 return datetime.datetime.strptime(date_str, "%Y%m%d").date()
867 def hyphenate_date(date_str):
869 Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
870 match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
871 if match is not None:
872 return '-'.join(match.groups())
876 class DateRange(object):
877 """Represents a time interval between two dates"""
878 def __init__(self, start=None, end=None):
879 """start and end must be strings in the format accepted by date"""
880 if start is not None:
881 self.start = date_from_str(start)
883 self.start = datetime.datetime.min.date()
885 self.end = date_from_str(end)
887 self.end = datetime.datetime.max.date()
888 if self.start > self.end:
889 raise ValueError('Date range: "%s" , the start date must be before the end date' % self)
892 """Returns a range that only contains the given day"""
894 def __contains__(self, date):
895 """Check if the date is in the range"""
896 if not isinstance(date, datetime.date):
897 date = date_from_str(date)
898 return self.start <= date <= self.end
900 return '%s - %s' % ( self.start.isoformat(), self.end.isoformat())
904 """ Returns the platform name as a compat_str """
905 res = platform.platform()
906 if isinstance(res, bytes):
907 res = res.decode(preferredencoding())
909 assert isinstance(res, compat_str)
913 def write_string(s, out=None, encoding=None):
916 assert type(s) == compat_str
918 if ('b' in getattr(out, 'mode', '') or
919 sys.version_info[0] < 3): # Python 2 lies about mode of sys.stderr
920 s = s.encode(encoding or preferredencoding(), 'ignore')
923 except UnicodeEncodeError:
924 # In Windows shells, this can fail even when the codec is just charmap!?
925 # See https://wiki.python.org/moin/PrintFails#Issue
926 if sys.platform == 'win32':
927 if not encoding and hasattr(out, 'encoding'):
928 encoding = out.encoding
930 b = s.encode(encoding, 'ignore').decode(encoding)
938 def bytes_to_intlist(bs):
941 if isinstance(bs[0], int): # Python 3
944 return [ord(c) for c in bs]
947 def intlist_to_bytes(xs):
950 if isinstance(chr(0), bytes): # Python 2
951 return ''.join([chr(x) for x in xs])
956 def get_cachedir(params={}):
957 cache_root = os.environ.get('XDG_CACHE_HOME',
958 os.path.expanduser('~/.cache'))
959 return params.get('cachedir', os.path.join(cache_root, 'youtube-dl'))
962 # Cross-platform file locking
963 if sys.platform == 'win32':
964 import ctypes.wintypes
967 class OVERLAPPED(ctypes.Structure):
969 ('Internal', ctypes.wintypes.LPVOID),
970 ('InternalHigh', ctypes.wintypes.LPVOID),
971 ('Offset', ctypes.wintypes.DWORD),
972 ('OffsetHigh', ctypes.wintypes.DWORD),
973 ('hEvent', ctypes.wintypes.HANDLE),
976 kernel32 = ctypes.windll.kernel32
977 LockFileEx = kernel32.LockFileEx
978 LockFileEx.argtypes = [
979 ctypes.wintypes.HANDLE, # hFile
980 ctypes.wintypes.DWORD, # dwFlags
981 ctypes.wintypes.DWORD, # dwReserved
982 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
983 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
984 ctypes.POINTER(OVERLAPPED) # Overlapped
986 LockFileEx.restype = ctypes.wintypes.BOOL
987 UnlockFileEx = kernel32.UnlockFileEx
988 UnlockFileEx.argtypes = [
989 ctypes.wintypes.HANDLE, # hFile
990 ctypes.wintypes.DWORD, # dwReserved
991 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
992 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
993 ctypes.POINTER(OVERLAPPED) # Overlapped
995 UnlockFileEx.restype = ctypes.wintypes.BOOL
996 whole_low = 0xffffffff
997 whole_high = 0x7fffffff
999 def _lock_file(f, exclusive):
1000 overlapped = OVERLAPPED()
1001 overlapped.Offset = 0
1002 overlapped.OffsetHigh = 0
1003 overlapped.hEvent = 0
1004 f._lock_file_overlapped_p = ctypes.pointer(overlapped)
1005 handle = msvcrt.get_osfhandle(f.fileno())
1006 if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
1007 whole_low, whole_high, f._lock_file_overlapped_p):
1008 raise OSError('Locking file failed: %r' % ctypes.FormatError())
1010 def _unlock_file(f):
1011 assert f._lock_file_overlapped_p
1012 handle = msvcrt.get_osfhandle(f.fileno())
1013 if not UnlockFileEx(handle, 0,
1014 whole_low, whole_high, f._lock_file_overlapped_p):
1015 raise OSError('Unlocking file failed: %r' % ctypes.FormatError())
1020 def _lock_file(f, exclusive):
1021 fcntl.lockf(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)
1023 def _unlock_file(f):
1024 fcntl.lockf(f, fcntl.LOCK_UN)
1027 class locked_file(object):
1028 def __init__(self, filename, mode, encoding=None):
1029 assert mode in ['r', 'a', 'w']
1030 self.f = io.open(filename, mode, encoding=encoding)
1033 def __enter__(self):
1034 exclusive = self.mode != 'r'
1036 _lock_file(self.f, exclusive)
1042 def __exit__(self, etype, value, traceback):
1044 _unlock_file(self.f)
1051 def write(self, *args):
1052 return self.f.write(*args)
1054 def read(self, *args):
1055 return self.f.read(*args)
1058 def shell_quote(args):
1060 encoding = sys.getfilesystemencoding()
1061 if encoding is None:
1064 if isinstance(a, bytes):
1065 # We may get a filename encoded with 'encodeFilename'
1066 a = a.decode(encoding)
1067 quoted_args.append(pipes.quote(a))
1068 return u' '.join(quoted_args)
1071 def takewhile_inclusive(pred, seq):
1072 """ Like itertools.takewhile, but include the latest evaluated element
1073 (the first element so that Not pred(e)) """
def smuggle_url(url, data):
    """Pass additional data in a URL for internal use.

    The data is JSON-serialized, URL-encoded under the
    '__youtubedl_smuggle' key and appended as the URL fragment;
    the inverse operation is performed by unsmuggle_url().
    """
    payload = {u'__youtubedl_smuggle': json.dumps(data)}
    sdata = compat_urllib_parse.urlencode(payload)
    return url + u'#' + sdata
1088 def unsmuggle_url(smug_url, default=None):
1089 if not '#__youtubedl_smuggle' in smug_url:
1090 return smug_url, default
1091 url, _, sdata = smug_url.rpartition(u'#')
1092 jsond = compat_parse_qs(sdata)[u'__youtubedl_smuggle'][0]
1093 data = json.loads(jsond)
1097 def format_bytes(bytes):
1100 if type(bytes) is str:
1101 bytes = float(bytes)
1105 exponent = int(math.log(bytes, 1024.0))
1106 suffix = [u'B', u'KiB', u'MiB', u'GiB', u'TiB', u'PiB', u'EiB', u'ZiB', u'YiB'][exponent]
1107 converted = float(bytes) / float(1024 ** exponent)
1108 return u'%.2f%s' % (converted, suffix)
1111 def str_to_int(int_str):
1112 int_str = re.sub(r'[,\.]', u'', int_str)
1116 def get_term_width():
1117 columns = os.environ.get('COLUMNS', None)
1122 sp = subprocess.Popen(
1124 stdout=subprocess.PIPE, stderr=subprocess.PIPE)
1125 out, err = sp.communicate()
1126 return int(out.split()[1])
1132 def month_by_name(name):
1133 """ Return the number of a month by (locale-independently) English name """
1136 u'January', u'February', u'March', u'April', u'May', u'June',
1137 u'July', u'August', u'September', u'October', u'November', u'December']
1139 return ENGLISH_NAMES.index(name) + 1
1144 def fix_xml_ampersands(xml_str):
1145 """Replace all the '&' by '&' in XML"""
1147 r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
1152 def setproctitle(title):
1153 assert isinstance(title, compat_str)
1155 libc = ctypes.cdll.LoadLibrary("libc.so.6")
1158 title_bytes = title.encode('utf-8')
1159 buf = ctypes.create_string_buffer(len(title_bytes))
1160 buf.value = title_bytes
1162 libc.prctl(15, buf, 0, 0, 0)
1163 except AttributeError:
1164 return # Strange libc, just skip this
1167 def remove_start(s, start):
1168 if s.startswith(start):
1169 return s[len(start):]
def url_basename(url):
    """Return the last path component of *url* (empty string when the
    path is empty or just slashes); query and fragment are ignored."""
    path = compat_urlparse.urlparse(url).path
    segments = path.strip(u'/').split(u'/')
    return segments[-1]
1178 class HEADRequest(compat_urllib_request.Request):
1179 def get_method(self):
def int_or_none(v, scale=1, default=None):
    """Coerce *v* to int, floor-divided by *scale*; return *default* when
    v is None (lets extractors pass through missing metadata safely)."""
    if v is None:
        return default
    return int(v) // scale
def float_or_none(v, scale=1, default=None):
    """Coerce *v* to float divided by *scale*; return *default* when v is
    None (true division, unlike int_or_none's floor division)."""
    if v is None:
        return default
    return float(v) / scale
1191 def parse_duration(s):
1196 r'(?:(?:(?P<hours>[0-9]+)[:h])?(?P<mins>[0-9]+)[:m])?(?P<secs>[0-9]+)s?(?::[0-9]+)?$', s)
1199 res = int(m.group('secs'))
1201 res += int(m.group('mins')) * 60
1202 if m.group('hours'):
1203 res += int(m.group('hours')) * 60 * 60
def prepend_extension(filename, ext):
    """Insert *ext* between the base name and the real extension,
    e.g. ('video.mp4', 'temp') -> 'video.temp.mp4'."""
    # splitext keeps the leading dot in real_ext (or '' when there is none),
    # so the format string only adds the dot before the inserted ext.
    root, real_ext = os.path.splitext(filename)
    return u'{0}.{1}{2}'.format(root, ext, real_ext)
1212 def check_executable(exe, args=[]):
1213 """ Checks if the given binary is installed somewhere in PATH, and returns its name.
1214 args can be a list of arguments for a short output (like -version) """
1216 subprocess.Popen([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
1222 class PagedList(object):
1223 def __init__(self, pagefunc, pagesize):
1224 self._pagefunc = pagefunc
1225 self._pagesize = pagesize
1228 # This is only useful for tests
1229 return len(self.getslice())
1231 def getslice(self, start=0, end=None):
1233 for pagenum in itertools.count(start // self._pagesize):
1234 firstid = pagenum * self._pagesize
1235 nextfirstid = pagenum * self._pagesize + self._pagesize
1236 if start >= nextfirstid:
1239 page_results = list(self._pagefunc(pagenum))
1242 start % self._pagesize
1243 if firstid <= start < nextfirstid
1247 ((end - 1) % self._pagesize) + 1
1248 if (end is not None and firstid <= end <= nextfirstid)
1251 if startv != 0 or endv is not None:
1252 page_results = page_results[startv:endv]
1253 res.extend(page_results)
1255 # A little optimization - if current page is not "full", ie. does
1256 # not contain page_size videos then we can assume that this page
1257 # is the last one - there are no more ids on further pages -
1258 # i.e. no need to query again.
1259 if len(page_results) + startv < self._pagesize:
1262 # If we got the whole page, but the next page is not interesting,
1263 # break out early as well
1264 if end == nextfirstid:
1269 def uppercase_escape(s):
1270 unicode_escape = codecs.getdecoder('unicode_escape')
1272 r'\\U[0-9a-fA-F]{8}',
1273 lambda m: unicode_escape(m.group(0))[0],
1277 struct.pack(u'!I', 0)
1279 # In Python 2.6 (and some 2.7 versions), struct requires a bytes argument
1280 def struct_pack(spec, *args):
1281 if isinstance(spec, compat_str):
1282 spec = spec.encode('ascii')
1283 return struct.pack(spec, *args)
1285 def struct_unpack(spec, *args):
1286 if isinstance(spec, compat_str):
1287 spec = spec.encode('ascii')
1288 return struct.unpack(spec, *args)
1290 struct_pack = struct.pack
1291 struct_unpack = struct.unpack
1294 def read_batch_urls(batch_fd):
1296 if not isinstance(url, compat_str):
1297 url = url.decode('utf-8', 'replace')
1298 BOM_UTF8 = u'\xef\xbb\xbf'
1299 if url.startswith(BOM_UTF8):
1300 url = url[len(BOM_UTF8):]
1302 if url.startswith(('#', ';', ']')):
1306 with contextlib.closing(batch_fd) as fd:
1307 return [url for url in map(fixup, fd) if url]
def urlencode_postdata(*args, **kargs):
    """URL-encode POST data and return it as ASCII bytes, which is what
    urllib request handlers expect for a request body."""
    encoded = compat_urllib_parse.urlencode(*args, **kargs)
    return encoded.encode('ascii')
1315 class TreeBuilder(xml.etree.ElementTree.TreeBuilder):
1316 def doctype(self, name, pubid, system):
1317 pass # Ignore doctypes
1319 parser = xml.etree.ElementTree.XMLParser(target=TreeBuilder())
1320 kwargs = {'parser': parser} if sys.version_info >= (2, 7) else {}
1321 return xml.etree.ElementTree.XML(s.encode('utf-8'), **kwargs)
1324 if sys.version_info < (3, 0) and sys.platform == 'win32':
1325 def compat_getpass(prompt, *args, **kwargs):
1326 if isinstance(prompt, compat_str):
1327 prompt = prompt.encode(preferredencoding())
1328 return getpass.getpass(prompt, *args, **kwargs)
1330 compat_getpass = getpass.getpass
def strip_jsonp(code):
    """Strip a JSONP wrapper ('callback( ... );') and return the bare
    payload; input that does not match the wrapper shape is returned
    unchanged."""
    # (?s) lets '.' span newlines so multi-line payloads are captured whole.
    jsonp_wrapper = r'(?s)^[a-zA-Z_]+\s*\(\s*(.*)\);\s*?\s*$'
    return re.sub(jsonp_wrapper, r'\1', code)