2 # -*- coding: utf-8 -*-
28 import xml.etree.ElementTree
32 import urllib.request as compat_urllib_request
33 except ImportError: # Python 2
34 import urllib2 as compat_urllib_request
37 import urllib.error as compat_urllib_error
38 except ImportError: # Python 2
39 import urllib2 as compat_urllib_error
42 import urllib.parse as compat_urllib_parse
43 except ImportError: # Python 2
44 import urllib as compat_urllib_parse
47 from urllib.parse import urlparse as compat_urllib_parse_urlparse
48 except ImportError: # Python 2
49 from urlparse import urlparse as compat_urllib_parse_urlparse
52 import urllib.parse as compat_urlparse
53 except ImportError: # Python 2
54 import urlparse as compat_urlparse
57 import http.cookiejar as compat_cookiejar
58 except ImportError: # Python 2
59 import cookielib as compat_cookiejar
62 import html.entities as compat_html_entities
63 except ImportError: # Python 2
64 import htmlentitydefs as compat_html_entities
67 import html.parser as compat_html_parser
68 except ImportError: # Python 2
69 import HTMLParser as compat_html_parser
72 import http.client as compat_http_client
73 except ImportError: # Python 2
74 import httplib as compat_http_client
77 from urllib.error import HTTPError as compat_HTTPError
78 except ImportError: # Python 2
79 from urllib2 import HTTPError as compat_HTTPError
82 from urllib.request import urlretrieve as compat_urlretrieve
83 except ImportError: # Python 2
84 from urllib import urlretrieve as compat_urlretrieve
88 from subprocess import DEVNULL
89 compat_subprocess_get_DEVNULL = lambda: DEVNULL
91 compat_subprocess_get_DEVNULL = lambda: open(os.path.devnull, 'w')
94 from urllib.parse import parse_qs as compat_parse_qs
95 except ImportError: # Python 2
96 # HACK: The following is the correct parse_qs implementation from cpython 3's stdlib.
97 # Python 2's version is apparently totally broken
98 def _unquote(string, encoding='utf-8', errors='replace'):
101 res = string.split('%')
108 # pct_sequence: contiguous sequence of percent-encoded bytes, decoded
115 pct_sequence += item[:2].decode('hex')
118 # This segment was just a single percent-encoded character.
119 # May be part of a sequence of code units, so delay decoding.
120 # (Stored in pct_sequence).
124 # Encountered non-percent-encoded characters. Flush the current
126 string += pct_sequence.decode(encoding, errors) + rest
129 # Flush the final pct_sequence
130 string += pct_sequence.decode(encoding, errors)
133 def _parse_qsl(qs, keep_blank_values=False, strict_parsing=False,
134 encoding='utf-8', errors='replace'):
135 qs, _coerce_result = qs, unicode
136 pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')]
138 for name_value in pairs:
139 if not name_value and not strict_parsing:
141 nv = name_value.split('=', 1)
144 raise ValueError("bad query field: %r" % (name_value,))
145 # Handle case of a control-name with no equal sign
146 if keep_blank_values:
150 if len(nv[1]) or keep_blank_values:
151 name = nv[0].replace('+', ' ')
152 name = _unquote(name, encoding=encoding, errors=errors)
153 name = _coerce_result(name)
154 value = nv[1].replace('+', ' ')
155 value = _unquote(value, encoding=encoding, errors=errors)
156 value = _coerce_result(value)
157 r.append((name, value))
160 def compat_parse_qs(qs, keep_blank_values=False, strict_parsing=False,
161 encoding='utf-8', errors='replace'):
163 pairs = _parse_qsl(qs, keep_blank_values, strict_parsing,
164 encoding=encoding, errors=errors)
165 for name, value in pairs:
166 if name in parsed_result:
167 parsed_result[name].append(value)
169 parsed_result[name] = [value]
173 compat_str = unicode # Python 2
178 compat_chr = unichr # Python 2
183 from xml.etree.ElementTree import ParseError as compat_xml_parse_error
184 except ImportError: # Python 2.6
185 from xml.parsers.expat import ExpatError as compat_xml_parse_error
188 if type(c) is int: return c
191 # This is not clearly defined otherwise
192 compiled_regex_type = type(re.compile(''))
195 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0 (Chrome)',
196 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
197 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
198 'Accept-Encoding': 'gzip, deflate',
199 'Accept-Language': 'en-us,en;q=0.5',
202 def preferredencoding():
203 """Get preferred encoding.
205 Returns the best encoding scheme for the system, based on
206 locale.getpreferredencoding() and some further tweaks.
209 pref = locale.getpreferredencoding()
216 if sys.version_info < (3,0):
218 print(s.encode(preferredencoding(), 'xmlcharrefreplace'))
221 assert type(s) == type(u'')
224 # In Python 2.x, json.dump expects a bytestream.
225 # In Python 3.x, it writes to a character stream
226 if sys.version_info < (3,0):
227 def write_json_file(obj, fn):
228 with open(fn, 'wb') as f:
231 def write_json_file(obj, fn):
232 with open(fn, 'w', encoding='utf-8') as f:
235 if sys.version_info >= (2,7):
def find_xpath_attr(node, xpath, key, val):
    """Return the first match of xpath[@key='val'] below node.

    key and val are checked (with assert) against conservative character
    whitelists before they are interpolated into the XPath predicate.
    The result is whatever node.find() yields for the composed expression.
    """
    key_ok = re.match(r'^[a-zA-Z]+$', key)
    assert key_ok
    val_ok = re.match(r'^[a-zA-Z0-9@\s:._]*$', val)
    assert val_ok
    predicate = u"[@%s='%s']" % (key, val)
    return node.find(xpath + predicate)
243 def find_xpath_attr(node, xpath, key, val):
244 for f in node.findall(xpath):
245 if f.attrib.get(key) == val:
249 # On python2.6 the xml.etree.ElementTree.Element methods don't support
250 # the namespace parameter
251 def xpath_with_ns(path, ns_map):
252 components = [c.split(':') for c in path.split('/')]
256 replaced.append(c[0])
259 replaced.append('{%s}%s' % (ns_map[ns], tag))
260 return '/'.join(replaced)
262 def htmlentity_transform(matchobj):
263 """Transforms an HTML entity to a character.
265 This function receives a match object and is intended to be used with
266 the re.sub() function.
268 entity = matchobj.group(1)
270 # Known non-numeric HTML entity
271 if entity in compat_html_entities.name2codepoint:
272 return compat_chr(compat_html_entities.name2codepoint[entity])
274 mobj = re.match(u'(?u)#(x?\\d+)', entity)
276 numstr = mobj.group(1)
277 if numstr.startswith(u'x'):
279 numstr = u'0%s' % numstr
282 return compat_chr(int(numstr, base))
284 # Unknown entity in name, return its literal representation
285 return (u'&%s;' % entity)
287 compat_html_parser.locatestarttagend = re.compile(r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re.VERBOSE) # backport bugfix
288 class BaseHTMLParser(compat_html_parser.HTMLParser):
290 compat_html_parser.HTMLParser.__init__(self)
293 def loads(self, html):
298 class AttrParser(BaseHTMLParser):
299 """Modified HTMLParser that isolates a tag with the specified attribute"""
300 def __init__(self, attribute, value):
301 self.attribute = attribute
306 self.watch_startpos = False
308 BaseHTMLParser.__init__(self)
310 def error(self, message):
311 if self.error_count > 10 or self.started:
312 raise compat_html_parser.HTMLParseError(message, self.getpos())
313 self.rawdata = '\n'.join(self.html.split('\n')[self.getpos()[0]:]) # skip one line
314 self.error_count += 1
317 def handle_starttag(self, tag, attrs):
320 self.find_startpos(None)
321 if self.attribute in attrs and attrs[self.attribute] == self.value:
324 self.watch_startpos = True
326 if not tag in self.depth: self.depth[tag] = 0
329 def handle_endtag(self, tag):
331 if tag in self.depth: self.depth[tag] -= 1
332 if self.depth[self.result[0]] == 0:
334 self.result.append(self.getpos())
336 def find_startpos(self, x):
337 """Needed to put the start position of the result (self.result[1])
338 after the opening tag with the requested id"""
339 if self.watch_startpos:
340 self.watch_startpos = False
341 self.result.append(self.getpos())
342 handle_entityref = handle_charref = handle_data = handle_comment = \
343 handle_decl = handle_pi = unknown_decl = find_startpos
345 def get_result(self):
346 if self.result is None:
348 if len(self.result) != 3:
350 lines = self.html.split('\n')
351 lines = lines[self.result[1][0]-1:self.result[2][0]]
352 lines[0] = lines[0][self.result[1][1]:]
354 lines[-1] = lines[-1][:self.result[2][1]-self.result[1][1]]
355 lines[-1] = lines[-1][:self.result[2][1]]
356 return '\n'.join(lines).strip()
# Hack for https://github.com/rg3/youtube-dl/issues/662
# On interpreters older than 2.7.3, AttrParser.parse_endtag is replaced by a
# version that steps over the literal "</scr'+'ipt>" instead of handing it to
# the stock HTMLParser end-tag parser.
if sys.version_info < (2, 7, 3):
    def _skip_split_script_endtag(self, i):
        marker = "</scr'+'ipt>"
        if self.rawdata[i:].startswith(marker):
            return i + len(marker)
        return compat_html_parser.HTMLParser.parse_endtag(self, i)

    AttrParser.parse_endtag = _skip_split_script_endtag
    del _skip_split_script_endtag
def get_element_by_id(id, html):
    """Return the content of the tag whose "id" attribute equals id in html.

    Convenience wrapper: delegates to get_element_by_attribute and returns
    its result unchanged.
    """
    attribute = "id"
    return get_element_by_attribute(attribute, id, html)
368 def get_element_by_attribute(attribute, value, html):
369 """Return the content of the tag with the specified attribute in the passed HTML document"""
370 parser = AttrParser(attribute, value)
373 except compat_html_parser.HTMLParseError:
375 return parser.get_result()
377 class MetaParser(BaseHTMLParser):
379 Modified HTMLParser that isolates a meta tag with the specified name
382 def __init__(self, name):
383 BaseHTMLParser.__init__(self)
388 def handle_starttag(self, tag, attrs):
392 if attrs.get('name') == self.name:
393 self.result = attrs.get('content')
395 def get_result(self):
398 def get_meta_content(name, html):
400 Return the content attribute from the meta tag with the given name attribute.
402 parser = MetaParser(name)
405 except compat_html_parser.HTMLParseError:
407 return parser.get_result()
410 def clean_html(html):
411 """Clean an HTML snippet into a readable string"""
413 html = html.replace('\n', ' ')
414 html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
415 html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
417 html = re.sub('<.*?>', '', html)
418 # Replace html entities
419 html = unescapeHTML(html)
423 def sanitize_open(filename, open_mode):
424 """Try to open the given filename, and slightly tweak it if this fails.
426 Attempts to open the given filename. If this fails, it tries to change
427 the filename slightly, step by step, until it's either able to open it
428 or it fails and raises a final exception, like the standard open()
431 It returns the tuple (stream, definitive_file_name).
435 if sys.platform == 'win32':
437 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
438 return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
439 stream = open(encodeFilename(filename), open_mode)
440 return (stream, filename)
441 except (IOError, OSError) as err:
442 if err.errno in (errno.EACCES,):
445 # In case of error, try to remove win32 forbidden chars
446 alt_filename = os.path.join(
447 re.sub(u'[/<>:"\\|\\\\?\\*]', u'#', path_part)
448 for path_part in os.path.split(filename)
450 if alt_filename == filename:
453 # An exception here should be caught in the caller
454 stream = open(encodeFilename(filename), open_mode)
455 return (stream, alt_filename)
458 def timeconvert(timestr):
459 """Convert RFC 2822 defined time string into system timestamp"""
461 timetuple = email.utils.parsedate_tz(timestr)
462 if timetuple is not None:
463 timestamp = email.utils.mktime_tz(timetuple)
466 def sanitize_filename(s, restricted=False, is_id=False):
467 """Sanitizes a string so it could be used as part of a filename.
468 If restricted is set, use a stricter subset of allowed characters.
469 Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
471 def replace_insane(char):
472 if char == '?' or ord(char) < 32 or ord(char) == 127:
475 return '' if restricted else '\''
477 return '_-' if restricted else ' -'
478 elif char in '\\/|*<>':
480 if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
482 if restricted and ord(char) > 127:
486 result = u''.join(map(replace_insane, s))
488 while '__' in result:
489 result = result.replace('__', '_')
490 result = result.strip('_')
491 # Common case of "Foreign band name - English song title"
492 if restricted and result.startswith('-_'):
498 def orderedSet(iterable):
499 """ Remove all duplicates from the input iterable """
510 assert type(s) == compat_str
512 result = re.sub(r'(?u)&(.+?);', htmlentity_transform, s)
516 def encodeFilename(s, for_subprocess=False):
518 @param s The name of the file
521 assert type(s) == compat_str
523 # Python 3 has a Unicode API
524 if sys.version_info >= (3, 0):
527 if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
528 # Pass u'' directly to use Unicode APIs on Windows 2000 and up
529 # (Detecting Windows NT 4 is tricky because 'major >= 4' would
530 # match Windows 9x series as well. Besides, NT 4 is obsolete.)
531 if not for_subprocess:
534 # For subprocess calls, encode with locale encoding
535 # Refer to http://stackoverflow.com/a/9951851/35070
536 encoding = preferredencoding()
538 encoding = sys.getfilesystemencoding()
541 return s.encode(encoding, 'ignore')
543 def decodeOption(optval):
546 if isinstance(optval, bytes):
547 optval = optval.decode(preferredencoding())
549 assert isinstance(optval, compat_str)
552 def formatSeconds(secs):
554 return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
556 return '%d:%02d' % (secs // 60, secs % 60)
561 def make_HTTPS_handler(opts_no_check_certificate, **kwargs):
562 if sys.version_info < (3, 2):
565 class HTTPSConnectionV3(httplib.HTTPSConnection):
566 def __init__(self, *args, **kwargs):
567 httplib.HTTPSConnection.__init__(self, *args, **kwargs)
570 sock = socket.create_connection((self.host, self.port), self.timeout)
571 if getattr(self, '_tunnel_host', False):
575 self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_SSLv3)
577 self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_SSLv23)
579 class HTTPSHandlerV3(compat_urllib_request.HTTPSHandler):
580 def https_open(self, req):
581 return self.do_open(HTTPSConnectionV3, req)
582 return HTTPSHandlerV3(**kwargs)
584 context = ssl.SSLContext(ssl.PROTOCOL_SSLv3)
585 context.verify_mode = (ssl.CERT_NONE
586 if opts_no_check_certificate
587 else ssl.CERT_REQUIRED)
588 context.set_default_verify_paths()
590 context.load_default_certs()
591 except AttributeError:
593 return compat_urllib_request.HTTPSHandler(context=context, **kwargs)
595 class ExtractorError(Exception):
596 """Error during info extraction."""
597 def __init__(self, msg, tb=None, expected=False, cause=None):
598 """ tb, if given, is the original traceback (so that it can be printed out).
599 If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
602 if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
605 msg = msg + u'; please report this issue on https://yt-dl.org/bug . Be sure to call youtube-dl with the --verbose flag and include its complete output. Make sure you are using the latest version; type youtube-dl -U to update.'
606 super(ExtractorError, self).__init__(msg)
609 self.exc_info = sys.exc_info() # preserve original exception
612 def format_traceback(self):
613 if self.traceback is None:
615 return u''.join(traceback.format_tb(self.traceback))
618 class RegexNotFoundError(ExtractorError):
619 """Error when a regex didn't match"""
623 class DownloadError(Exception):
624 """Download Error exception.
626 This exception may be thrown by FileDownloader objects if they are not
627 configured to continue on errors. They will contain the appropriate
630 def __init__(self, msg, exc_info=None):
631 """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
632 super(DownloadError, self).__init__(msg)
633 self.exc_info = exc_info
636 class SameFileError(Exception):
637 """Same File exception.
639 This exception will be thrown by FileDownloader objects if they detect
640 multiple files would have to be downloaded to the same file on disk.
645 class PostProcessingError(Exception):
646 """Post Processing exception.
648 This exception may be raised by PostProcessor's .run() method to
649 indicate an error in the postprocessing task.
651 def __init__(self, msg):
654 class MaxDownloadsReached(Exception):
655 """ --max-downloads limit has been reached. """
659 class UnavailableVideoError(Exception):
660 """Unavailable Format exception.
662 This exception will be thrown when a video is requested
663 in a format that is not available for that video.
668 class ContentTooShortError(Exception):
669 """Content Too Short exception.
671 This exception may be raised by FileDownloader objects when a file they
672 download is too small for what the server announced first, indicating
673 the connection was probably interrupted.
679 def __init__(self, downloaded, expected):
680 self.downloaded = downloaded
681 self.expected = expected
683 class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
684 """Handler for HTTP requests and responses.
686 This class, when installed with an OpenerDirector, automatically adds
687 the standard headers to every HTTP request and handles gzipped and
688 deflated responses from web servers. If compression is to be avoided in
689 a particular request, the original request in the program code only has
690 to include the HTTP header "Youtubedl-No-Compression", which will be
691 removed before making the real request.
693 Part of this code was copied from:
695 http://techknack.net/python-urllib2-handlers/
697 Andrew Rowls, the author of that code, agreed to release it to the
704 return zlib.decompress(data, -zlib.MAX_WBITS)
706 return zlib.decompress(data)
709 def addinfourl_wrapper(stream, headers, url, code):
710 if hasattr(compat_urllib_request.addinfourl, 'getcode'):
711 return compat_urllib_request.addinfourl(stream, headers, url, code)
712 ret = compat_urllib_request.addinfourl(stream, headers, url)
716 def http_request(self, req):
717 for h,v in std_headers.items():
721 if 'Youtubedl-no-compression' in req.headers:
722 if 'Accept-encoding' in req.headers:
723 del req.headers['Accept-encoding']
724 del req.headers['Youtubedl-no-compression']
725 if 'Youtubedl-user-agent' in req.headers:
726 if 'User-agent' in req.headers:
727 del req.headers['User-agent']
728 req.headers['User-agent'] = req.headers['Youtubedl-user-agent']
729 del req.headers['Youtubedl-user-agent']
732 def http_response(self, req, resp):
735 if resp.headers.get('Content-encoding', '') == 'gzip':
736 content = resp.read()
737 gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
739 uncompressed = io.BytesIO(gz.read())
740 except IOError as original_ioerror:
741 # There may be junk at the end of the file
742 # See http://stackoverflow.com/q/4928560/35070 for details
743 for i in range(1, 1024):
745 gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
746 uncompressed = io.BytesIO(gz.read())
751 raise original_ioerror
752 resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
753 resp.msg = old_resp.msg
755 if resp.headers.get('Content-encoding', '') == 'deflate':
756 gz = io.BytesIO(self.deflate(resp.read()))
757 resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
758 resp.msg = old_resp.msg
761 https_request = http_request
762 https_response = http_response
765 def parse_iso8601(date_str):
766 """ Return a UNIX timestamp from the given date """
772 r'Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$',
775 timezone = datetime.timedelta()
777 date_str = date_str[:-len(m.group(0))]
778 if not m.group('sign'):
779 timezone = datetime.timedelta()
781 sign = 1 if m.group('sign') == '+' else -1
782 timezone = datetime.timedelta(
783 hours=sign * int(m.group('hours')),
784 minutes=sign * int(m.group('minutes')))
786 dt = datetime.datetime.strptime(date_str, '%Y-%m-%dT%H:%M:%S') - timezone
787 return calendar.timegm(dt.timetuple())
790 def unified_strdate(date_str):
791 """Return a string with the date in the format YYYYMMDD"""
798 date_str = date_str.replace(',', ' ')
799 # %z (UTC offset) is only supported in python>=3.2
800 date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str)
801 format_expressions = [
813 '%Y-%m-%dT%H:%M:%SZ',
814 '%Y-%m-%dT%H:%M:%S.%fZ',
815 '%Y-%m-%dT%H:%M:%S.%f0Z',
817 '%Y-%m-%dT%H:%M:%S.%f',
820 for expression in format_expressions:
822 upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
825 if upload_date is None:
826 timetuple = email.utils.parsedate_tz(date_str)
828 upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
831 def determine_ext(url, default_ext=u'unknown_video'):
832 guess = url.partition(u'?')[0].rpartition(u'.')[2]
833 if re.match(r'^[A-Za-z0-9]+$', guess):
def subtitles_filename(filename, sub_lang, sub_format):
    """Build the subtitle file name belonging to filename.

    Everything after the last '.' of filename is dropped and replaced by
    '<sub_lang>.<sub_format>'. A filename without any '.' is kept whole.
    """
    stem = filename.rsplit('.', 1)[0]
    return u'.'.join((stem, sub_lang, sub_format))
841 def date_from_str(date_str):
843 Return a datetime object from a string in the format YYYYMMDD or
844 (now|today)[+-][0-9](day|week|month|year)(s)?"""
845 today = datetime.date.today()
846 if date_str == 'now'or date_str == 'today':
848 match = re.match('(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
849 if match is not None:
850 sign = match.group('sign')
851 time = int(match.group('time'))
854 unit = match.group('unit')
863 delta = datetime.timedelta(**{unit: time})
865 return datetime.datetime.strptime(date_str, "%Y%m%d").date()
867 def hyphenate_date(date_str):
869 Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
870 match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
871 if match is not None:
872 return '-'.join(match.groups())
876 class DateRange(object):
877 """Represents a time interval between two dates"""
878 def __init__(self, start=None, end=None):
879 """start and end must be strings in the format accepted by date"""
880 if start is not None:
881 self.start = date_from_str(start)
883 self.start = datetime.datetime.min.date()
885 self.end = date_from_str(end)
887 self.end = datetime.datetime.max.date()
888 if self.start > self.end:
889 raise ValueError('Date range: "%s" , the start date must be before the end date' % self)
892 """Returns a range that only contains the given day"""
894 def __contains__(self, date):
895 """Check if the date is in the range"""
896 if not isinstance(date, datetime.date):
897 date = date_from_str(date)
898 return self.start <= date <= self.end
900 return '%s - %s' % ( self.start.isoformat(), self.end.isoformat())
904 """ Returns the platform name as a compat_str """
905 res = platform.platform()
906 if isinstance(res, bytes):
907 res = res.decode(preferredencoding())
909 assert isinstance(res, compat_str)
913 def _windows_write_string(s, out):
914 """ Returns True if the string was written using special methods,
915 False if it has yet to be written out."""
916 # Adapted from http://stackoverflow.com/a/3259271/35070
919 import ctypes.wintypes
926 fileno = out.fileno()
927 if fileno not in WIN_OUTPUT_IDS:
930 GetStdHandle = ctypes.WINFUNCTYPE(
931 ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
932 ("GetStdHandle", ctypes.windll.kernel32))
933 h = GetStdHandle(WIN_OUTPUT_IDS[fileno])
935 WriteConsoleW = ctypes.WINFUNCTYPE(
936 ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
937 ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
938 ctypes.wintypes.LPVOID)(("WriteConsoleW", ctypes.windll.kernel32))
939 written = ctypes.wintypes.DWORD(0)
941 GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)(("GetFileType", ctypes.windll.kernel32))
942 FILE_TYPE_CHAR = 0x0002
943 FILE_TYPE_REMOTE = 0x8000
944 GetConsoleMode = ctypes.WINFUNCTYPE(
945 ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
946 ctypes.POINTER(ctypes.wintypes.DWORD))(
947 ("GetConsoleMode", ctypes.windll.kernel32))
948 INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value
950 def not_a_console(handle):
951 if handle == INVALID_HANDLE_VALUE or handle is None:
953 return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR
954 or GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)
959 def next_nonbmp_pos(s):
961 return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
962 except StopIteration:
966 count = min(next_nonbmp_pos(s), 1024)
969 h, s, count if count else 2, ctypes.byref(written), None)
971 raise OSError('Failed to write string')
972 if not count: # We just wrote a non-BMP character
973 assert written.value == 2
976 assert written.value > 0
977 s = s[written.value:]
981 def write_string(s, out=None, encoding=None):
984 assert type(s) == compat_str
986 if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
987 if _windows_write_string(s, out):
990 if ('b' in getattr(out, 'mode', '') or
991 sys.version_info[0] < 3): # Python 2 lies about mode of sys.stderr
992 byt = s.encode(encoding or preferredencoding(), 'ignore')
994 elif hasattr(out, 'buffer'):
995 enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
996 byt = s.encode(enc, 'ignore')
997 out.buffer.write(byt)
1003 def bytes_to_intlist(bs):
1006 if isinstance(bs[0], int): # Python 3
1009 return [ord(c) for c in bs]
1012 def intlist_to_bytes(xs):
1015 if isinstance(chr(0), bytes): # Python 2
1016 return ''.join([chr(x) for x in xs])
def get_cachedir(params=None):
    """Return the directory youtube-dl should use for its cache.

    params is an optional mapping of options. If it contains the key
    'cachedir', that value is returned as-is. Otherwise the result is
    '<cache root>/youtube-dl', where the cache root is $XDG_CACHE_HOME or,
    when that variable is unset or empty, '~/.cache' expanded for the
    current user.
    """
    if params is None:
        params = {}
    # The XDG Base Directory spec treats an empty variable like an unset one;
    # using '' as the root would yield a path relative to the working dir.
    cache_root = (os.environ.get('XDG_CACHE_HOME')
                  or os.path.expanduser('~/.cache'))
    return params.get('cachedir', os.path.join(cache_root, 'youtube-dl'))
1027 # Cross-platform file locking
1028 if sys.platform == 'win32':
1029 import ctypes.wintypes
1032 class OVERLAPPED(ctypes.Structure):
1034 ('Internal', ctypes.wintypes.LPVOID),
1035 ('InternalHigh', ctypes.wintypes.LPVOID),
1036 ('Offset', ctypes.wintypes.DWORD),
1037 ('OffsetHigh', ctypes.wintypes.DWORD),
1038 ('hEvent', ctypes.wintypes.HANDLE),
1041 kernel32 = ctypes.windll.kernel32
1042 LockFileEx = kernel32.LockFileEx
1043 LockFileEx.argtypes = [
1044 ctypes.wintypes.HANDLE, # hFile
1045 ctypes.wintypes.DWORD, # dwFlags
1046 ctypes.wintypes.DWORD, # dwReserved
1047 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
1048 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
1049 ctypes.POINTER(OVERLAPPED) # Overlapped
1051 LockFileEx.restype = ctypes.wintypes.BOOL
1052 UnlockFileEx = kernel32.UnlockFileEx
1053 UnlockFileEx.argtypes = [
1054 ctypes.wintypes.HANDLE, # hFile
1055 ctypes.wintypes.DWORD, # dwReserved
1056 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
1057 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
1058 ctypes.POINTER(OVERLAPPED) # Overlapped
1060 UnlockFileEx.restype = ctypes.wintypes.BOOL
1061 whole_low = 0xffffffff
1062 whole_high = 0x7fffffff
def _lock_file(f, exclusive):
    """Lock the file object f through the Win32 LockFileEx call.

    A zeroed OVERLAPPED structure is created and a pointer to it is kept on
    f as _lock_file_overlapped_p so that _unlock_file can pass the same
    pointer later. The lock flag is 0x2 when exclusive is true, otherwise 0,
    and the range is given by the module-level whole_low / whole_high.
    Raises OSError if LockFileEx reports failure.
    """
    ov = OVERLAPPED()
    ov.Offset = ov.OffsetHigh = 0
    ov.hEvent = 0
    ov_ptr = ctypes.pointer(ov)
    f._lock_file_overlapped_p = ov_ptr
    flags = 0x2 if exclusive else 0x0
    os_handle = msvcrt.get_osfhandle(f.fileno())
    locked = LockFileEx(os_handle, flags, 0, whole_low, whole_high, ov_ptr)
    if not locked:
        raise OSError('Locking file failed: %r' % ctypes.FormatError())
def _unlock_file(f):
    """Release the lock taken by _lock_file on the file object f.

    Uses the OVERLAPPED pointer stored on f (asserted to be truthy) and the
    module-level whole_low / whole_high range in a call to UnlockFileEx.
    Raises OSError if UnlockFileEx reports failure.
    """
    ov_ptr = f._lock_file_overlapped_p
    assert ov_ptr
    os_handle = msvcrt.get_osfhandle(f.fileno())
    unlocked = UnlockFileEx(os_handle, 0, whole_low, whole_high, ov_ptr)
    if not unlocked:
        raise OSError('Unlocking file failed: %r' % ctypes.FormatError())
1085 def _lock_file(f, exclusive):
1086 fcntl.lockf(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)
1088 def _unlock_file(f):
1089 fcntl.lockf(f, fcntl.LOCK_UN)
1092 class locked_file(object):
1093 def __init__(self, filename, mode, encoding=None):
1094 assert mode in ['r', 'a', 'w']
1095 self.f = io.open(filename, mode, encoding=encoding)
1098 def __enter__(self):
1099 exclusive = self.mode != 'r'
1101 _lock_file(self.f, exclusive)
1107 def __exit__(self, etype, value, traceback):
1109 _unlock_file(self.f)
1116 def write(self, *args):
1117 return self.f.write(*args)
1119 def read(self, *args):
1120 return self.f.read(*args)
1123 def shell_quote(args):
1125 encoding = sys.getfilesystemencoding()
1126 if encoding is None:
1129 if isinstance(a, bytes):
1130 # We may get a filename encoded with 'encodeFilename'
1131 a = a.decode(encoding)
1132 quoted_args.append(pipes.quote(a))
1133 return u' '.join(quoted_args)
1136 def takewhile_inclusive(pred, seq):
1137 """ Like itertools.takewhile, but include the latest evaluated element
1138 (the first element for which pred(e) is false) """
def smuggle_url(url, data):
    """Attach data to url for internal use and return the combined URL.

    data is serialized with json.dumps, URL-encoded under the key
    '__youtubedl_smuggle', and appended to url after a '#'.
    """
    payload = {u'__youtubedl_smuggle': json.dumps(data)}
    fragment = compat_urllib_parse.urlencode(payload)
    return u'#'.join((url, fragment))
1153 def unsmuggle_url(smug_url, default=None):
1154 if not '#__youtubedl_smuggle' in smug_url:
1155 return smug_url, default
1156 url, _, sdata = smug_url.rpartition(u'#')
1157 jsond = compat_parse_qs(sdata)[u'__youtubedl_smuggle'][0]
1158 data = json.loads(jsond)
1162 def format_bytes(bytes):
1165 if type(bytes) is str:
1166 bytes = float(bytes)
1170 exponent = int(math.log(bytes, 1024.0))
1171 suffix = [u'B', u'KiB', u'MiB', u'GiB', u'TiB', u'PiB', u'EiB', u'ZiB', u'YiB'][exponent]
1172 converted = float(bytes) / float(1024 ** exponent)
1173 return u'%.2f%s' % (converted, suffix)
1176 def str_to_int(int_str):
1177 int_str = re.sub(r'[,\.]', u'', int_str)
1181 def get_term_width():
1182 columns = os.environ.get('COLUMNS', None)
1187 sp = subprocess.Popen(
1189 stdout=subprocess.PIPE, stderr=subprocess.PIPE)
1190 out, err = sp.communicate()
1191 return int(out.split()[1])
1197 def month_by_name(name):
1198 """ Return the number of a month by (locale-independently) English name """
1201 u'January', u'February', u'March', u'April', u'May', u'June',
1202 u'July', u'August', u'September', u'October', u'November', u'December']
1204 return ENGLISH_NAMES.index(name) + 1
1209 def fix_xml_ampersands(xml_str):
1210 """Replace all the '&' by '&amp;' in XML"""
1212 r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
1217 def setproctitle(title):
1218 assert isinstance(title, compat_str)
1220 libc = ctypes.cdll.LoadLibrary("libc.so.6")
1223 title_bytes = title.encode('utf-8')
1224 buf = ctypes.create_string_buffer(len(title_bytes))
1225 buf.value = title_bytes
1227 libc.prctl(15, buf, 0, 0, 0)
1228 except AttributeError:
1229 return # Strange libc, just skip this
1232 def remove_start(s, start):
1233 if s.startswith(start):
1234 return s[len(start):]
def url_basename(url):
    """Return the last segment of the path component of url.

    Leading and trailing slashes of the path are ignored; query string and
    fragment are not part of the path. An empty path yields ''.
    """
    parsed = compat_urlparse.urlparse(url)
    trimmed = parsed.path.strip(u'/')
    return trimmed.rsplit(u'/', 1)[-1]
1243 class HEADRequest(compat_urllib_request.Request):
1244 def get_method(self):
def int_or_none(v, scale=1, default=None):
    """Return int(v) floor-divided by scale, or default when v is None."""
    if v is None:
        return default
    return int(v) // scale
def float_or_none(v, scale=1, default=None):
    """Return float(v) divided by scale, or default when v is None."""
    if v is None:
        return default
    return float(v) / scale
1256 def parse_duration(s):
1261 r'(?:(?:(?P<hours>[0-9]+)[:h])?(?P<mins>[0-9]+)[:m])?(?P<secs>[0-9]+)s?(?::[0-9]+)?$', s)
1264 res = int(m.group('secs'))
1266 res += int(m.group('mins')) * 60
1267 if m.group('hours'):
1268 res += int(m.group('hours')) * 60 * 60
def prepend_extension(filename, ext):
    """Insert ext in front of the existing extension of filename.

    The name is split with os.path.splitext and rebuilt as
    '<name>.<ext><original extension>'; without an original extension the
    result simply ends in '.<ext>'.
    """
    parts = os.path.splitext(filename)
    return u'{0}.{1}{2}'.format(parts[0], ext, parts[1])
1277 def check_executable(exe, args=[]):
1278 """ Checks if the given binary is installed somewhere in PATH, and returns its name.
1279 args can be a list of arguments for a short output (like -version) """
1281 subprocess.Popen([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
1287 class PagedList(object):
1288 def __init__(self, pagefunc, pagesize):
1289 self._pagefunc = pagefunc
1290 self._pagesize = pagesize
1293 # This is only useful for tests
1294 return len(self.getslice())
1296 def getslice(self, start=0, end=None):
1298 for pagenum in itertools.count(start // self._pagesize):
1299 firstid = pagenum * self._pagesize
1300 nextfirstid = pagenum * self._pagesize + self._pagesize
1301 if start >= nextfirstid:
1304 page_results = list(self._pagefunc(pagenum))
1307 start % self._pagesize
1308 if firstid <= start < nextfirstid
1312 ((end - 1) % self._pagesize) + 1
1313 if (end is not None and firstid <= end <= nextfirstid)
1316 if startv != 0 or endv is not None:
1317 page_results = page_results[startv:endv]
1318 res.extend(page_results)
1320 # A little optimization - if current page is not "full", ie. does
1321 # not contain page_size videos then we can assume that this page
1322 # is the last one - there are no more ids on further pages -
1323 # i.e. no need to query again.
1324 if len(page_results) + startv < self._pagesize:
1327 # If we got the whole page, but the next page is not interesting,
1328 # break out early as well
1329 if end == nextfirstid:
1334 def uppercase_escape(s):
1335 unicode_escape = codecs.getdecoder('unicode_escape')
1337 r'\\U[0-9a-fA-F]{8}',
1338 lambda m: unicode_escape(m.group(0))[0],
1342 struct.pack(u'!I', 0)
1344 # In Python 2.6 (and some 2.7 versions), struct requires a bytes argument
def struct_pack(spec, *args):
    """Call struct.pack, first turning a text format spec into ASCII bytes.

    In Python 2.6 (and some 2.7 versions), struct requires a bytes argument.
    """
    fmt = spec.encode('ascii') if isinstance(spec, compat_str) else spec
    return struct.pack(fmt, *args)
def struct_unpack(spec, *args):
    """Call struct.unpack, first turning a text format spec into ASCII bytes.

    In Python 2.6 (and some 2.7 versions), struct requires a bytes argument.
    """
    fmt = spec.encode('ascii') if isinstance(spec, compat_str) else spec
    return struct.unpack(fmt, *args)
1355 struct_pack = struct.pack
1356 struct_unpack = struct.unpack
1359 def read_batch_urls(batch_fd):
1361 if not isinstance(url, compat_str):
1362 url = url.decode('utf-8', 'replace')
1363 BOM_UTF8 = u'\xef\xbb\xbf'
1364 if url.startswith(BOM_UTF8):
1365 url = url[len(BOM_UTF8):]
1367 if url.startswith(('#', ';', ']')):
1371 with contextlib.closing(batch_fd) as fd:
1372 return [url for url in map(fixup, fd) if url]
def urlencode_postdata(*args, **kargs):
    """URL-encode the given data and return it as ASCII bytes.

    All arguments are passed straight through to
    compat_urllib_parse.urlencode; its text result is encoded to bytes.
    """
    encoded = compat_urllib_parse.urlencode(*args, **kargs)
    return encoded.encode('ascii')
1380 class TreeBuilder(xml.etree.ElementTree.TreeBuilder):
1381 def doctype(self, name, pubid, system):
1382 pass # Ignore doctypes
1384 parser = xml.etree.ElementTree.XMLParser(target=TreeBuilder())
1385 kwargs = {'parser': parser} if sys.version_info >= (2, 7) else {}
1386 return xml.etree.ElementTree.XML(s.encode('utf-8'), **kwargs)
1389 if sys.version_info < (3, 0) and sys.platform == 'win32':
def compat_getpass(prompt, *args, **kwargs):
    """Call getpass.getpass with a prompt encoded for the local system.

    A prompt of type compat_str is encoded with preferredencoding() before
    the call; any other prompt and all further arguments are passed on
    unchanged.
    """
    encoded_prompt = (
        prompt.encode(preferredencoding())
        if isinstance(prompt, compat_str) else prompt)
    return getpass.getpass(encoded_prompt, *args, **kwargs)
1395 compat_getpass = getpass.getpass
def strip_jsonp(code):
    """Strip a JSONP wrapper and return the payload between the parentheses.

    Recognizes 'callback( payload )' with an optional trailing ';' and
    trailing whitespace. The callback name may contain letters, digits,
    '_', '.' and '$', so generated names such as jQuery1720_1389 and
    namespaced ones such as ns.cb are accepted. Input that does not look
    like JSONP is returned unchanged.
    """
    return re.sub(
        r'(?s)^[a-zA-Z0-9_.$]+\s*\(\s*(.*)\);?\s*$', r'\1', code)
1411 def qualities(quality_ids):
1412 """ Get a numeric quality value out of a list of possible values """
1415 return quality_ids.index(qid)