2 # -*- coding: utf-8 -*-
29 import xml.etree.ElementTree
33 import urllib.request as compat_urllib_request
34 except ImportError: # Python 2
35 import urllib2 as compat_urllib_request
38 import urllib.error as compat_urllib_error
39 except ImportError: # Python 2
40 import urllib2 as compat_urllib_error
43 import urllib.parse as compat_urllib_parse
44 except ImportError: # Python 2
45 import urllib as compat_urllib_parse
48 from urllib.parse import urlparse as compat_urllib_parse_urlparse
49 except ImportError: # Python 2
50 from urlparse import urlparse as compat_urllib_parse_urlparse
53 import urllib.parse as compat_urlparse
54 except ImportError: # Python 2
55 import urlparse as compat_urlparse
58 import http.cookiejar as compat_cookiejar
59 except ImportError: # Python 2
60 import cookielib as compat_cookiejar
63 import html.entities as compat_html_entities
64 except ImportError: # Python 2
65 import htmlentitydefs as compat_html_entities
68 import html.parser as compat_html_parser
69 except ImportError: # Python 2
70 import HTMLParser as compat_html_parser
73 import http.client as compat_http_client
74 except ImportError: # Python 2
75 import httplib as compat_http_client
78 from urllib.error import HTTPError as compat_HTTPError
79 except ImportError: # Python 2
80 from urllib2 import HTTPError as compat_HTTPError
83 from urllib.request import urlretrieve as compat_urlretrieve
84 except ImportError: # Python 2
85 from urllib import urlretrieve as compat_urlretrieve
89 from subprocess import DEVNULL
90 compat_subprocess_get_DEVNULL = lambda: DEVNULL
92 compat_subprocess_get_DEVNULL = lambda: open(os.path.devnull, 'w')
95 from urllib.parse import unquote as compat_urllib_parse_unquote
97 def compat_urllib_parse_unquote(string, encoding='utf-8', errors='replace'):
100 res = string.split('%')
107 # pct_sequence: contiguous sequence of percent-encoded bytes, decoded
114 pct_sequence += item[:2].decode('hex')
117 # This segment was just a single percent-encoded character.
118 # May be part of a sequence of code units, so delay decoding.
119 # (Stored in pct_sequence).
123 # Encountered non-percent-encoded characters. Flush the current
125 string += pct_sequence.decode(encoding, errors) + rest
128 # Flush the final pct_sequence
129 string += pct_sequence.decode(encoding, errors)
134 from urllib.parse import parse_qs as compat_parse_qs
135 except ImportError: # Python 2
136 # HACK: The following is the correct parse_qs implementation from cpython 3's stdlib.
137 # Python 2's version is apparently totally broken
139 def _parse_qsl(qs, keep_blank_values=False, strict_parsing=False,
140 encoding='utf-8', errors='replace'):
141 qs, _coerce_result = qs, unicode
142 pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')]
144 for name_value in pairs:
145 if not name_value and not strict_parsing:
147 nv = name_value.split('=', 1)
150 raise ValueError("bad query field: %r" % (name_value,))
151 # Handle case of a control-name with no equal sign
152 if keep_blank_values:
156 if len(nv[1]) or keep_blank_values:
157 name = nv[0].replace('+', ' ')
158 name = compat_urllib_parse_unquote(
159 name, encoding=encoding, errors=errors)
160 name = _coerce_result(name)
161 value = nv[1].replace('+', ' ')
162 value = compat_urllib_parse_unquote(
163 value, encoding=encoding, errors=errors)
164 value = _coerce_result(value)
165 r.append((name, value))
168 def compat_parse_qs(qs, keep_blank_values=False, strict_parsing=False,
169 encoding='utf-8', errors='replace'):
171 pairs = _parse_qsl(qs, keep_blank_values, strict_parsing,
172 encoding=encoding, errors=errors)
173 for name, value in pairs:
174 if name in parsed_result:
175 parsed_result[name].append(value)
177 parsed_result[name] = [value]
181 compat_str = unicode # Python 2
186 compat_chr = unichr # Python 2
191 from xml.etree.ElementTree import ParseError as compat_xml_parse_error
192 except ImportError: # Python 2.6
193 from xml.parsers.expat import ExpatError as compat_xml_parse_error
196 from shlex import quote as shlex_quote
197 except ImportError: # Python < 3.3
199 return "'" + s.replace("'", "'\"'\"'") + "'"
203 if type(c) is int: return c
206 # This is not clearly defined otherwise
207 compiled_regex_type = type(re.compile(''))
210 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0 (Chrome)',
211 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
212 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
213 'Accept-Encoding': 'gzip, deflate',
214 'Accept-Language': 'en-us,en;q=0.5',
217 def preferredencoding():
218 """Get preferred encoding.
220 Returns the best encoding scheme for the system, based on
221 locale.getpreferredencoding() and some further tweaks.
224 pref = locale.getpreferredencoding()
231 if sys.version_info < (3,0):
233 print(s.encode(preferredencoding(), 'xmlcharrefreplace'))
236 assert type(s) == type(u'')
240 def write_json_file(obj, fn):
241 """ Encode obj as JSON and write it to fn, atomically """
245 'prefix': os.path.basename(fn) + '.',
246 'dir': os.path.dirname(fn),
250 # In Python 2.x, json.dump expects a bytestream.
251 # In Python 3.x, it writes to a character stream
252 if sys.version_info < (3, 0):
260 tf = tempfile.NamedTemporaryFile(**args)
265 os.rename(tf.name, fn)
274 if sys.version_info >= (2, 7):
275 def find_xpath_attr(node, xpath, key, val):
276 """ Find the xpath xpath[@key=val] """
277 assert re.match(r'^[a-zA-Z-]+$', key)
278 assert re.match(r'^[a-zA-Z0-9@\s:._-]*$', val)
279 expr = xpath + u"[@%s='%s']" % (key, val)
280 return node.find(expr)
282 def find_xpath_attr(node, xpath, key, val):
283 # Here comes the crazy part: In 2.6, if the xpath is a unicode,
284 # .//node does not match if a node is a direct child of . !
285 if isinstance(xpath, unicode):
286 xpath = xpath.encode('ascii')
288 for f in node.findall(xpath):
289 if f.attrib.get(key) == val:
293 # On python2.6 the xml.etree.ElementTree.Element methods don't support
294 # the namespace parameter
295 def xpath_with_ns(path, ns_map):
296 components = [c.split(':') for c in path.split('/')]
300 replaced.append(c[0])
303 replaced.append('{%s}%s' % (ns_map[ns], tag))
304 return '/'.join(replaced)
307 def xpath_text(node, xpath, name=None, fatal=False):
308 if sys.version_info < (2, 7): # Crazy 2.6
309 xpath = xpath.encode('ascii')
314 name = xpath if name is None else name
315 raise ExtractorError('Could not find XML element %s' % name)
321 compat_html_parser.locatestarttagend = re.compile(r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re.VERBOSE) # backport bugfix
322 class BaseHTMLParser(compat_html_parser.HTMLParser):
324 compat_html_parser.HTMLParser.__init__(self)
327 def loads(self, html):
332 class AttrParser(BaseHTMLParser):
333 """Modified HTMLParser that isolates a tag with the specified attribute"""
334 def __init__(self, attribute, value):
335 self.attribute = attribute
340 self.watch_startpos = False
342 BaseHTMLParser.__init__(self)
344 def error(self, message):
345 if self.error_count > 10 or self.started:
346 raise compat_html_parser.HTMLParseError(message, self.getpos())
347 self.rawdata = '\n'.join(self.html.split('\n')[self.getpos()[0]:]) # skip one line
348 self.error_count += 1
351 def handle_starttag(self, tag, attrs):
354 self.find_startpos(None)
355 if self.attribute in attrs and attrs[self.attribute] == self.value:
358 self.watch_startpos = True
360 if not tag in self.depth: self.depth[tag] = 0
363 def handle_endtag(self, tag):
365 if tag in self.depth: self.depth[tag] -= 1
366 if self.depth[self.result[0]] == 0:
368 self.result.append(self.getpos())
370 def find_startpos(self, x):
371 """Needed to put the start position of the result (self.result[1])
372 after the opening tag with the requested id"""
373 if self.watch_startpos:
374 self.watch_startpos = False
375 self.result.append(self.getpos())
376 handle_entityref = handle_charref = handle_data = handle_comment = \
377 handle_decl = handle_pi = unknown_decl = find_startpos
379 def get_result(self):
380 if self.result is None:
382 if len(self.result) != 3:
384 lines = self.html.split('\n')
385 lines = lines[self.result[1][0]-1:self.result[2][0]]
386 lines[0] = lines[0][self.result[1][1]:]
388 lines[-1] = lines[-1][:self.result[2][1]-self.result[1][1]]
389 lines[-1] = lines[-1][:self.result[2][1]]
390 return '\n'.join(lines).strip()
391 # Hack for https://github.com/rg3/youtube-dl/issues/662
392 if sys.version_info < (2, 7, 3):
393 AttrParser.parse_endtag = (lambda self, i:
394 i + len("</scr'+'ipt>")
395 if self.rawdata[i:].startswith("</scr'+'ipt>")
396 else compat_html_parser.HTMLParser.parse_endtag(self, i))
def get_element_by_id(id, html):
    """Return the content of the tag with the specified ID in the passed HTML document"""
    # An "id" lookup is just an attribute lookup with attribute name "id";
    # delegate to the generic helper. (Parameter name `id` shadows the
    # builtin, but it is part of the public signature and kept as-is.)
    return get_element_by_attribute("id", id, html)
402 def get_element_by_attribute(attribute, value, html):
403 """Return the content of the tag with the specified attribute in the passed HTML document"""
404 parser = AttrParser(attribute, value)
407 except compat_html_parser.HTMLParseError:
409 return parser.get_result()
411 class MetaParser(BaseHTMLParser):
413 Modified HTMLParser that isolates a meta tag with the specified name
416 def __init__(self, name):
417 BaseHTMLParser.__init__(self)
422 def handle_starttag(self, tag, attrs):
426 if attrs.get('name') == self.name:
427 self.result = attrs.get('content')
429 def get_result(self):
432 def get_meta_content(name, html):
434 Return the content attribute from the meta tag with the given name attribute.
436 parser = MetaParser(name)
439 except compat_html_parser.HTMLParseError:
441 return parser.get_result()
444 def clean_html(html):
445 """Clean an HTML snippet into a readable string"""
447 html = html.replace('\n', ' ')
448 html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
449 html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
451 html = re.sub('<.*?>', '', html)
452 # Replace html entities
453 html = unescapeHTML(html)
457 def sanitize_open(filename, open_mode):
458 """Try to open the given filename, and slightly tweak it if this fails.
460 Attempts to open the given filename. If this fails, it tries to change
461 the filename slightly, step by step, until it's either able to open it
462 or it fails and raises a final exception, like the standard open()
465 It returns the tuple (stream, definitive_file_name).
469 if sys.platform == 'win32':
471 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
472 return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
473 stream = open(encodeFilename(filename), open_mode)
474 return (stream, filename)
475 except (IOError, OSError) as err:
476 if err.errno in (errno.EACCES,):
479 # In case of error, try to remove win32 forbidden chars
480 alt_filename = os.path.join(
481 re.sub(u'[/<>:"\\|\\\\?\\*]', u'#', path_part)
482 for path_part in os.path.split(filename)
484 if alt_filename == filename:
487 # An exception here should be caught in the caller
488 stream = open(encodeFilename(filename), open_mode)
489 return (stream, alt_filename)
492 def timeconvert(timestr):
493 """Convert RFC 2822 defined time string into system timestamp"""
495 timetuple = email.utils.parsedate_tz(timestr)
496 if timetuple is not None:
497 timestamp = email.utils.mktime_tz(timetuple)
500 def sanitize_filename(s, restricted=False, is_id=False):
501 """Sanitizes a string so it could be used as part of a filename.
502 If restricted is set, use a stricter subset of allowed characters.
503 Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
505 def replace_insane(char):
506 if char == '?' or ord(char) < 32 or ord(char) == 127:
509 return '' if restricted else '\''
511 return '_-' if restricted else ' -'
512 elif char in '\\/|*<>':
514 if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
516 if restricted and ord(char) > 127:
520 result = u''.join(map(replace_insane, s))
522 while '__' in result:
523 result = result.replace('__', '_')
524 result = result.strip('_')
525 # Common case of "Foreign band name - English song title"
526 if restricted and result.startswith('-_'):
532 def orderedSet(iterable):
533 """ Remove all duplicates from the input iterable """
541 def _htmlentity_transform(entity):
542 """Transforms an HTML entity to a character."""
543 # Known non-numeric HTML entity
544 if entity in compat_html_entities.name2codepoint:
545 return compat_chr(compat_html_entities.name2codepoint[entity])
547 mobj = re.match(r'#(x?[0-9]+)', entity)
549 numstr = mobj.group(1)
550 if numstr.startswith(u'x'):
552 numstr = u'0%s' % numstr
555 return compat_chr(int(numstr, base))
557 # Unknown entity in name, return its literal representation
558 return (u'&%s;' % entity)
564 assert type(s) == compat_str
567 r'&([^;]+);', lambda m: _htmlentity_transform(m.group(1)), s)
570 def encodeFilename(s, for_subprocess=False):
572 @param s The name of the file
575 assert type(s) == compat_str
577 # Python 3 has a Unicode API
578 if sys.version_info >= (3, 0):
581 if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
582 # Pass u'' directly to use Unicode APIs on Windows 2000 and up
583 # (Detecting Windows NT 4 is tricky because 'major >= 4' would
584 # match Windows 9x series as well. Besides, NT 4 is obsolete.)
585 if not for_subprocess:
588 # For subprocess calls, encode with locale encoding
589 # Refer to http://stackoverflow.com/a/9951851/35070
590 encoding = preferredencoding()
592 encoding = sys.getfilesystemencoding()
595 return s.encode(encoding, 'ignore')
def encodeArgument(s):
    """Encode *s* for use as a subprocess argument.

    Accepts legacy byte strings (decoded as ASCII) for backward
    compatibility, then defers to encodeFilename with for_subprocess=True.
    """
    if isinstance(s, compat_str):
        arg = s
    else:
        # Legacy code that uses byte strings
        # Uncomment the following line after fixing all post processors
        #assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
        arg = s.decode('ascii')
    return encodeFilename(arg, True)
607 def decodeOption(optval):
610 if isinstance(optval, bytes):
611 optval = optval.decode(preferredencoding())
613 assert isinstance(optval, compat_str)
616 def formatSeconds(secs):
618 return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
620 return '%d:%02d' % (secs // 60, secs % 60)
625 def make_HTTPS_handler(opts_no_check_certificate, **kwargs):
626 if sys.version_info < (3, 2):
629 class HTTPSConnectionV3(httplib.HTTPSConnection):
630 def __init__(self, *args, **kwargs):
631 httplib.HTTPSConnection.__init__(self, *args, **kwargs)
634 sock = socket.create_connection((self.host, self.port), self.timeout)
635 if getattr(self, '_tunnel_host', False):
639 self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_TLSv1)
641 self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_SSLv23)
643 class HTTPSHandlerV3(compat_urllib_request.HTTPSHandler):
644 def https_open(self, req):
645 return self.do_open(HTTPSConnectionV3, req)
646 return HTTPSHandlerV3(**kwargs)
647 elif hasattr(ssl, 'create_default_context'): # Python >= 3.4
648 context = ssl.create_default_context(ssl.Purpose.CLIENT_AUTH)
649 context.options &= ~ssl.OP_NO_SSLv3 # Allow older, not-as-secure SSLv3
650 if opts_no_check_certificate:
651 context.verify_mode = ssl.CERT_NONE
652 return compat_urllib_request.HTTPSHandler(context=context, **kwargs)
654 context = ssl.SSLContext(ssl.PROTOCOL_SSLv23)
655 context.verify_mode = (ssl.CERT_NONE
656 if opts_no_check_certificate
657 else ssl.CERT_REQUIRED)
658 context.set_default_verify_paths()
660 context.load_default_certs()
661 except AttributeError:
663 return compat_urllib_request.HTTPSHandler(context=context, **kwargs)
665 class ExtractorError(Exception):
666 """Error during info extraction."""
667 def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
668 """ tb, if given, is the original traceback (so that it can be printed out).
669 If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
672 if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
674 if video_id is not None:
675 msg = video_id + ': ' + msg
677 msg += u' (caused by %r)' % cause
679 msg = msg + u'; please report this issue on https://yt-dl.org/bug . Be sure to call youtube-dl with the --verbose flag and include its complete output. Make sure you are using the latest version; type youtube-dl -U to update.'
680 super(ExtractorError, self).__init__(msg)
683 self.exc_info = sys.exc_info() # preserve original exception
685 self.video_id = video_id
687 def format_traceback(self):
688 if self.traceback is None:
690 return u''.join(traceback.format_tb(self.traceback))
class RegexNotFoundError(ExtractorError):
    """Error when a regex didn't match (a specialized ExtractorError)."""
class DownloadError(Exception):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """
    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        super(DownloadError, self).__init__(msg)
        # Preserved so callers can inspect/re-raise the original failure.
        self.exc_info = exc_info
class SameFileError(Exception):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
class PostProcessingError(Exception):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """
    def __init__(self, msg):
        # Store the message so handlers can report it; also pass it to the
        # base Exception so str(err) is meaningful.
        super(PostProcessingError, self).__init__(msg)
        self.msg = msg
class MaxDownloadsReached(Exception):
    """ --max-downloads limit has been reached. """
class UnavailableVideoError(Exception):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """
class ContentTooShortError(Exception):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """
    def __init__(self, downloaded, expected):
        # Both byte counts are kept so the caller can report the mismatch.
        self.downloaded = downloaded
        self.expected = expected
758 class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
759 """Handler for HTTP requests and responses.
761 This class, when installed with an OpenerDirector, automatically adds
762 the standard headers to every HTTP request and handles gzipped and
763 deflated responses from web servers. If compression is to be avoided in
764 a particular request, the original request in the program code only has
765 to include the HTTP header "Youtubedl-No-Compression", which will be
766 removed before making the real request.
768 Part of this code was copied from:
770 http://techknack.net/python-urllib2-handlers/
772 Andrew Rowls, the author of that code, agreed to release it to the
779 return zlib.decompress(data, -zlib.MAX_WBITS)
781 return zlib.decompress(data)
784 def addinfourl_wrapper(stream, headers, url, code):
785 if hasattr(compat_urllib_request.addinfourl, 'getcode'):
786 return compat_urllib_request.addinfourl(stream, headers, url, code)
787 ret = compat_urllib_request.addinfourl(stream, headers, url)
791 def http_request(self, req):
792 for h, v in std_headers.items():
793 if h not in req.headers:
795 if 'Youtubedl-no-compression' in req.headers:
796 if 'Accept-encoding' in req.headers:
797 del req.headers['Accept-encoding']
798 del req.headers['Youtubedl-no-compression']
799 if 'Youtubedl-user-agent' in req.headers:
800 if 'User-agent' in req.headers:
801 del req.headers['User-agent']
802 req.headers['User-agent'] = req.headers['Youtubedl-user-agent']
803 del req.headers['Youtubedl-user-agent']
805 if sys.version_info < (2, 7) and '#' in req.get_full_url():
806 # Python 2.6 is brain-dead when it comes to fragments
807 req._Request__original = req._Request__original.partition('#')[0]
808 req._Request__r_type = req._Request__r_type.partition('#')[0]
812 def http_response(self, req, resp):
815 if resp.headers.get('Content-encoding', '') == 'gzip':
816 content = resp.read()
817 gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
819 uncompressed = io.BytesIO(gz.read())
820 except IOError as original_ioerror:
821 # There may be junk add the end of the file
822 # See http://stackoverflow.com/q/4928560/35070 for details
823 for i in range(1, 1024):
825 gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
826 uncompressed = io.BytesIO(gz.read())
831 raise original_ioerror
832 resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
833 resp.msg = old_resp.msg
835 if resp.headers.get('Content-encoding', '') == 'deflate':
836 gz = io.BytesIO(self.deflate(resp.read()))
837 resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
838 resp.msg = old_resp.msg
841 https_request = http_request
842 https_response = http_response
845 def parse_iso8601(date_str, delimiter='T'):
846 """ Return a UNIX timestamp from the given date """
852 r'Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$',
855 timezone = datetime.timedelta()
857 date_str = date_str[:-len(m.group(0))]
858 if not m.group('sign'):
859 timezone = datetime.timedelta()
861 sign = 1 if m.group('sign') == '+' else -1
862 timezone = datetime.timedelta(
863 hours=sign * int(m.group('hours')),
864 minutes=sign * int(m.group('minutes')))
865 date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
866 dt = datetime.datetime.strptime(date_str, date_format) - timezone
867 return calendar.timegm(dt.timetuple())
870 def unified_strdate(date_str):
871 """Return a string with the date in the format YYYYMMDD"""
878 date_str = date_str.replace(',', ' ')
879 # %z (UTC offset) is only supported in python>=3.2
880 date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str)
881 format_expressions = [
886 '%b %dst %Y %I:%M%p',
887 '%b %dnd %Y %I:%M%p',
888 '%b %dth %Y %I:%M%p',
897 '%Y-%m-%d %H:%M:%S.%f',
900 '%Y-%m-%dT%H:%M:%SZ',
901 '%Y-%m-%dT%H:%M:%S.%fZ',
902 '%Y-%m-%dT%H:%M:%S.%f0Z',
904 '%Y-%m-%dT%H:%M:%S.%f',
907 for expression in format_expressions:
909 upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
912 if upload_date is None:
913 timetuple = email.utils.parsedate_tz(date_str)
915 upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
918 def determine_ext(url, default_ext=u'unknown_video'):
921 guess = url.partition(u'?')[0].rpartition(u'.')[2]
922 if re.match(r'^[A-Za-z0-9]+$', guess):
def subtitles_filename(filename, sub_lang, sub_format):
    """Build a subtitle file name: <base>.<sub_lang>.<sub_format>."""
    base = filename.rsplit('.', 1)[0]
    return u'.'.join((base, sub_lang, sub_format))
930 def date_from_str(date_str):
932 Return a datetime object from a string in the format YYYYMMDD or
933 (now|today)[+-][0-9](day|week|month|year)(s)?"""
934 today = datetime.date.today()
935 if date_str == 'now'or date_str == 'today':
937 match = re.match('(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
938 if match is not None:
939 sign = match.group('sign')
940 time = int(match.group('time'))
943 unit = match.group('unit')
952 delta = datetime.timedelta(**{unit: time})
954 return datetime.datetime.strptime(date_str, "%Y%m%d").date()
956 def hyphenate_date(date_str):
958 Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
959 match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
960 if match is not None:
961 return '-'.join(match.groups())
965 class DateRange(object):
966 """Represents a time interval between two dates"""
967 def __init__(self, start=None, end=None):
968 """start and end must be strings in the format accepted by date"""
969 if start is not None:
970 self.start = date_from_str(start)
972 self.start = datetime.datetime.min.date()
974 self.end = date_from_str(end)
976 self.end = datetime.datetime.max.date()
977 if self.start > self.end:
978 raise ValueError('Date range: "%s" , the start date must be before the end date' % self)
981 """Returns a range that only contains the given day"""
983 def __contains__(self, date):
984 """Check if the date is in the range"""
985 if not isinstance(date, datetime.date):
986 date = date_from_str(date)
987 return self.start <= date <= self.end
989 return '%s - %s' % ( self.start.isoformat(), self.end.isoformat())
993 """ Returns the platform name as a compat_str """
994 res = platform.platform()
995 if isinstance(res, bytes):
996 res = res.decode(preferredencoding())
998 assert isinstance(res, compat_str)
1002 def _windows_write_string(s, out):
1003 """ Returns True if the string was written using special methods,
1004 False if it has yet to be written out."""
1005 # Adapted from http://stackoverflow.com/a/3259271/35070
1008 import ctypes.wintypes
1016 fileno = out.fileno()
1017 except AttributeError:
1018 # If the output stream doesn't have a fileno, it's virtual
1020 if fileno not in WIN_OUTPUT_IDS:
1023 GetStdHandle = ctypes.WINFUNCTYPE(
1024 ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
1025 ("GetStdHandle", ctypes.windll.kernel32))
1026 h = GetStdHandle(WIN_OUTPUT_IDS[fileno])
1028 WriteConsoleW = ctypes.WINFUNCTYPE(
1029 ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
1030 ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
1031 ctypes.wintypes.LPVOID)(("WriteConsoleW", ctypes.windll.kernel32))
1032 written = ctypes.wintypes.DWORD(0)
1034 GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)(("GetFileType", ctypes.windll.kernel32))
1035 FILE_TYPE_CHAR = 0x0002
1036 FILE_TYPE_REMOTE = 0x8000
1037 GetConsoleMode = ctypes.WINFUNCTYPE(
1038 ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
1039 ctypes.POINTER(ctypes.wintypes.DWORD))(
1040 ("GetConsoleMode", ctypes.windll.kernel32))
1041 INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value
1043 def not_a_console(handle):
1044 if handle == INVALID_HANDLE_VALUE or handle is None:
1046 return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR
1047 or GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)
1049 if not_a_console(h):
1052 def next_nonbmp_pos(s):
1054 return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
1055 except StopIteration:
1059 count = min(next_nonbmp_pos(s), 1024)
1061 ret = WriteConsoleW(
1062 h, s, count if count else 2, ctypes.byref(written), None)
1064 raise OSError('Failed to write string')
1065 if not count: # We just wrote a non-BMP character
1066 assert written.value == 2
1069 assert written.value > 0
1070 s = s[written.value:]
1074 def write_string(s, out=None, encoding=None):
1077 assert type(s) == compat_str
1079 if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
1080 if _windows_write_string(s, out):
1083 if ('b' in getattr(out, 'mode', '') or
1084 sys.version_info[0] < 3): # Python 2 lies about mode of sys.stderr
1085 byt = s.encode(encoding or preferredencoding(), 'ignore')
1087 elif hasattr(out, 'buffer'):
1088 enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
1089 byt = s.encode(enc, 'ignore')
1090 out.buffer.write(byt)
1096 def bytes_to_intlist(bs):
1099 if isinstance(bs[0], int): # Python 3
1102 return [ord(c) for c in bs]
1105 def intlist_to_bytes(xs):
1108 if isinstance(chr(0), bytes): # Python 2
1109 return ''.join([chr(x) for x in xs])
1114 # Cross-platform file locking
1115 if sys.platform == 'win32':
1116 import ctypes.wintypes
1119 class OVERLAPPED(ctypes.Structure):
1121 ('Internal', ctypes.wintypes.LPVOID),
1122 ('InternalHigh', ctypes.wintypes.LPVOID),
1123 ('Offset', ctypes.wintypes.DWORD),
1124 ('OffsetHigh', ctypes.wintypes.DWORD),
1125 ('hEvent', ctypes.wintypes.HANDLE),
1128 kernel32 = ctypes.windll.kernel32
1129 LockFileEx = kernel32.LockFileEx
1130 LockFileEx.argtypes = [
1131 ctypes.wintypes.HANDLE, # hFile
1132 ctypes.wintypes.DWORD, # dwFlags
1133 ctypes.wintypes.DWORD, # dwReserved
1134 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
1135 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
1136 ctypes.POINTER(OVERLAPPED) # Overlapped
1138 LockFileEx.restype = ctypes.wintypes.BOOL
1139 UnlockFileEx = kernel32.UnlockFileEx
1140 UnlockFileEx.argtypes = [
1141 ctypes.wintypes.HANDLE, # hFile
1142 ctypes.wintypes.DWORD, # dwReserved
1143 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
1144 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
1145 ctypes.POINTER(OVERLAPPED) # Overlapped
1147 UnlockFileEx.restype = ctypes.wintypes.BOOL
1148 whole_low = 0xffffffff
1149 whole_high = 0x7fffffff
1151 def _lock_file(f, exclusive):
1152 overlapped = OVERLAPPED()
1153 overlapped.Offset = 0
1154 overlapped.OffsetHigh = 0
1155 overlapped.hEvent = 0
1156 f._lock_file_overlapped_p = ctypes.pointer(overlapped)
1157 handle = msvcrt.get_osfhandle(f.fileno())
1158 if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
1159 whole_low, whole_high, f._lock_file_overlapped_p):
1160 raise OSError('Locking file failed: %r' % ctypes.FormatError())
1162 def _unlock_file(f):
1163 assert f._lock_file_overlapped_p
1164 handle = msvcrt.get_osfhandle(f.fileno())
1165 if not UnlockFileEx(handle, 0,
1166 whole_low, whole_high, f._lock_file_overlapped_p):
1167 raise OSError('Unlocking file failed: %r' % ctypes.FormatError())
1172 def _lock_file(f, exclusive):
1173 fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)
1175 def _unlock_file(f):
1176 fcntl.flock(f, fcntl.LOCK_UN)
1179 class locked_file(object):
1180 def __init__(self, filename, mode, encoding=None):
1181 assert mode in ['r', 'a', 'w']
1182 self.f = io.open(filename, mode, encoding=encoding)
1185 def __enter__(self):
1186 exclusive = self.mode != 'r'
1188 _lock_file(self.f, exclusive)
1194 def __exit__(self, etype, value, traceback):
1196 _unlock_file(self.f)
1203 def write(self, *args):
1204 return self.f.write(*args)
1206 def read(self, *args):
1207 return self.f.read(*args)
1210 def shell_quote(args):
1212 encoding = sys.getfilesystemencoding()
1213 if encoding is None:
1216 if isinstance(a, bytes):
1217 # We may get a filename encoded with 'encodeFilename'
1218 a = a.decode(encoding)
1219 quoted_args.append(pipes.quote(a))
1220 return u' '.join(quoted_args)
1223 def takewhile_inclusive(pred, seq):
1224 """ Like itertools.takewhile, but include the latest evaluated element
1225 (the first element so that Not pred(e)) """
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use. """
    # The extra data travels JSON-encoded inside the URL fragment, which
    # servers never see; unsmuggle_url reverses this.
    payload = {u'__youtubedl_smuggle': json.dumps(data)}
    fragment = compat_urllib_parse.urlencode(payload)
    return u'%s#%s' % (url, fragment)
1240 def unsmuggle_url(smug_url, default=None):
1241 if not '#__youtubedl_smuggle' in smug_url:
1242 return smug_url, default
1243 url, _, sdata = smug_url.rpartition(u'#')
1244 jsond = compat_parse_qs(sdata)[u'__youtubedl_smuggle'][0]
1245 data = json.loads(jsond)
1249 def format_bytes(bytes):
1252 if type(bytes) is str:
1253 bytes = float(bytes)
1257 exponent = int(math.log(bytes, 1024.0))
1258 suffix = [u'B', u'KiB', u'MiB', u'GiB', u'TiB', u'PiB', u'EiB', u'ZiB', u'YiB'][exponent]
1259 converted = float(bytes) / float(1024 ** exponent)
1260 return u'%.2f%s' % (converted, suffix)
1263 def get_term_width():
1264 columns = os.environ.get('COLUMNS', None)
1269 sp = subprocess.Popen(
1271 stdout=subprocess.PIPE, stderr=subprocess.PIPE)
1272 out, err = sp.communicate()
1273 return int(out.split()[1])
def month_by_name(name):
    """ Return the number of a month by (locale-independently) English name """
    ENGLISH_NAMES = [
        u'January', u'February', u'March', u'April', u'May', u'June',
        u'July', u'August', u'September', u'October', u'November', u'December']
    try:
        # index() is 0-based; months are conventionally 1-based.
        return ENGLISH_NAMES.index(name) + 1
    except ValueError:
        # Unknown month name: signal "no match" instead of raising.
        return None
1291 def fix_xml_ampersands(xml_str):
1292 """Replace all the '&' by '&' in XML"""
1294 r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
def setproctitle(title):
    """Best-effort: set the process title via libc prctl(PR_SET_NAME).

    Silently does nothing on platforms without libc.so.6 or without prctl.
    """
    assert isinstance(title, compat_str)
    try:
        libc = ctypes.cdll.LoadLibrary("libc.so.6")
    except OSError:
        return  # Not Linux (e.g. Windows, macOS): nothing to do
    title_bytes = title.encode('utf-8')
    buf = ctypes.create_string_buffer(len(title_bytes))
    buf.value = title_bytes
    try:
        # 15 == PR_SET_NAME
        libc.prctl(15, buf, 0, 0, 0)
    except AttributeError:
        return  # Strange libc, just skip this
def remove_start(s, start):
    """Return s with the prefix 'start' removed (if present)."""
    if s.startswith(start):
        return s[len(start):]
    return s
def remove_end(s, end):
    """Return s with the suffix 'end' removed (if present)."""
    # The 'end' truthiness check avoids s[:-0] == '' for an empty suffix
    if end and s.endswith(end):
        return s[:-len(end)]
    return s
def url_basename(url):
    """Return the last path component of a URL ('' when there is none)."""
    stripped_path = compat_urlparse.urlparse(url).path.strip(u'/')
    # Everything after the last '/'; the whole string when no '/' remains
    return stripped_path.rpartition(u'/')[2]
class HEADRequest(compat_urllib_request.Request):
    """A urllib Request subclass that issues HEAD instead of GET/POST."""
    def get_method(self):
        return "HEAD"
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    """Convert v to int, or return default when v is None.

    get_attr: read the named attribute of v first.
    invscale/scale: multiply then floor-divide the result.
    """
    if get_attr:
        if v is not None:
            v = getattr(v, get_attr, None)
    return default if v is None else (int(v) * invscale // scale)
def str_or_none(v, default=None):
    """Stringify v, returning default when v is None."""
    if v is None:
        return default
    return compat_str(v)
def str_to_int(int_str):
    """ A more relaxed version of int_or_none: strips thousands separators
    (',', '.') and '+' before converting. Returns None for None input. """
    if int_str is None:
        return None
    int_str = re.sub(r'[,\.\+]', u'', int_str)
    return int(int_str)
def float_or_none(v, scale=1, invscale=1, default=None):
    """Convert v to float (scaled by invscale/scale), or default when None."""
    if v is None:
        return default
    return float(v) * invscale / scale
def parse_duration(s):
    """Parse a duration string like '1:30', '2:20:30' or '90 s' into seconds.

    Returns an int (or float when fractional seconds are present), or None
    when s is None or unparsable.
    """
    if s is None:
        return None

    m = re.match(
        r'(?i)(?:(?:(?P<hours>[0-9]+)\s*(?:[:h]|hours?)\s*)?(?P<mins>[0-9]+)\s*(?:[:m]|mins?|minutes?)\s*)?(?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*(?:s|secs?|seconds?)?$', s)
    if m is None:
        return None
    res = int(m.group('secs'))
    if m.group('mins'):
        res += int(m.group('mins')) * 60
        # hours can only appear together with minutes in the pattern
        if m.group('hours'):
            res += int(m.group('hours')) * 60 * 60
    if m.group('ms'):
        res += float(m.group('ms'))
    return res
def prepend_extension(filename, ext):
    """Insert ext before the real extension: ('a.mp4', 'temp') -> 'a.temp.mp4'."""
    base, real_ext = os.path.splitext(filename)
    return u'%s.%s%s' % (base, ext, real_ext)
def check_executable(exe, args=None):
    """ Checks if the given binary is installed somewhere in PATH, and returns its name.
    args can be a list of arguments for a short output (like -version).
    Returns False when the binary cannot be executed. """
    # None sentinel instead of a mutable default list
    if args is None:
        args = []
    try:
        subprocess.Popen([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
    except OSError:
        return False
    return exe
class PagedList(object):
    """Abstract lazily-paged sequence; subclasses implement getslice()."""
    def __len__(self):
        # This is only useful for tests
        return len(self.getslice())
class OnDemandPagedList(PagedList):
    """PagedList that fetches pages on demand via pagefunc(pagenum)."""
    def __init__(self, pagefunc, pagesize):
        self._pagefunc = pagefunc
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        """Return the elements in [start, end) by querying only the pages needed."""
        res = []
        for pagenum in itertools.count(start // self._pagesize):
            firstid = pagenum * self._pagesize
            nextfirstid = pagenum * self._pagesize + self._pagesize
            if start >= nextfirstid:
                continue

            page_results = list(self._pagefunc(pagenum))

            startv = (
                start % self._pagesize
                if firstid <= start < nextfirstid
                else 0)

            endv = (
                ((end - 1) % self._pagesize) + 1
                if (end is not None and firstid <= end <= nextfirstid)
                else None)

            if startv != 0 or endv is not None:
                page_results = page_results[startv:endv]
            res.extend(page_results)

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            if len(page_results) + startv < self._pagesize:
                break

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == nextfirstid:
                break
        return res
class InAdvancePagedList(PagedList):
    """PagedList with a known page count, fetched via pagefunc(pagenum)."""
    def __init__(self, pagefunc, pagecount, pagesize):
        self._pagefunc = pagefunc
        self._pagecount = pagecount
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        """Return the elements in [start, end) across the pre-counted pages."""
        res = []
        start_page = start // self._pagesize
        end_page = (
            self._pagecount if end is None else (end // self._pagesize + 1))
        # Elements to drop from the first fetched page
        skip_elems = start - start_page * self._pagesize
        # Remaining number of elements wanted, or None for "all"
        only_more = None if end is None else end - start
        for pagenum in range(start_page, end_page):
            page = list(self._pagefunc(pagenum))
            if skip_elems:
                page = page[skip_elems:]
                skip_elems = None
            if only_more is not None:
                if len(page) < only_more:
                    only_more -= len(page)
                else:
                    page = page[:only_more]
                    res.extend(page)
                    break
            res.extend(page)
        return res
def uppercase_escape(s):
    """Decode uppercase \\UXXXXXXXX escape sequences found in s."""
    unicode_escape = codecs.getdecoder('unicode_escape')
    return re.sub(
        r'\\U[0-9a-fA-F]{8}',
        lambda m: unicode_escape(m.group(0))[0],
        s)
def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986"""
    # Python 2 quote() needs bytes; the version check must stay first so
    # the 'unicode' name is never evaluated on Python 3.
    if sys.version_info < (3, 0) and isinstance(s, unicode):
        s = s.encode('utf-8')
    safe_chars = "%/;:@&=+$,!~*'()?#[]"
    return compat_urllib_parse.quote(s, safe_chars)
def escape_url(url):
    """Escape URL as suggested by RFC 3986: escape each URL component
    (path, params, query, fragment) and reassemble."""
    url_parsed = compat_urllib_parse_urlparse(url)
    return url_parsed._replace(
        path=escape_rfc3986(url_parsed.path),
        params=escape_rfc3986(url_parsed.params),
        query=escape_rfc3986(url_parsed.query),
        fragment=escape_rfc3986(url_parsed.fragment)
    ).geturl()
try:
    # Probe whether struct accepts a text format string
    struct.pack(u'!I', 0)
except TypeError:
    # In Python 2.6 (and some 2.7 versions), struct requires a bytes argument
    def struct_pack(spec, *args):
        if isinstance(spec, compat_str):
            spec = spec.encode('ascii')
        return struct.pack(spec, *args)

    def struct_unpack(spec, *args):
        if isinstance(spec, compat_str):
            spec = spec.encode('ascii')
        return struct.unpack(spec, *args)
else:
    # struct handles text specs natively: just re-export
    struct_pack = struct.pack
    struct_unpack = struct.unpack
def read_batch_urls(batch_fd):
    """Read a batch file object and return its list of URLs.

    Decodes byte lines, strips a UTF-8 BOM and whitespace, and drops empty
    lines and comments starting with '#', ';' or ']'. Closes batch_fd.
    """
    def fixup(url):
        if not isinstance(url, compat_str):
            url = url.decode('utf-8', 'replace')
        BOM_UTF8 = u'\xef\xbb\xbf'
        if url.startswith(BOM_UTF8):
            url = url[len(BOM_UTF8):]
        url = url.strip()
        if url.startswith(('#', ';', ']')):
            return False
        return url

    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]
def urlencode_postdata(*args, **kargs):
    """URL-encode the arguments and return ASCII bytes for a POST body."""
    encoded = compat_urllib_parse.urlencode(*args, **kargs)
    return encoded.encode('ascii')
try:
    # Element.iter exists on Python 2.7+ / 3.x
    etree_iter = xml.etree.ElementTree.Element.iter
except AttributeError:  # Python <=2.6
    etree_iter = lambda n: n.findall('.//*')
def parse_xml(s):
    """Parse an XML string into an ElementTree element, ignoring doctypes
    and normalizing text nodes to unicode on Python 2."""
    class TreeBuilder(xml.etree.ElementTree.TreeBuilder):
        def doctype(self, name, pubid, system):
            pass  # Ignore doctypes

    parser = xml.etree.ElementTree.XMLParser(target=TreeBuilder())
    kwargs = {'parser': parser} if sys.version_info >= (2, 7) else {}
    tree = xml.etree.ElementTree.XML(s.encode('utf-8'), **kwargs)
    # Fix up XML parser in Python 2.x
    if sys.version_info < (3, 0):
        for n in etree_iter(tree):
            if n.text is not None:
                if not isinstance(n.text, compat_str):
                    n.text = n.text.decode('utf-8')
    return tree
if sys.version_info < (3, 0) and sys.platform == 'win32':
    # Python 2 on Windows: getpass cannot handle unicode prompts,
    # so encode the prompt with the preferred encoding first.
    def compat_getpass(prompt, *args, **kwargs):
        if isinstance(prompt, compat_str):
            prompt = prompt.encode(preferredencoding())
        return getpass.getpass(prompt, *args, **kwargs)
else:
    compat_getpass = getpass.getpass
def parse_age_limit(s):
    """Parse an age limit like '18' or '18+' into an int; fall back to the
    US_RATINGS table for rating strings. Returns None for None input."""
    if s is None:
        return None
    m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
    return int(m.group('age')) if m else US_RATINGS.get(s, None)
def strip_jsonp(code):
    """Strip a JSONP callback wrapper, returning the inner payload."""
    wrapper_re = re.compile(r'(?s)^[a-zA-Z0-9_]+\s*\(\s*(.*)\);?\s*?\s*$')
    return wrapper_re.sub(r'\1', code)
def js_to_json(code):
    """Convert a JavaScript-style object literal into valid JSON text:
    single-quoted strings and bare identifiers become double-quoted,
    and trailing commas before ']' are dropped."""
    def fix_kv(m):
        v = m.group(0)
        # JSON keywords pass through untouched
        if v in ('true', 'false', 'null'):
            return v
        # Already a double-quoted string
        if v.startswith('"'):
            return v
        if v.startswith("'"):
            v = v[1:-1]
            # Re-escape for a double-quoted JSON string
            v = re.sub(r"\\\\|\\'|\"", lambda m: {
                '\\\\': '\\\\',
                "\\'": "'",
                '"': '\\"',
            }[m.group(0)], v)
        return '"%s"' % v

    res = re.sub(r'''(?x)
        "(?:[^"\\]*(?:\\\\|\\")?)*"|
        '(?:[^'\\]*(?:\\\\|\\')?)*'|
        [a-zA-Z_][a-zA-Z_0-9]*
        ''', fix_kv, code)
    res = re.sub(r',(\s*\])', lambda m: m.group(1), res)
    return res
def qualities(quality_ids):
    """ Get a numeric quality value out of a list of possible values.

    Returns a function mapping a quality id to its index in quality_ids,
    or -1 when the id is unknown.
    """
    def q(qid):
        try:
            return quality_ids.index(qid)
        except ValueError:
            return -1
    return q
# Default output filename template: "<title>-<id>.<ext>"
DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
try:
    # Available on Python 2.7+ / 3.x
    subprocess_check_output = subprocess.check_output
except AttributeError:
    # Python 2.6 fallback: minimal reimplementation of check_output
    def subprocess_check_output(*args, **kwargs):
        assert 'input' not in kwargs
        p = subprocess.Popen(*args, stdout=subprocess.PIPE, **kwargs)
        output, _ = p.communicate()
        ret = p.wait()
        if ret:
            raise subprocess.CalledProcessError(ret, p.args, output=output)
        return output
def limit_length(s, length):
    """ Add ellipses to overly long strings; None passes through unchanged. """
    if s is None:
        return None
    ELLIPSES = "..."
    if len(s) > length:
        return s[:length - len(ELLIPSES)] + ELLIPSES
    return s