2 # -*- coding: utf-8 -*-
29 import xml.etree.ElementTree
33 import urllib.request as compat_urllib_request
34 except ImportError: # Python 2
35 import urllib2 as compat_urllib_request
38 import urllib.error as compat_urllib_error
39 except ImportError: # Python 2
40 import urllib2 as compat_urllib_error
43 import urllib.parse as compat_urllib_parse
44 except ImportError: # Python 2
45 import urllib as compat_urllib_parse
48 from urllib.parse import urlparse as compat_urllib_parse_urlparse
49 except ImportError: # Python 2
50 from urlparse import urlparse as compat_urllib_parse_urlparse
53 import urllib.parse as compat_urlparse
54 except ImportError: # Python 2
55 import urlparse as compat_urlparse
58 import http.cookiejar as compat_cookiejar
59 except ImportError: # Python 2
60 import cookielib as compat_cookiejar
63 import html.entities as compat_html_entities
64 except ImportError: # Python 2
65 import htmlentitydefs as compat_html_entities
68 import html.parser as compat_html_parser
69 except ImportError: # Python 2
70 import HTMLParser as compat_html_parser
73 import http.client as compat_http_client
74 except ImportError: # Python 2
75 import httplib as compat_http_client
78 from urllib.error import HTTPError as compat_HTTPError
79 except ImportError: # Python 2
80 from urllib2 import HTTPError as compat_HTTPError
83 from urllib.request import urlretrieve as compat_urlretrieve
84 except ImportError: # Python 2
85 from urllib import urlretrieve as compat_urlretrieve
89 from subprocess import DEVNULL
90 compat_subprocess_get_DEVNULL = lambda: DEVNULL
92 compat_subprocess_get_DEVNULL = lambda: open(os.path.devnull, 'w')
# NOTE(review): compat shim — on Python 3 this imports unquote; the fallback def
# is a Python 2 percent-decoder. This excerpt is a line-sampled, indentation-mangled
# paste (original line numbers jump); interior lines are missing. Code left byte-identical.
95 from urllib.parse import unquote as compat_urllib_parse_unquote
97 def compat_urllib_parse_unquote(string, encoding='utf-8', errors='replace'):
100 res = string.split('%')
107 # pct_sequence: contiguous sequence of percent-encoded bytes, decoded
# Python-2-only idiom: str.decode('hex') does not exist on Python 3.
114 pct_sequence += item[:2].decode('hex')
117 # This segment was just a single percent-encoded character.
118 # May be part of a sequence of code units, so delay decoding.
119 # (Stored in pct_sequence).
123 # Encountered non-percent-encoded characters. Flush the current
125 string += pct_sequence.decode(encoding, errors) + rest
128 # Flush the final pct_sequence
129 string += pct_sequence.decode(encoding, errors)
# NOTE(review): compat shim — backport of CPython 3's parse_qs for Python 2.
# Excerpt is line-sampled and indentation-mangled; several interior lines
# (try:, else:, accumulator initialisations) are missing. Code left byte-identical.
134 from urllib.parse import parse_qs as compat_parse_qs
135 except ImportError: # Python 2
136 # HACK: The following is the correct parse_qs implementation from cpython 3's stdlib.
137 # Python 2's version is apparently totally broken
139 def _parse_qsl(qs, keep_blank_values=False, strict_parsing=False,
140 encoding='utf-8', errors='replace'):
141 qs, _coerce_result = qs, unicode
142 pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')]
144 for name_value in pairs:
145 if not name_value and not strict_parsing:
147 nv = name_value.split('=', 1)
150 raise ValueError("bad query field: %r" % (name_value,))
151 # Handle case of a control-name with no equal sign
152 if keep_blank_values:
156 if len(nv[1]) or keep_blank_values:
157 name = nv[0].replace('+', ' ')
158 name = compat_urllib_parse_unquote(
159 name, encoding=encoding, errors=errors)
160 name = _coerce_result(name)
161 value = nv[1].replace('+', ' ')
162 value = compat_urllib_parse_unquote(
163 value, encoding=encoding, errors=errors)
164 value = _coerce_result(value)
165 r.append((name, value))
# Wrapper groups (name, value) pairs into a dict of lists, like stdlib parse_qs.
168 def compat_parse_qs(qs, keep_blank_values=False, strict_parsing=False,
169 encoding='utf-8', errors='replace'):
171 pairs = _parse_qsl(qs, keep_blank_values, strict_parsing,
172 encoding=encoding, errors=errors)
173 for name, value in pairs:
174 if name in parsed_result:
175 parsed_result[name].append(value)
177 parsed_result[name] = [value]
# NOTE(review): Python 2/3 compat aliases (compat_str/compat_chr), a shlex.quote
# fallback for Python < 3.3, and the default HTTP request headers dict.
# Excerpt is line-sampled; the try:/else: scaffolding and the dict's opening
# line (presumably `std_headers = {` — TODO confirm) are missing. Byte-identical.
181 compat_str = unicode # Python 2
186 compat_chr = unichr # Python 2
191 from xml.etree.ElementTree import ParseError as compat_xml_parse_error
192 except ImportError: # Python 2.6
193 from xml.parsers.expat import ExpatError as compat_xml_parse_error
196 from shlex import quote as shlex_quote
197 except ImportError: # Python < 3.3
# POSIX shell single-quote escaping: close quote, emit '"'"', reopen quote.
199 return "'" + s.replace("'", "'\"'\"'") + "'"
203 if type(c) is int: return c
206 # This is not clearly defined otherwise
207 compiled_regex_type = type(re.compile(''))
210 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0 (Chrome)',
211 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
212 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
213 'Accept-Encoding': 'gzip, deflate',
214 'Accept-Language': 'en-us,en;q=0.5',
# NOTE(review): preferredencoding() wraps locale.getpreferredencoding(); the
# visible tail lines belong to a Python-2 print shim and an assert from another
# helper (def lines missing in this excerpt). Code left byte-identical.
217 def preferredencoding():
218 """Get preferred encoding.
220 Returns the best encoding scheme for the system, based on
221 locale.getpreferredencoding() and some further tweaks.
224 pref = locale.getpreferredencoding()
# Presumably guards a Python-2-only compat_print definition — TODO confirm.
231 if sys.version_info < (3,0):
233 print(s.encode(preferredencoding(), 'xmlcharrefreplace'))
236 assert type(s) == type(u'')
# NOTE(review): atomic JSON write — serialise into a NamedTemporaryFile in the
# same directory, then os.rename over the target so readers never see a partial
# file. Excerpt is line-sampled; the json.dump call and cleanup are missing.
240 def write_json_file(obj, fn):
241 """ Encode obj as JSON and write it to fn, atomically """
# Temp file placed next to fn so the final rename stays on one filesystem.
245 'prefix': os.path.basename(fn) + '.',
246 'dir': os.path.dirname(fn),
250 # In Python 2.x, json.dump expects a bytestream.
251 # In Python 3.x, it writes to a character stream
252 if sys.version_info < (3, 0):
260 tf = tempfile.NamedTemporaryFile(**args)
265 os.rename(tf.name, fn)
# NOTE(review): two find_xpath_attr variants selected by Python version — 2.7+
# can use an attribute predicate in the XPath expression; 2.6 falls back to a
# manual scan. Excerpt is line-sampled (the `else:` and `return f` lines are
# missing). Code left byte-identical.
274 if sys.version_info >= (2, 7):
275 def find_xpath_attr(node, xpath, key, val):
276 """ Find the xpath xpath[@key=val] """
# Asserts restrict key/val to characters safe to splice into the expression.
277 assert re.match(r'^[a-zA-Z-]+$', key)
278 assert re.match(r'^[a-zA-Z0-9@\s:._-]*$', val)
279 expr = xpath + u"[@%s='%s']" % (key, val)
280 return node.find(expr)
282 def find_xpath_attr(node, xpath, key, val):
283 for f in node.findall(xpath):
284 if f.attrib.get(key) == val:
288 # On python2.6 the xml.etree.ElementTree.Element methods don't support
289 # the namespace parameter
290 def xpath_with_ns(path, ns_map):
# Expands `prefix:tag` path components to `{uri}tag` using ns_map.
291 components = [c.split(':') for c in path.split('/')]
295 replaced.append(c[0])
298 replaced.append('{%s}%s' % (ns_map[ns], tag))
299 return '/'.join(replaced)
# NOTE(review): HTMLParser backports — a fixed locatestarttagend regex, a
# BaseHTMLParser helper, and AttrParser which isolates the element carrying a
# given attribute=value pair by tracking tag depth and positions. Excerpt is
# line-sampled and indentation-mangled; many interior lines missing. Byte-identical.
302 compat_html_parser.locatestarttagend = re.compile(r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re.VERBOSE) # backport bugfix
303 class BaseHTMLParser(compat_html_parser.HTMLParser):
305 compat_html_parser.HTMLParser.__init__(self)
308 def loads(self, html):
313 class AttrParser(BaseHTMLParser):
314 """Modified HTMLParser that isolates a tag with the specified attribute"""
315 def __init__(self, attribute, value):
316 self.attribute = attribute
321 self.watch_startpos = False
323 BaseHTMLParser.__init__(self)
# error(): tolerate up to 10 parse errors by skipping the offending line.
325 def error(self, message):
326 if self.error_count > 10 or self.started:
327 raise compat_html_parser.HTMLParseError(message, self.getpos())
328 self.rawdata = '\n'.join(self.html.split('\n')[self.getpos()[0]:]) # skip one line
329 self.error_count += 1
332 def handle_starttag(self, tag, attrs):
335 self.find_startpos(None)
336 if self.attribute in attrs and attrs[self.attribute] == self.value:
339 self.watch_startpos = True
341 if not tag in self.depth: self.depth[tag] = 0
344 def handle_endtag(self, tag):
346 if tag in self.depth: self.depth[tag] -= 1
347 if self.depth[self.result[0]] == 0:
349 self.result.append(self.getpos())
351 def find_startpos(self, x):
352 """Needed to put the start position of the result (self.result[1])
353 after the opening tag with the requested id"""
354 if self.watch_startpos:
355 self.watch_startpos = False
356 self.result.append(self.getpos())
# Every non-tag token triggers find_startpos so the recorded position lands
# just after the opening tag.
357 handle_entityref = handle_charref = handle_data = handle_comment = \
358 handle_decl = handle_pi = unknown_decl = find_startpos
360 def get_result(self):
361 if self.result is None:
363 if len(self.result) != 3:
365 lines = self.html.split('\n')
366 lines = lines[self.result[1][0]-1:self.result[2][0]]
367 lines[0] = lines[0][self.result[1][1]:]
369 lines[-1] = lines[-1][:self.result[2][1]-self.result[1][1]]
370 lines[-1] = lines[-1][:self.result[2][1]]
371 return '\n'.join(lines).strip()
372 # Hack for https://github.com/rg3/youtube-dl/issues/662
373 if sys.version_info < (2, 7, 3):
374 AttrParser.parse_endtag = (lambda self, i:
375 i + len("</scr'+'ipt>")
376 if self.rawdata[i:].startswith("</scr'+'ipt>")
377 else compat_html_parser.HTMLParser.parse_endtag(self, i))
def get_element_by_id(id, html):
    """Return the content of the element whose id attribute equals *id* in *html*."""
    target_attribute = "id"
    return get_element_by_attribute(target_attribute, id, html)
# NOTE(review): HTML extraction helpers built on AttrParser/MetaParser, plus
# clean_html which flattens markup into readable text. Excerpt is line-sampled
# (try:/except bodies and several returns missing). Code left byte-identical.
383 def get_element_by_attribute(attribute, value, html):
384 """Return the content of the tag with the specified attribute in the passed HTML document"""
385 parser = AttrParser(attribute, value)
# Parse errors are tolerated; partial results may still be returned below.
388 except compat_html_parser.HTMLParseError:
390 return parser.get_result()
392 class MetaParser(BaseHTMLParser):
394 Modified HTMLParser that isolates a meta tag with the specified name
397 def __init__(self, name):
398 BaseHTMLParser.__init__(self)
403 def handle_starttag(self, tag, attrs):
407 if attrs.get('name') == self.name:
408 self.result = attrs.get('content')
410 def get_result(self):
413 def get_meta_content(name, html):
415 Return the content attribute from the meta tag with the given name attribute.
417 parser = MetaParser(name)
420 except compat_html_parser.HTMLParseError:
422 return parser.get_result()
425 def clean_html(html):
426 """Clean an HTML snippet into a readable string"""
# Newlines are folded first; <br> and </p><p> become real line breaks,
# remaining tags are stripped, then entities are decoded.
428 html = html.replace('\n', ' ')
429 html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
430 html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
432 html = re.sub('<.*?>', '', html)
433 # Replace html entities
434 html = unescapeHTML(html)
# NOTE(review): sanitize_open retries a failing open() with Windows-forbidden
# characters replaced; timeconvert parses RFC 2822 dates. Excerpt is
# line-sampled (try:, '-' stdout branch, raises and returns missing). Byte-identical.
438 def sanitize_open(filename, open_mode):
439 """Try to open the given filename, and slightly tweak it if this fails.
441 Attempts to open the given filename. If this fails, it tries to change
442 the filename slightly, step by step, until it's either able to open it
443 or it fails and raises a final exception, like the standard open()
446 It returns the tuple (stream, definitive_file_name).
# Special-case: writing to stdout on win32 needs binary mode.
450 if sys.platform == 'win32':
452 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
453 return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
454 stream = open(encodeFilename(filename), open_mode)
455 return (stream, filename)
456 except (IOError, OSError) as err:
# Permission errors are not retried — the rewrite only helps with bad chars.
457 if err.errno in (errno.EACCES,):
460 # In case of error, try to remove win32 forbidden chars
461 alt_filename = os.path.join(
462 re.sub(u'[/<>:"\\|\\\\?\\*]', u'#', path_part)
463 for path_part in os.path.split(filename)
465 if alt_filename == filename:
468 # An exception here should be caught in the caller
469 stream = open(encodeFilename(filename), open_mode)
470 return (stream, alt_filename)
473 def timeconvert(timestr):
474 """Convert RFC 2822 defined time string into system timestamp"""
476 timetuple = email.utils.parsedate_tz(timestr)
477 if timetuple is not None:
478 timestamp = email.utils.mktime_tz(timetuple)
# NOTE(review): character-by-character filename sanitiser; `restricted` narrows
# the allowed set, `is_id` (per docstring) preserves IDs. Excerpt is
# line-sampled (several elif branches and the final return are missing).
481 def sanitize_filename(s, restricted=False, is_id=False):
482 """Sanitizes a string so it could be used as part of a filename.
483 If restricted is set, use a stricter subset of allowed characters.
484 Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
486 def replace_insane(char):
# Control characters and '?' are always dropped/replaced.
487 if char == '?' or ord(char) < 32 or ord(char) == 127:
490 return '' if restricted else '\''
492 return '_-' if restricted else ' -'
493 elif char in '\\/|*<>':
495 if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
497 if restricted and ord(char) > 127:
501 result = u''.join(map(replace_insane, s))
# Collapse runs of '_' produced by the substitutions above.
503 while '__' in result:
504 result = result.replace('__', '_')
505 result = result.strip('_')
506 # Common case of "Foreign band name - English song title"
507 if restricted and result.startswith('-_'):
def orderedSet(iterable):
    """ Remove all duplicates from the input iterable """
    # Reconstructed body: the excerpt only preserved the signature and
    # docstring.  Keeps the first occurrence of each element, preserving
    # order, which is the contract the docstring and the name imply.
    res = []
    for el in iterable:
        if el not in res:
            res.append(el)
    return res
# NOTE(review): entity decoding — named entities via htmlentitydefs, numeric
# (decimal/hex) via int(); unescapeHTML maps every &...; through it. Excerpt is
# line-sampled (the `if mobj:` guard, base selection and unescapeHTML's def
# line are missing). Code left byte-identical.
522 def _htmlentity_transform(entity):
523 """Transforms an HTML entity to a character."""
524 # Known non-numeric HTML entity
525 if entity in compat_html_entities.name2codepoint:
526 return compat_chr(compat_html_entities.name2codepoint[entity])
528 mobj = re.match(r'#(x?[0-9]+)', entity)
530 numstr = mobj.group(1)
# 'x…' becomes '0x…' so int(numstr, 16) parses it as hex.
531 if numstr.startswith(u'x'):
533 numstr = u'0%s' % numstr
536 return compat_chr(int(numstr, base))
538 # Unknown entity in name, return its literal representation
539 return (u'&%s;' % entity)
# Belongs to unescapeHTML (def line missing from this excerpt).
545 assert type(s) == compat_str
548 r'&([^;]+);', lambda m: _htmlentity_transform(m.group(1)), s)
# NOTE(review): encode a unicode filename for OS/subprocess use — pass-through
# on Python 3 and modern Windows, otherwise encode with locale or filesystem
# encoding. Excerpt is line-sampled (returns and else: lines missing).
551 def encodeFilename(s, for_subprocess=False):
553 @param s The name of the file
556 assert type(s) == compat_str
558 # Python 3 has a Unicode API
559 if sys.version_info >= (3, 0):
562 if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
563 # Pass u'' directly to use Unicode APIs on Windows 2000 and up
564 # (Detecting Windows NT 4 is tricky because 'major >= 4' would
565 # match Windows 9x series as well. Besides, NT 4 is obsolete.)
566 if not for_subprocess:
569 # For subprocess calls, encode with locale encoding
570 # Refer to http://stackoverflow.com/a/9951851/35070
571 encoding = preferredencoding()
573 encoding = sys.getfilesystemencoding()
576 return s.encode(encoding, 'ignore')
def encodeArgument(s):
    """Encode a subprocess argument like a filename (for_subprocess=True)."""
    if isinstance(s, compat_str):
        return encodeFilename(s, True)
    # Legacy code that uses byte strings
    # Uncomment the following line after fixing all post processors
    #assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
    return encodeFilename(s.decode('ascii'), True)
# NOTE(review): normalise a command-line option value to unicode. Excerpt is
# line-sampled (the None guard and final return are missing). Byte-identical.
588 def decodeOption(optval):
591 if isinstance(optval, bytes):
592 optval = optval.decode(preferredencoding())
594 assert isinstance(optval, compat_str)
def formatSeconds(secs):
    """Format a duration in seconds as 'H:MM:SS', 'M:SS' or 'S'.

    Reconstructed: the excerpt preserved only the two format-string lines;
    the branch conditions follow directly from which fields each format uses.
    """
    if secs > 3600:
        return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
    elif secs > 60:
        return '%d:%02d' % (secs // 60, secs % 60)
    else:
        return '%d' % secs
# NOTE(review): build an HTTPSHandler appropriate for the running Python —
# a custom connection class before 3.2, create_default_context on 3.4+, and a
# manual SSLContext otherwise. Excerpt is line-sampled (try:/except and else:
# lines missing). Code left byte-identical. The SSLv3/no-verify options are
# deliberate compatibility trade-offs, not oversights — do not "fix" blindly.
606 def make_HTTPS_handler(opts_no_check_certificate, **kwargs):
607 if sys.version_info < (3, 2):
610 class HTTPSConnectionV3(httplib.HTTPSConnection):
611 def __init__(self, *args, **kwargs):
612 httplib.HTTPSConnection.__init__(self, *args, **kwargs)
615 sock = socket.create_connection((self.host, self.port), self.timeout)
616 if getattr(self, '_tunnel_host', False):
# Falls back from TLSv1 to SSLv23 — presumably inside a try/except (lines missing).
620 self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_TLSv1)
622 self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_SSLv23)
624 class HTTPSHandlerV3(compat_urllib_request.HTTPSHandler):
625 def https_open(self, req):
626 return self.do_open(HTTPSConnectionV3, req)
627 return HTTPSHandlerV3(**kwargs)
628 elif hasattr(ssl, 'create_default_context'): # Python >= 3.4
629 context = ssl.create_default_context(ssl.Purpose.CLIENT_AUTH)
630 context.options &= ~ssl.OP_NO_SSLv3 # Allow older, not-as-secure SSLv3
631 if opts_no_check_certificate:
632 context.verify_mode = ssl.CERT_NONE
633 return compat_urllib_request.HTTPSHandler(context=context, **kwargs)
635 context = ssl.SSLContext(ssl.PROTOCOL_SSLv23)
636 context.verify_mode = (ssl.CERT_NONE
637 if opts_no_check_certificate
638 else ssl.CERT_REQUIRED)
639 context.set_default_verify_paths()
641 context.load_default_certs()
642 except AttributeError:
644 return compat_urllib_request.HTTPSHandler(context=context, **kwargs)
# NOTE(review): the exception hierarchy. Excerpt is line-sampled and
# indentation-mangled; docstring terminators, `pass` bodies and several
# attribute assignments are missing. Code left byte-identical.
646 class ExtractorError(Exception):
647 """Error during info extraction."""
648 def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
649 """ tb, if given, is the original traceback (so that it can be printed out).
650 If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
# Network errors are treated as "expected" (user-environment, not a bug).
653 if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
655 if video_id is not None:
656 msg = video_id + ': ' + msg
658 msg = msg + u'; please report this issue on https://yt-dl.org/bug . Be sure to call youtube-dl with the --verbose flag and include its complete output. Make sure you are using the latest version; type youtube-dl -U to update.'
659 super(ExtractorError, self).__init__(msg)
662 self.exc_info = sys.exc_info() # preserve original exception
664 self.video_id = video_id
666 def format_traceback(self):
667 if self.traceback is None:
669 return u''.join(traceback.format_tb(self.traceback))
672 class RegexNotFoundError(ExtractorError):
673 """Error when a regex didn't match"""
677 class DownloadError(Exception):
678 """Download Error exception.
680 This exception may be thrown by FileDownloader objects if they are not
681 configured to continue on errors. They will contain the appropriate
684 def __init__(self, msg, exc_info=None):
685 """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
686 super(DownloadError, self).__init__(msg)
687 self.exc_info = exc_info
690 class SameFileError(Exception):
691 """Same File exception.
693 This exception will be thrown by FileDownloader objects if they detect
694 multiple files would have to be downloaded to the same file on disk.
699 class PostProcessingError(Exception):
700 """Post Processing exception.
702 This exception may be raised by PostProcessor's .run() method to
703 indicate an error in the postprocessing task.
705 def __init__(self, msg):
708 class MaxDownloadsReached(Exception):
709 """ --max-downloads limit has been reached. """
713 class UnavailableVideoError(Exception):
714 """Unavailable Format exception.
716 This exception will be thrown when a video is requested
717 in a format that is not available for that video.
722 class ContentTooShortError(Exception):
723 """Content Too Short exception.
725 This exception may be raised by FileDownloader objects when a file they
726 download is too small for what the server announced first, indicating
727 the connection was probably interrupted.
733 def __init__(self, downloaded, expected):
734 self.downloaded = downloaded
735 self.expected = expected
# NOTE(review): urllib HTTP handler adding default headers and transparent
# gzip/deflate decoding. Excerpt is line-sampled (decorators, try:/except and
# several assignments missing). Code left byte-identical.
737 class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
738 """Handler for HTTP requests and responses.
740 This class, when installed with an OpenerDirector, automatically adds
741 the standard headers to every HTTP request and handles gzipped and
742 deflated responses from web servers. If compression is to be avoided in
743 a particular request, the original request in the program code only has
744 to include the HTTP header "Youtubedl-No-Compression", which will be
745 removed before making the real request.
747 Part of this code was copied from:
749 http://techknack.net/python-urllib2-handlers/
751 Andrew Rowls, the author of that code, agreed to release it to the
# Raw-deflate fallback: try without zlib header first, then with.
758 return zlib.decompress(data, -zlib.MAX_WBITS)
760 return zlib.decompress(data)
763 def addinfourl_wrapper(stream, headers, url, code):
764 if hasattr(compat_urllib_request.addinfourl, 'getcode'):
765 return compat_urllib_request.addinfourl(stream, headers, url, code)
766 ret = compat_urllib_request.addinfourl(stream, headers, url)
770 def http_request(self, req):
# Fill in std_headers without overriding caller-supplied ones.
771 for h, v in std_headers.items():
772 if h not in req.headers:
774 if 'Youtubedl-no-compression' in req.headers:
775 if 'Accept-encoding' in req.headers:
776 del req.headers['Accept-encoding']
777 del req.headers['Youtubedl-no-compression']
778 if 'Youtubedl-user-agent' in req.headers:
779 if 'User-agent' in req.headers:
780 del req.headers['User-agent']
781 req.headers['User-agent'] = req.headers['Youtubedl-user-agent']
782 del req.headers['Youtubedl-user-agent']
785 def http_response(self, req, resp):
788 if resp.headers.get('Content-encoding', '') == 'gzip':
789 content = resp.read()
790 gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
792 uncompressed = io.BytesIO(gz.read())
793 except IOError as original_ioerror:
794 # There may be junk add the end of the file
795 # See http://stackoverflow.com/q/4928560/35070 for details
# Retry with up to 1023 trailing bytes stripped before giving up.
796 for i in range(1, 1024):
798 gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
799 uncompressed = io.BytesIO(gz.read())
804 raise original_ioerror
805 resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
806 resp.msg = old_resp.msg
808 if resp.headers.get('Content-encoding', '') == 'deflate':
809 gz = io.BytesIO(self.deflate(resp.read()))
810 resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
811 resp.msg = old_resp.msg
# HTTPS requests/responses share the HTTP handlers.
814 https_request = http_request
815 https_response = http_response
# NOTE(review): ISO-8601 → UNIX timestamp, honouring a trailing 'Z' or ±HH:MM
# offset. Excerpt is line-sampled (the re.search call and None guards are
# missing). Code left byte-identical.
818 def parse_iso8601(date_str, delimiter='T'):
819 """ Return a UNIX timestamp from the given date """
825 r'Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$',
828 timezone = datetime.timedelta()
# Strip the matched timezone suffix before strptime (which has no %z pre-3.2).
830 date_str = date_str[:-len(m.group(0))]
831 if not m.group('sign'):
832 timezone = datetime.timedelta()
834 sign = 1 if m.group('sign') == '+' else -1
835 timezone = datetime.timedelta(
836 hours=sign * int(m.group('hours')),
837 minutes=sign * int(m.group('minutes')))
838 date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
839 dt = datetime.datetime.strptime(date_str, date_format) - timezone
840 return calendar.timegm(dt.timetuple())
# NOTE(review): best-effort date normalisation to YYYYMMDD by trying a list of
# strptime formats, then email.utils as a last resort; determine_ext guesses a
# file extension from a URL. Excerpt is line-sampled (many format strings,
# try:/except and returns are missing). Code left byte-identical.
843 def unified_strdate(date_str):
844 """Return a string with the date in the format YYYYMMDD"""
851 date_str = date_str.replace(',', ' ')
852 # %z (UTC offset) is only supported in python>=3.2
853 date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str)
854 format_expressions = [
859 '%b %dst %Y %I:%M%p',
860 '%b %dnd %Y %I:%M%p',
861 '%b %dth %Y %I:%M%p',
871 '%Y-%m-%dT%H:%M:%SZ',
872 '%Y-%m-%dT%H:%M:%S.%fZ',
873 '%Y-%m-%dT%H:%M:%S.%f0Z',
875 '%Y-%m-%dT%H:%M:%S.%f',
878 for expression in format_expressions:
880 upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
# Fallback: RFC 2822 parsing when none of the explicit formats matched.
883 if upload_date is None:
884 timetuple = email.utils.parsedate_tz(date_str)
886 upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
889 def determine_ext(url, default_ext=u'unknown_video'):
# Extension candidate: text after the last '.' of the query-stripped URL.
892 guess = url.partition(u'?')[0].rpartition(u'.')[2]
893 if re.match(r'^[A-Za-z0-9]+$', guess):
def subtitles_filename(filename, sub_lang, sub_format):
    """Derive the subtitle file name: <base>.<language>.<format>."""
    base = filename.rsplit('.', 1)[0]
    return u'.'.join((base, sub_lang, sub_format))
# NOTE(review): parse 'YYYYMMDD' or relative 'now+3days'-style strings into a
# date. Excerpt is line-sampled (the 'return today' branch and the month/year
# → day/week conversions are missing). Code left byte-identical.
901 def date_from_str(date_str):
903 Return a datetime object from a string in the format YYYYMMDD or
904 (now|today)[+-][0-9](day|week|month|year)(s)?"""
905 today = datetime.date.today()
# Original has a missing space: 'now'or — harmless to the parser but worth
# normalising when the real file is edited.
906 if date_str == 'now'or date_str == 'today':
908 match = re.match('(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
909 if match is not None:
910 sign = match.group('sign')
911 time = int(match.group('time'))
914 unit = match.group('unit')
923 delta = datetime.timedelta(**{unit: time})
925 return datetime.datetime.strptime(date_str, "%Y%m%d").date()
def hyphenate_date(date_str):
    """
    Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format.

    Non-matching input is returned unchanged (the fall-through branch was
    missing from the excerpt and has been reconstructed).
    """
    match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
    if match is not None:
        return '-'.join(match.groups())
    else:
        return date_str
# NOTE(review): inclusive date interval with open ends defaulting to
# datetime.min/max. Excerpt is line-sampled (else: lines, the `day` classmethod
# body and __str__'s def line are missing). Code left byte-identical.
936 class DateRange(object):
937 """Represents a time interval between two dates"""
938 def __init__(self, start=None, end=None):
939 """start and end must be strings in the format accepted by date"""
940 if start is not None:
941 self.start = date_from_str(start)
943 self.start = datetime.datetime.min.date()
945 self.end = date_from_str(end)
947 self.end = datetime.datetime.max.date()
948 if self.start > self.end:
949 raise ValueError('Date range: "%s" , the start date must be before the end date' % self)
952 """Returns a range that only contains the given day"""
954 def __contains__(self, date):
955 """Check if the date is in the range"""
# Strings are coerced through date_from_str before comparison.
956 if not isinstance(date, datetime.date):
957 date = date_from_str(date)
958 return self.start <= date <= self.end
960 return '%s - %s' % ( self.start.isoformat(), self.end.isoformat())
# NOTE(review): tail of platform_name() (its def line is missing from this
# excerpt) followed by _windows_write_string, which writes Unicode to a Windows
# console via WriteConsoleW, chunked at non-BMP characters. Excerpt is
# line-sampled (returns and several guards missing). Code left byte-identical.
964 """ Returns the platform name as a compat_str """
965 res = platform.platform()
966 if isinstance(res, bytes):
967 res = res.decode(preferredencoding())
969 assert isinstance(res, compat_str)
973 def _windows_write_string(s, out):
974 """ Returns True if the string was written using special methods,
975 False if it has yet to be written out."""
976 # Adapted from http://stackoverflow.com/a/3259271/35070
979 import ctypes.wintypes
987 fileno = out.fileno()
988 except AttributeError:
989 # If the output stream doesn't have a fileno, it's virtual
991 if fileno not in WIN_OUTPUT_IDS:
994 GetStdHandle = ctypes.WINFUNCTYPE(
995 ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
996 ("GetStdHandle", ctypes.windll.kernel32))
997 h = GetStdHandle(WIN_OUTPUT_IDS[fileno])
999 WriteConsoleW = ctypes.WINFUNCTYPE(
1000 ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
1001 ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
1002 ctypes.wintypes.LPVOID)(("WriteConsoleW", ctypes.windll.kernel32))
1003 written = ctypes.wintypes.DWORD(0)
1005 GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)(("GetFileType", ctypes.windll.kernel32))
1006 FILE_TYPE_CHAR = 0x0002
1007 FILE_TYPE_REMOTE = 0x8000
1008 GetConsoleMode = ctypes.WINFUNCTYPE(
1009 ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
1010 ctypes.POINTER(ctypes.wintypes.DWORD))(
1011 ("GetConsoleMode", ctypes.windll.kernel32))
1012 INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value
# A handle is "not a console" if it is invalid, not a char device, or
# GetConsoleMode fails — then the caller must fall back to normal writes.
1014 def not_a_console(handle):
1015 if handle == INVALID_HANDLE_VALUE or handle is None:
1017 return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR
1018 or GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)
1020 if not_a_console(h):
# WriteConsoleW can misbehave on non-BMP chars; write up to the first one.
1023 def next_nonbmp_pos(s):
1025 return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
1026 except StopIteration:
1030 count = min(next_nonbmp_pos(s), 1024)
1032 ret = WriteConsoleW(
1033 h, s, count if count else 2, ctypes.byref(written), None)
1035 raise OSError('Failed to write string')
1036 if not count: # We just wrote a non-BMP character
1037 assert written.value == 2
1040 assert written.value > 0
1041 s = s[written.value:]
# NOTE(review): encoding-safe write to a stream (Windows console fast path,
# byte streams, text streams with .buffer), plus byte/int-list converters.
# Excerpt is line-sampled (out defaulting, out.write calls and the Python 3
# branches are missing). Code left byte-identical.
1045 def write_string(s, out=None, encoding=None):
1048 assert type(s) == compat_str
1050 if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
1051 if _windows_write_string(s, out):
1054 if ('b' in getattr(out, 'mode', '') or
1055 sys.version_info[0] < 3): # Python 2 lies about mode of sys.stderr
1056 byt = s.encode(encoding or preferredencoding(), 'ignore')
1058 elif hasattr(out, 'buffer'):
1059 enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
1060 byt = s.encode(enc, 'ignore')
1061 out.buffer.write(byt)
1067 def bytes_to_intlist(bs):
# Python 3 bytes iterate as ints; Python 2 str iterates as 1-char strings.
1070 if isinstance(bs[0], int): # Python 3
1073 return [ord(c) for c in bs]
1076 def intlist_to_bytes(xs):
1079 if isinstance(chr(0), bytes): # Python 2
1080 return ''.join([chr(x) for x in xs])
# NOTE(review): cross-platform advisory file locking — LockFileEx/UnlockFileEx
# via ctypes on Windows, fcntl.flock elsewhere. Excerpt is line-sampled
# (_fields_ assignment, else: and `import fcntl` lines are missing).
# Code left byte-identical.
1085 # Cross-platform file locking
1086 if sys.platform == 'win32':
1087 import ctypes.wintypes
1090 class OVERLAPPED(ctypes.Structure):
1092 ('Internal', ctypes.wintypes.LPVOID),
1093 ('InternalHigh', ctypes.wintypes.LPVOID),
1094 ('Offset', ctypes.wintypes.DWORD),
1095 ('OffsetHigh', ctypes.wintypes.DWORD),
1096 ('hEvent', ctypes.wintypes.HANDLE),
1099 kernel32 = ctypes.windll.kernel32
1100 LockFileEx = kernel32.LockFileEx
1101 LockFileEx.argtypes = [
1102 ctypes.wintypes.HANDLE, # hFile
1103 ctypes.wintypes.DWORD, # dwFlags
1104 ctypes.wintypes.DWORD, # dwReserved
1105 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
1106 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
1107 ctypes.POINTER(OVERLAPPED) # Overlapped
1109 LockFileEx.restype = ctypes.wintypes.BOOL
1110 UnlockFileEx = kernel32.UnlockFileEx
1111 UnlockFileEx.argtypes = [
1112 ctypes.wintypes.HANDLE, # hFile
1113 ctypes.wintypes.DWORD, # dwReserved
1114 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
1115 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
1116 ctypes.POINTER(OVERLAPPED) # Overlapped
1118 UnlockFileEx.restype = ctypes.wintypes.BOOL
# Lock the whole file: low/high dword of the byte count.
1119 whole_low = 0xffffffff
1120 whole_high = 0x7fffffff
1122 def _lock_file(f, exclusive):
1123 overlapped = OVERLAPPED()
1124 overlapped.Offset = 0
1125 overlapped.OffsetHigh = 0
1126 overlapped.hEvent = 0
# Pointer kept alive on the file object for the matching unlock call.
1127 f._lock_file_overlapped_p = ctypes.pointer(overlapped)
1128 handle = msvcrt.get_osfhandle(f.fileno())
1129 if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
1130 whole_low, whole_high, f._lock_file_overlapped_p):
1131 raise OSError('Locking file failed: %r' % ctypes.FormatError())
1133 def _unlock_file(f):
1134 assert f._lock_file_overlapped_p
1135 handle = msvcrt.get_osfhandle(f.fileno())
1136 if not UnlockFileEx(handle, 0,
1137 whole_low, whole_high, f._lock_file_overlapped_p):
1138 raise OSError('Unlocking file failed: %r' % ctypes.FormatError())
# POSIX branch (the else:/import fcntl lines are missing from the excerpt).
1143 def _lock_file(f, exclusive):
1144 fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)
1146 def _unlock_file(f):
1147 fcntl.flock(f, fcntl.LOCK_UN)
# NOTE(review): context-manager wrapper pairing io.open with _lock_file /
# _unlock_file; 'r' takes a shared lock, 'a'/'w' exclusive. Excerpt is
# line-sampled (self.mode assignment, try/except cleanup and close() missing).
1150 class locked_file(object):
1151 def __init__(self, filename, mode, encoding=None):
1152 assert mode in ['r', 'a', 'w']
1153 self.f = io.open(filename, mode, encoding=encoding)
1156 def __enter__(self):
1157 exclusive = self.mode != 'r'
1159 _lock_file(self.f, exclusive)
1165 def __exit__(self, etype, value, traceback):
1167 _unlock_file(self.f)
1174 def write(self, *args):
1175 return self.f.write(*args)
1177 def read(self, *args):
1178 return self.f.read(*args)
# NOTE(review): shell_quote joins pipes.quote()-escaped args, decoding any
# byte-string filenames first; takewhile_inclusive is a generator variant of
# itertools.takewhile that also yields the first failing element. Excerpt is
# line-sampled (accumulator init, for-loop header and yield lines missing).
1181 def shell_quote(args):
1183 encoding = sys.getfilesystemencoding()
1184 if encoding is None:
1187 if isinstance(a, bytes):
1188 # We may get a filename encoded with 'encodeFilename'
1189 a = a.decode(encoding)
1190 quoted_args.append(pipes.quote(a))
1191 return u' '.join(quoted_args)
1194 def takewhile_inclusive(pred, seq):
1195 """ Like itertools.takewhile, but include the latest evaluated element
1196 (the first element so that Not pred(e)) """
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use. """
    payload = {u'__youtubedl_smuggle': json.dumps(data)}
    fragment = compat_urllib_parse.urlencode(payload)
    return u'#'.join((url, fragment))
# NOTE(review): inverse of smuggle_url — split on '#', parse the query-encoded
# fragment and json-decode the payload. The final `return url, data` line is
# missing from this excerpt. Code left byte-identical.
1211 def unsmuggle_url(smug_url, default=None):
1212 if not '#__youtubedl_smuggle' in smug_url:
1213 return smug_url, default
1214 url, _, sdata = smug_url.rpartition(u'#')
1215 jsond = compat_parse_qs(sdata)[u'__youtubedl_smuggle'][0]
1216 data = json.loads(jsond)
# NOTE(review): human-readable byte formatting, terminal-width detection via
# stty, and English month-name lookup. Excerpt is line-sampled (None/zero
# guards, try:/except and the stty command list are missing). Byte-identical.
1220 def format_bytes(bytes):
1223 if type(bytes) is str:
1224 bytes = float(bytes)
# Exponent selects the 1024-based unit below.
1228 exponent = int(math.log(bytes, 1024.0))
1229 suffix = [u'B', u'KiB', u'MiB', u'GiB', u'TiB', u'PiB', u'EiB', u'ZiB', u'YiB'][exponent]
1230 converted = float(bytes) / float(1024 ** exponent)
1231 return u'%.2f%s' % (converted, suffix)
1234 def get_term_width():
1235 columns = os.environ.get('COLUMNS', None)
# Fallback: ask stty; output is '<rows> <cols>' so index 1 is the width.
1240 sp = subprocess.Popen(
1242 stdout=subprocess.PIPE, stderr=subprocess.PIPE)
1243 out, err = sp.communicate()
1244 return int(out.split()[1])
1250 def month_by_name(name):
1251 """ Return the number of a month by (locale-independently) English name """
1254 u'January', u'February', u'March', u'April', u'May', u'June',
1255 u'July', u'August', u'September', u'October', u'November', u'December']
1257 return ENGLISH_NAMES.index(name) + 1
# NOTE(review): fix_xml_ampersands escapes bare '&' (not already part of an
# entity) for XML parsers; setproctitle sets the process name via libc prctl.
# Excerpt is line-sampled (the re.sub replacement/return and the try: lines
# are missing). Code left byte-identical.
1262 def fix_xml_ampersands(xml_str):
1263 """Replace all the '&' by '&amp;' in XML"""
# Negative lookahead keeps existing named/numeric entities untouched.
1265 r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
1270 def setproctitle(title):
1271 assert isinstance(title, compat_str)
1273 libc = ctypes.cdll.LoadLibrary("libc.so.6")
1276 title_bytes = title.encode('utf-8')
1277 buf = ctypes.create_string_buffer(len(title_bytes))
1278 buf.value = title_bytes
# 15 == PR_SET_NAME
1280 libc.prctl(15, buf, 0, 0, 0)
1281 except AttributeError:
1282 return # Strange libc, just skip this
def remove_start(s, start):
    """Return *s* without the prefix *start*; unchanged if absent.

    The fall-through return was missing from the excerpt (without it the
    function would return None for non-matching input) — reconstructed.
    """
    if s.startswith(start):
        return s[len(start):]
    return s
def remove_end(s, end):
    """Return *s* without the suffix *end*; unchanged if absent.

    The excerpt preserved only the def line and the slicing return; the
    endswith guard and fall-through return have been reconstructed (mirrors
    remove_start above).
    """
    if s.endswith(end):
        return s[:-len(end)]
    return s
def url_basename(url):
    """Return the final path component of *url* (query/fragment excluded)."""
    parsed_path = compat_urlparse.urlparse(url).path
    segments = parsed_path.strip(u'/').split(u'/')
    return segments[-1]
# NOTE(review): HEADRequest forces the HEAD method (its return line is missing
# from this excerpt — presumably `return "HEAD"`); int_or_none safely converts
# to int with optional attribute lookup and scaling. Excerpt is line-sampled
# (the get_attr/empty-string guards are missing). Code left byte-identical.
1302 class HEADRequest(compat_urllib_request.Request):
1303 def get_method(self):
1307 def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
1310 v = getattr(v, get_attr, None)
# Integer division keeps the result an int after scaling.
1313 return default if v is None else (int(v) * invscale // scale)
def str_or_none(v, default=None):
    """Coerce v to compat_str, or return `default` when v is None."""
    if v is None:
        return default
    return compat_str(v)
def str_to_int(int_str):
    """ A more relaxed version of int_or_none """
    if int_str is None:
        return None
    # Drop thousands separators / plus signs before parsing
    int_str = re.sub(r'[,\.\+]', u'', int_str)
    return int(int_str)
def float_or_none(v, scale=1, invscale=1, default=None):
    """Convert v to float scaled by invscale/scale; `default` when v is None."""
    if v is None:
        return default
    return float(v) * invscale / scale
def parse_duration(s):
    """Parse a duration string like '1:20', '90s' or '1h 2m 3s' into seconds.

    Returns an int (or float when fractional seconds are present), or
    None when s is None or unparsable.
    """
    if s is None:
        return None

    m = re.match(
        r'(?i)(?:(?:(?P<hours>[0-9]+)\s*(?:[:h]|hours?)\s*)?(?P<mins>[0-9]+)\s*(?:[:m]|mins?|minutes?)\s*)?(?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*(?:s|secs?|seconds?)?$', s)
    if m is None:
        return None
    res = int(m.group('secs'))
    if m.group('mins'):
        res += int(m.group('mins')) * 60
        # hours can only appear together with minutes in this grammar
        if m.group('hours'):
            res += int(m.group('hours')) * 60 * 60
    if m.group('ms'):
        res += float(m.group('ms'))
    return res
def prepend_extension(filename, ext):
    """Insert `ext` before the real extension: ('a.mp4', 'temp') -> 'a.temp.mp4'."""
    base, real_ext = os.path.splitext(filename)
    return u'{0}.{1}{2}'.format(base, ext, real_ext)
def check_executable(exe, args=[]):
    """ Checks if the given binary is installed somewhere in PATH, and returns its name.
    args can be a list of arguments for a short output (like -version) """
    # NOTE: the mutable default is safe here because args is never mutated.
    try:
        subprocess.Popen([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
    except OSError:
        # Binary not found / not executable
        return False
    return exe
class PagedList(object):
    """A lazily-evaluated, sliceable sequence backed by a page-fetching function.

    `pagefunc(pagenum)` must return an iterable of the items on that page;
    every page except possibly the last is expected to hold `pagesize` items.
    """
    def __init__(self, pagefunc, pagesize):
        self._pagefunc = pagefunc
        self._pagesize = pagesize

    def __len__(self):
        # This is only useful for tests
        return len(self.getslice())

    def getslice(self, start=0, end=None):
        """Return items [start:end) as a list, fetching only the pages needed."""
        res = []
        for pagenum in itertools.count(start // self._pagesize):
            firstid = pagenum * self._pagesize
            nextfirstid = pagenum * self._pagesize + self._pagesize
            if start >= nextfirstid:
                continue

            page_results = list(self._pagefunc(pagenum))

            # Index of the first wanted item within this page
            startv = (
                start % self._pagesize
                if firstid <= start < nextfirstid
                else 0)
            # One past the last wanted item within this page (None = all)
            endv = (
                ((end - 1) % self._pagesize) + 1
                if (end is not None and firstid <= end <= nextfirstid)
                else None)
            if startv != 0 or endv is not None:
                page_results = page_results[startv:endv]
            res.extend(page_results)

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            if len(page_results) + startv < self._pagesize:
                break

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == nextfirstid:
                break
        return res
def uppercase_escape(s):
    """Decode \\UXXXXXXXX escape sequences embedded in s into real characters."""
    unicode_escape = codecs.getdecoder('unicode_escape')
    return re.sub(
        r'\\U[0-9a-fA-F]{8}',
        lambda m: unicode_escape(m.group(0))[0],
        s)
def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986"""
    safe_chars = "%/;:@&=+$,!~*'()?#[]"
    # Python 2's quote() needs a byte string; encode unicode input first.
    if sys.version_info < (3, 0) and isinstance(s, unicode):
        s = s.encode('utf-8')
    return compat_urllib_parse.quote(s, safe_chars)
def escape_url(url):
    """Escape URL as suggested by RFC 3986"""
    url_parsed = compat_urllib_parse_urlparse(url)
    # Escape each component separately so the URL's structure
    # (scheme, netloc, delimiters) is preserved.
    return url_parsed._replace(
        path=escape_rfc3986(url_parsed.path),
        params=escape_rfc3986(url_parsed.params),
        query=escape_rfc3986(url_parsed.query),
        fragment=escape_rfc3986(url_parsed.fragment)
    ).geturl()
try:
    # Probe whether struct accepts unicode format strings
    struct.pack(u'!I', 0)
except TypeError:
    # In Python 2.6 (and some 2.7 versions), struct requires a bytes argument
    def struct_pack(spec, *args):
        if isinstance(spec, compat_str):
            spec = spec.encode('ascii')
        return struct.pack(spec, *args)

    def struct_unpack(spec, *args):
        if isinstance(spec, compat_str):
            spec = spec.encode('ascii')
        return struct.unpack(spec, *args)
else:
    # Modern struct handles text format strings directly
    struct_pack = struct.pack
    struct_unpack = struct.unpack
def read_batch_urls(batch_fd):
    """Read URLs from a file-like object, one per line.

    Strips a UTF-8 BOM and surrounding whitespace, skips blank lines and
    comment lines starting with '#', ';' or ']'.  Closes batch_fd when done.
    """
    def fixup(url):
        if not isinstance(url, compat_str):
            url = url.decode('utf-8', 'replace')
        BOM_UTF8 = u'\xef\xbb\xbf'
        if url.startswith(BOM_UTF8):
            url = url[len(BOM_UTF8):]
        url = url.strip()
        if url.startswith(('#', ';', ']')):
            return False
        return url

    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]
def urlencode_postdata(*args, **kargs):
    """URL-encode POST data and return it as ASCII bytes for a request body."""
    encoded = compat_urllib_parse.urlencode(*args, **kargs)
    return encoded.encode('ascii')
try:
    # Element.iter exists from Python 2.7 / ElementTree 1.3 onwards
    etree_iter = xml.etree.ElementTree.Element.iter
except AttributeError:  # Python <=2.6
    etree_iter = lambda n: n.findall('.//*')
def parse_xml(s):
    """Parse an XML string into an ElementTree element, ignoring doctypes.

    On Python 2, text nodes are additionally decoded to unicode.
    """
    class TreeBuilder(xml.etree.ElementTree.TreeBuilder):
        def doctype(self, name, pubid, system):
            pass  # Ignore doctypes

    parser = xml.etree.ElementTree.XMLParser(target=TreeBuilder())
    # The `parser` keyword only works from Python 2.7 on
    kwargs = {'parser': parser} if sys.version_info >= (2, 7) else {}
    tree = xml.etree.ElementTree.XML(s.encode('utf-8'), **kwargs)
    # Fix up XML parser in Python 2.x
    if sys.version_info < (3, 0):
        for n in etree_iter(tree):
            if n.text is not None:
                if not isinstance(n.text, compat_str):
                    n.text = n.text.decode('utf-8')
    return tree
# On Python 2 under Windows, getpass.getpass cannot handle unicode prompts;
# encode the prompt to the system's preferred encoding first.
if sys.version_info < (3, 0) and sys.platform == 'win32':
    def compat_getpass(prompt, *args, **kwargs):
        if isinstance(prompt, compat_str):
            prompt = prompt.encode(preferredencoding())
        return getpass.getpass(prompt, *args, **kwargs)
else:
    compat_getpass = getpass.getpass
def strip_jsonp(code):
    """Remove a JSONP callback wrapper, leaving only the JSON payload."""
    wrapper_re = r'(?s)^[a-zA-Z0-9_]+\s*\(\s*(.*)\);?\s*?\s*$'
    return re.sub(wrapper_re, r'\1', code)
def js_to_json(code):
    """Convert simple JavaScript object literals into valid JSON.

    Quotes bare and single-quoted keys/values with double quotes and
    removes trailing commas before ']'.
    """
    def fix_kv(m):
        key = m.group(2)
        if key.startswith("'"):
            assert key.endswith("'")
            assert '"' not in key
            key = '"%s"' % key[1:-1]
        elif not key.startswith('"'):
            key = '"%s"' % key

        value = m.group(4)
        if value.startswith("'"):
            assert value.endswith("'")
            assert '"' not in value
            value = '"%s"' % value[1:-1]

        return m.group(1) + key + m.group(3) + value

    res = re.sub(r'''(?x)
            ([{,]\s*)
            ("[^"]*"|\'[^\']*\'|[a-z0-9A-Z]+)
            (:\s*)
            ([0-9.]+|true|false|"[^"]*"|\'[^\']*\'|\[|\{)
        ''', fix_kv, code)
    # Drop trailing commas before a closing bracket
    res = re.sub(r',(\s*\])', lambda m: m.group(1), res)
    return res
def qualities(quality_ids):
    """ Get a numeric quality value out of a list of possible values """
    def q(qid):
        try:
            return quality_ids.index(qid)
        except ValueError:
            # Unknown quality sorts below every known one
            return -1
    return q
# Default output filename template: "<title>-<id>.<ext>" (percent-style
# placeholders filled from the video's info dict).
DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
1563 subprocess_check_output = subprocess.check_output
1564 except AttributeError:
1565 def subprocess_check_output(*args, **kwargs):
1566 assert 'input' not in kwargs
1567 p = subprocess.Popen(*args, stdout=subprocess.PIPE, **kwargs)
1568 output, _ = p.communicate()
1571 raise subprocess.CalledProcessError(ret, p.args, output=output)