2 # -*- coding: utf-8 -*-
4 from __future__ import unicode_literals
30 import xml.etree.ElementTree
42 compat_urllib_parse_urlparse,
43 compat_urllib_request,
# Type of a compiled regular expression, for isinstance() checks
# (this is not clearly defined/exposed otherwise on older Pythons).
compiled_regex_type = type(re.compile(''))
    # Browser-like default request headers sent with every HTTP request.
    # NOTE(review): the opening "std_headers = {" line appears elided in this view.
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0 (Chrome)',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
# Determine a usable text encoding for the current system.
# NOTE(review): the docstring terminator and the fallback/return logic
# appear elided in this view.
def preferredencoding():
    """Get preferred encoding.
    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks.
    pref = locale.getpreferredencoding()
# NOTE(review): several lines (the args dict header, the open-mode
# selection and the error-cleanup path) appear elided in this view.
def write_json_file(obj, fn):
    """ Encode obj as JSON and write it to fn, atomically """
        # Temp file is created next to fn so os.rename stays on one filesystem.
        'prefix': os.path.basename(fn) + '.',
        'dir': os.path.dirname(fn),
    # In Python 2.x, json.dump expects a bytestream.
    # In Python 3.x, it writes to a character stream
    if sys.version_info < (3, 0):
    tf = tempfile.NamedTemporaryFile(**args)
    # Atomic replace: rename the fully-written temp file onto the target.
    os.rename(tf.name, fn)
# Provide find_xpath_attr: on Python >= 2.7 a real xpath predicate is used;
# the second definition below is the Python 2.6 fallback (manual scan).
# NOTE(review): the "else:" introducing the 2.6 branch and its return
# statements appear elided in this view.
if sys.version_info >= (2, 7):
    def find_xpath_attr(node, xpath, key, val):
        """ Find the xpath xpath[@key=val] """
        # Restrict key/val to safe charsets so the interpolated predicate
        # below cannot break out of the xpath expression.
        assert re.match(r'^[a-zA-Z-]+$', key)
        assert re.match(r'^[a-zA-Z0-9@\s:._-]*$', val)
        expr = xpath + u"[@%s='%s']" % (key, val)
        return node.find(expr)
    def find_xpath_attr(node, xpath, key, val):
        # Here comes the crazy part: In 2.6, if the xpath is a unicode,
        # .//node does not match if a node is a direct child of . !
        if isinstance(xpath, unicode):
            xpath = xpath.encode('ascii')
        for f in node.findall(xpath):
            if f.attrib.get(key) == val:
# On python2.6 the xml.etree.ElementTree.Element methods don't support
# the namespace parameter
# NOTE(review): the loop header over components and its non-namespaced
# branch appear elided in this view.
def xpath_with_ns(path, ns_map):
    # Each path component is an optional "ns:tag" pair.
    components = [c.split(':') for c in path.split('/')]
            replaced.append(c[0])
            # Expand the prefix via ns_map into Clark notation: {uri}tag.
            replaced.append('{%s}%s' % (ns_map[ns], tag))
    return '/'.join(replaced)
# Return the text of the first element matching xpath; raise when fatal.
# NOTE(review): the element lookup and the non-fatal (return None) path
# appear elided in this view.
def xpath_text(node, xpath, name=None, fatal=False):
    if sys.version_info < (2, 7): # Crazy 2.6
        xpath = xpath.encode('ascii')
            name = xpath if name is None else name
            raise ExtractorError('Could not find XML element %s' % name)
def get_element_by_id(id, html):
    """Return the inner content of the HTML tag carrying the given id.

    Thin wrapper around get_element_by_attribute() with attribute 'id'.
    """
    return get_element_by_attribute('id', id, html)
# Find a tag whose `attribute` equals `value` and return its unescaped
# inner content.
# NOTE(review): parts of the verbose regex (tag-name/content groups) and
# of the quote-stripping logic appear elided in this view.
def get_element_by_attribute(attribute, value, html):
    """Return the content of the tag with the specified attribute in the passed HTML document"""
    m = re.search(r'''(?xs)
        (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
        (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
    ''' % (re.escape(attribute), re.escape(value)), html)
    res = m.group('content')
    if res.startswith('"') or res.startswith("'"):
    return unescapeHTML(res)
# Reduce an HTML snippet to readable plain text.
# NOTE(review): the surrounding newline normalisation and the final
# strip/return appear elided in this view.
def clean_html(html):
    """Clean an HTML snippet into a readable string"""
    html = html.replace('\n', ' ')
    html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)  # <br> -> newline
    html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)  # </p><p> -> newline
    # Strip any remaining tags wholesale.
    html = re.sub('<.*?>', '', html)
    # Replace html entities
    html = unescapeHTML(html)
# NOTE(review): several lines (the try header, the '-'/stdout special case
# wrapper, and the raise statements) appear elided in this view.
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.
    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    It returns the tuple (stream, definitive_file_name).
        if sys.platform == 'win32':
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
            return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
        stream = open(encodeFilename(filename), open_mode)
        return (stream, filename)
    except (IOError, OSError) as err:
        if err.errno in (errno.EACCES,):
        # In case of error, try to remove win32 forbidden chars
        alt_filename = os.path.join(
                        re.sub(u'[/<>:"\\|\\\\?\\*]', u'#', path_part)
                        for path_part in os.path.split(filename)
        if alt_filename == filename:
            # An exception here should be caught in the caller
            stream = open(encodeFilename(filename), open_mode)
            return (stream, alt_filename)
# Parse an RFC 2822 date string into a Unix timestamp.
# NOTE(review): the `timestamp` initialisation and the final return appear
# elided in this view.
def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp"""
    timetuple = email.utils.parsedate_tz(timestr)
    if timetuple is not None:
        timestamp = email.utils.mktime_tz(timetuple)
# NOTE(review): several lines (docstring terminator, elif branches of the
# inner helper, and the final return) appear elided in this view.
def sanitize_filename(s, restricted=False, is_id=False):
    """Sanitizes a string so it could be used as part of a filename.
    If restricted is set, use a stricter subset of allowed characters.
    Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
    def replace_insane(char):
        # Control characters and '?' are always dropped.
        if char == '?' or ord(char) < 32 or ord(char) == 127:
            return '' if restricted else '\''
            return '_-' if restricted else ' -'
        elif char in '\\/|*<>':
        if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
        if restricted and ord(char) > 127:
    result = u''.join(map(replace_insane, s))
        # Collapse runs of replacement underscores (skipped for IDs).
        while '__' in result:
            result = result.replace('__', '_')
        result = result.strip('_')
        # Common case of "Foreign band name - English song title"
        if restricted and result.startswith('-_'):
# NOTE(review): the body of this helper appears elided in this view.
def orderedSet(iterable):
    """ Remove all duplicates from the input iterable """
# NOTE(review): the numeric-entity branch header and the hex/decimal base
# selection appear elided in this view.
def _htmlentity_transform(entity):
    """Transforms an HTML entity to a character."""
    # Known non-numeric HTML entity
    if entity in compat_html_entities.name2codepoint:
        return compat_chr(compat_html_entities.name2codepoint[entity])
    # Numeric character references: "#160" or hex "#x202e".
    mobj = re.match(r'#(x?[0-9]+)', entity)
        numstr = mobj.group(1)
        if numstr.startswith(u'x'):
            # Prefix '0' so int(..., 16) accepts the "x.." form.
            numstr = u'0%s' % numstr
        return compat_chr(int(numstr, base))
    # Unknown entity in name, return its literal representation
    return (u'&%s;' % entity)
    # NOTE(review): the enclosing "def unescapeHTML(s):" and the
    # "return re.sub(" lines appear elided in this view.
    assert type(s) == compat_str
        r'&([^;]+);', lambda m: _htmlentity_transform(m.group(1)), s)
# Encode a file name for the current platform/Python version.
# NOTE(review): the docstring terminator, the Python 3 early return and the
# else/None-encoding fallbacks appear elided in this view.
def encodeFilename(s, for_subprocess=False):
    @param s The name of the file
    assert type(s) == compat_str
    # Python 3 has a Unicode API
    if sys.version_info >= (3, 0):
    if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
        # Pass u'' directly to use Unicode APIs on Windows 2000 and up
        # (Detecting Windows NT 4 is tricky because 'major >= 4' would
        # match Windows 9x series as well. Besides, NT 4 is obsolete.)
        if not for_subprocess:
            # For subprocess calls, encode with locale encoding
            # Refer to http://stackoverflow.com/a/9951851/35070
            encoding = preferredencoding()
        encoding = sys.getfilesystemencoding()
    return s.encode(encoding, 'ignore')
def encodeArgument(s):
    """Encode a subprocess argument using the same rules as file names."""
    if not isinstance(s, compat_str):
        # Legacy code that uses byte strings
        # Uncomment the assertion below after fixing all post processors:
        # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
        s = s.decode('ascii')
    return encodeFilename(s, True)
# Decode a CLI option value to text using the system's preferred encoding.
# NOTE(review): the None guard and the final return appear elided in this
# view.
def decodeOption(optval):
    if isinstance(optval, bytes):
        optval = optval.decode(preferredencoding())
    assert isinstance(optval, compat_str)
# Format a duration in seconds as H:MM:SS, or M:SS below one hour.
# NOTE(review): the branch conditions selecting between the two formats
# appear elided in this view.
def formatSeconds(secs):
        return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
        return '%d:%02d' % (secs // 60, secs % 60)
# Build an HTTPSHandler suited to the running Python/ssl version,
# optionally disabling certificate verification.
# NOTE(review): several lines (connect() header, tunnel setup, try/except
# around wrap_socket, and the final pre-3.4 branch header) appear elided
# in this view.
def make_HTTPS_handler(opts_no_check_certificate, **kwargs):
    if sys.version_info < (3, 2):
        class HTTPSConnectionV3(httplib.HTTPSConnection):
            def __init__(self, *args, **kwargs):
                httplib.HTTPSConnection.__init__(self, *args, **kwargs)
                sock = socket.create_connection((self.host, self.port), self.timeout)
                if getattr(self, '_tunnel_host', False):
                    # Prefer TLSv1; fall back to SSLv23 autonegotiation.
                    self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_TLSv1)
                    self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_SSLv23)
        class HTTPSHandlerV3(compat_urllib_request.HTTPSHandler):
            def https_open(self, req):
                return self.do_open(HTTPSConnectionV3, req)
        return HTTPSHandlerV3(**kwargs)
    elif hasattr(ssl, 'create_default_context'): # Python >= 3.4
        context = ssl.create_default_context(ssl.Purpose.CLIENT_AUTH)
        context.options &= ~ssl.OP_NO_SSLv3 # Allow older, not-as-secure SSLv3
        if opts_no_check_certificate:
            context.verify_mode = ssl.CERT_NONE
        return compat_urllib_request.HTTPSHandler(context=context, **kwargs)
        context = ssl.SSLContext(ssl.PROTOCOL_SSLv23)
        context.verify_mode = (ssl.CERT_NONE
                               if opts_no_check_certificate
                               else ssl.CERT_REQUIRED)
        context.set_default_verify_paths()
            context.load_default_certs()
        except AttributeError:
        return compat_urllib_request.HTTPSHandler(context=context, **kwargs)
# NOTE(review): several lines (the `expected` assignment, the `if cause:` /
# `if not expected:` headers, and self.traceback/self.cause assignments)
# appear elided in this view.
class ExtractorError(Exception):
    """Error during info extraction."""
    def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
        if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
        if video_id is not None:
            msg = video_id + ': ' + msg
            msg += u' (caused by %r)' % cause
            msg = msg + u'; please report this issue on https://yt-dl.org/bug . Be sure to call youtube-dl with the --verbose flag and include its complete output. Make sure you are using the latest version; type  youtube-dl -U  to update.'
        super(ExtractorError, self).__init__(msg)
        self.exc_info = sys.exc_info() # preserve original exception
        self.video_id = video_id
    def format_traceback(self):
        # Render the stored traceback as text, if one was provided.
        if self.traceback is None:
        return u''.join(traceback.format_tb(self.traceback))
# Raised by extractors when a mandatory regular expression fails to match.
class RegexNotFoundError(ExtractorError):
    """Error when a regex didn't match"""
# NOTE(review): part of the class docstring appears elided in this view.
class DownloadError(Exception):
    """Download Error exception.
    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        super(DownloadError, self).__init__(msg)
        self.exc_info = exc_info
# NOTE(review): the docstring terminator (and any body) appears elided in
# this view.
class SameFileError(Exception):
    """Same File exception.
    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
# NOTE(review): the docstring terminator and the __init__ body appear
# elided in this view.
class PostProcessingError(Exception):
    """Post Processing exception.
    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    def __init__(self, msg):
# Control-flow exception: stops downloading once --max-downloads is hit.
class MaxDownloadsReached(Exception):
    """ --max-downloads limit has been reached. """
# NOTE(review): the docstring terminator (and any body) appears elided in
# this view.
class UnavailableVideoError(Exception):
    """Unavailable Format exception.
    This exception will be thrown when a video is requested
    in a format that is not available for that video.
# NOTE(review): the docstring terminator (attribute descriptions) appears
# elided in this view.
class ContentTooShortError(Exception):
    """Content Too Short exception.
    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    def __init__(self, downloaded, expected):
        # Number of bytes actually received vs. announced by the server.
        self.downloaded = downloaded
        self.expected = expected
# NOTE(review): many lines (the deflate() / addinfourl_wrapper headers and
# try/else scaffolding, gzip retry loop internals, and the final returns)
# appear elided in this view.
class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.
    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-No-Compression", which will be
    removed before making the real request.
    Part of this code was copied from:
    http://techknack.net/python-urllib2-handlers/
    Andrew Rowls, the author of that code, agreed to release it to the
            # Raw deflate stream first; plain zlib stream as fallback.
            return zlib.decompress(data, -zlib.MAX_WBITS)
            return zlib.decompress(data)
    def addinfourl_wrapper(stream, headers, url, code):
        # Older urllibs lack getcode(); emulate it on the returned object.
        if hasattr(compat_urllib_request.addinfourl, 'getcode'):
            return compat_urllib_request.addinfourl(stream, headers, url, code)
        ret = compat_urllib_request.addinfourl(stream, headers, url)
    def http_request(self, req):
        # Add the default headers without clobbering caller-set ones.
        for h, v in std_headers.items():
            if h not in req.headers:
        if 'Youtubedl-no-compression' in req.headers:
            if 'Accept-encoding' in req.headers:
                del req.headers['Accept-encoding']
            del req.headers['Youtubedl-no-compression']
        if 'Youtubedl-user-agent' in req.headers:
            if 'User-agent' in req.headers:
                del req.headers['User-agent']
            req.headers['User-agent'] = req.headers['Youtubedl-user-agent']
            del req.headers['Youtubedl-user-agent']
        if sys.version_info < (2, 7) and '#' in req.get_full_url():
            # Python 2.6 is brain-dead when it comes to fragments
            req._Request__original = req._Request__original.partition('#')[0]
            req._Request__r_type = req._Request__r_type.partition('#')[0]
    def http_response(self, req, resp):
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
                uncompressed = io.BytesIO(gz.read())
            except IOError as original_ioerror:
                # There may be junk add the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                for i in range(1, 1024):
                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
                        uncompressed = io.BytesIO(gz.read())
                    raise original_ioerror
            resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = io.BytesIO(self.deflate(resp.read()))
            resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
    # HTTPS requests/responses get the exact same treatment.
    https_request = http_request
    https_response = http_response
# NOTE(review): the None guard, the re.search( header and the if/else
# scaffolding around the timezone handling appear elided in this view.
def parse_iso8601(date_str, delimiter='T'):
    """ Return a UNIX timestamp from the given date """
        r'(\.[0-9]+)?(?:Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
        # No recognised timezone suffix: treat as UTC.
        timezone = datetime.timedelta()
        date_str = date_str[:-len(m.group(0))]
        if not m.group('sign'):
            timezone = datetime.timedelta()
            sign = 1 if m.group('sign') == '+' else -1
            timezone = datetime.timedelta(
                hours=sign * int(m.group('hours')),
                minutes=sign * int(m.group('minutes')))
    date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
    dt = datetime.datetime.strptime(date_str, date_format) - timezone
    return calendar.timegm(dt.timetuple())
# NOTE(review): the upload_date initialisation, several format entries, the
# try/except around strptime, and the final return appear elided in this
# view.
def unified_strdate(date_str):
    """Return a string with the date in the format YYYYMMDD"""
    date_str = date_str.replace(',', ' ')
    # %z (UTC offset) is only supported in python>=3.2
    date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str)
    format_expressions = [
        '%b %dst %Y %I:%M%p',
        '%b %dnd %Y %I:%M%p',
        '%b %dth %Y %I:%M%p',
        '%Y-%m-%d %H:%M:%S.%f',
        '%Y-%m-%dT%H:%M:%SZ',
        '%Y-%m-%dT%H:%M:%S.%fZ',
        '%Y-%m-%dT%H:%M:%S.%f0Z',
        '%Y-%m-%dT%H:%M:%S.%f',
    for expression in format_expressions:
            upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
    # Last resort: RFC 2822 parsing.
    if upload_date is None:
        timetuple = email.utils.parsedate_tz(date_str)
            upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
# Guess a file extension from a URL, falling back to default_ext.
# NOTE(review): the None guard and the return statements appear elided in
# this view.
def determine_ext(url, default_ext=u'unknown_video'):
    guess = url.partition(u'?')[0].rpartition(u'.')[2]
    if re.match(r'^[A-Za-z0-9]+$', guess):
def subtitles_filename(filename, sub_lang, sub_format):
    """Derive a subtitle file name from the media file name.

    Strips the media extension and appends the language code and the
    subtitle format, e.g. ('video.mp4', 'en', 'srt') -> 'video.en.srt'.
    """
    base = filename.rsplit('.', 1)[0]
    return u'.'.join((base, sub_lang, sub_format))
# NOTE(review): the docstring opener, the `return today` branch, the unit
# pluralisation handling and surrounding scaffolding appear elided in this
# view.
def date_from_str(date_str):
    Return a datetime object from a string in the format YYYYMMDD or
    (now|today)[+-][0-9](day|week|month|year)(s)?"""
    today = datetime.date.today()
    if date_str == 'now'or date_str == 'today':
    # Relative form: e.g. "now+2weeks".
    match = re.match('(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
    if match is not None:
        sign = match.group('sign')
        time = int(match.group('time'))
        unit = match.group('unit')
        delta = datetime.timedelta(**{unit: time})
    return datetime.datetime.strptime(date_str, "%Y%m%d").date()
# NOTE(review): the docstring opener and the fallback return for
# non-matching input appear elided in this view.
def hyphenate_date(date_str):
    Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
    match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
    if match is not None:
        return '-'.join(match.groups())
# NOTE(review): the else: branches of __init__, the day() classmethod
# header and the __str__ header appear elided in this view.
class DateRange(object):
    """Represents a time interval between two dates"""
    def __init__(self, start=None, end=None):
        """start and end must be strings in the format accepted by date"""
        if start is not None:
            self.start = date_from_str(start)
            # Open-ended on the left: accept any date from the epoch of time.
            self.start = datetime.datetime.min.date()
            self.end = date_from_str(end)
            # Open-ended on the right.
            self.end = datetime.datetime.max.date()
        if self.start > self.end:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)
        """Returns a range that only contains the given day"""
    def __contains__(self, date):
        """Check if the date is in the range"""
        if not isinstance(date, datetime.date):
            date = date_from_str(date)
        return self.start <= date <= self.end
        return '%s - %s' % ( self.start.isoformat(), self.end.isoformat())
732 """ Returns the platform name as a compat_str """
733 res = platform.platform()
734 if isinstance(res, bytes):
735 res = res.decode(preferredencoding())
737 assert isinstance(res, compat_str)
# NOTE(review): many scaffolding lines (WIN_OUTPUT_IDS definition, try
# headers, return True/False statements, the while loop header and the
# WriteConsoleW call header) appear elided in this view.
def _windows_write_string(s, out):
    """ Returns True if the string was written using special methods,
    False if it has yet to be written out."""
    # Adapted from http://stackoverflow.com/a/3259271/35070
    import ctypes.wintypes
        fileno = out.fileno()
    except AttributeError:
        # If the output stream doesn't have a fileno, it's virtual
    if fileno not in WIN_OUTPUT_IDS:
    GetStdHandle = ctypes.WINFUNCTYPE(
        ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
        ("GetStdHandle", ctypes.windll.kernel32))
    h = GetStdHandle(WIN_OUTPUT_IDS[fileno])
    WriteConsoleW = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
        ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
        ctypes.wintypes.LPVOID)(("WriteConsoleW", ctypes.windll.kernel32))
    written = ctypes.wintypes.DWORD(0)
    GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)(("GetFileType", ctypes.windll.kernel32))
    FILE_TYPE_CHAR = 0x0002
    FILE_TYPE_REMOTE = 0x8000
    GetConsoleMode = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
        ctypes.POINTER(ctypes.wintypes.DWORD))(
        ("GetConsoleMode", ctypes.windll.kernel32))
    INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value
    def not_a_console(handle):
        # A handle is "not a console" when invalid, not of char type, or
        # when GetConsoleMode fails on it.
        if handle == INVALID_HANDLE_VALUE or handle is None:
        return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR
                or GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)
    def next_nonbmp_pos(s):
            # Position of the first character outside the Basic
            # Multilingual Plane (needs surrogate handling on Windows).
            return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
        except StopIteration:
        count = min(next_nonbmp_pos(s), 1024)
            h, s, count if count else 2, ctypes.byref(written), None)
            raise OSError('Failed to write string')
        if not count: # We just wrote a non-BMP character
            assert written.value == 2
            assert written.value > 0
            s = s[written.value:]
# Write text to the given stream, handling Windows consoles, byte streams
# and buffered text streams.
# NOTE(review): the default-out assignment, the returns and the final
# plain-write fallback appear elided in this view.
def write_string(s, out=None, encoding=None):
    assert type(s) == compat_str
    if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
        if _windows_write_string(s, out):
    if ('b' in getattr(out, 'mode', '') or
            sys.version_info[0] < 3): # Python 2 lies about mode of sys.stderr
        byt = s.encode(encoding or preferredencoding(), 'ignore')
    elif hasattr(out, 'buffer'):
        enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
        byt = s.encode(enc, 'ignore')
        out.buffer.write(byt)
# Convert a byte string into a list of integer byte values.
# NOTE(review): the empty-input guard and the Python 3 return appear elided
# in this view.
def bytes_to_intlist(bs):
    if isinstance(bs[0], int): # Python 3
        return [ord(c) for c in bs]
# Convert a list of integer byte values back into a byte string.
# NOTE(review): the empty-input guard and the Python 3 branch appear elided
# in this view.
def intlist_to_bytes(xs):
    if isinstance(chr(0), bytes): # Python 2
        return ''.join([chr(x) for x in xs])
# Cross-platform file locking
# NOTE(review): several scaffolding lines (the OVERLAPPED _fields_ header,
# list terminators, the _unlock_file def headers and the POSIX else:/import
# fcntl lines) appear elided in this view.
if sys.platform == 'win32':
    import ctypes.wintypes
    class OVERLAPPED(ctypes.Structure):
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),
    kernel32 = ctypes.windll.kernel32
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    # Lock the whole file: low/high halves of the 64-bit byte count.
    whole_low = 0xffffffff
    whole_high = 0x7fffffff
    def _lock_file(f, exclusive):
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)
        handle = msvcrt.get_osfhandle(f.fileno())
        if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
                          whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Locking file failed: %r' % ctypes.FormatError())
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        if not UnlockFileEx(handle, 0,
                            whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())
    def _lock_file(f, exclusive):
        fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)
        fcntl.flock(f, fcntl.LOCK_UN)
# Context-manager wrapper around a file object that holds a cross-platform
# lock (shared for 'r', exclusive for 'a'/'w') while open.
# NOTE(review): the self.mode assignment, the __enter__ header, the lock
# error-cleanup and the __exit__/iteration bodies appear elided in this
# view.
class locked_file(object):
    def __init__(self, filename, mode, encoding=None):
        assert mode in ['r', 'a', 'w']
        self.f = io.open(filename, mode, encoding=encoding)
        exclusive = self.mode != 'r'
            _lock_file(self.f, exclusive)
    def __exit__(self, etype, value, traceback):
    def write(self, *args):
        return self.f.write(*args)
    def read(self, *args):
        return self.f.read(*args)
def get_filesystem_encoding():
    """Return sys.getfilesystemencoding(), falling back to 'utf-8' when the
    platform reports no encoding."""
    enc = sys.getfilesystemencoding()
    if enc is None:
        return 'utf-8'
    return enc
# Quote a list of arguments for safe display as a shell command line.
# NOTE(review): the accumulator initialisation and the loop header appear
# elided in this view.
def shell_quote(args):
    encoding = get_filesystem_encoding()
        if isinstance(a, bytes):
            # We may get a filename encoded with 'encodeFilename'
            a = a.decode(encoding)
        quoted_args.append(pipes.quote(a))
    return u' '.join(quoted_args)
# NOTE(review): the body of this generator appears elided in this view.
def takewhile_inclusive(pred, seq):
    """ Like itertools.takewhile, but include the latest evaluated element
    (the first element so that Not pred(e)) """
# Encode extra data into a URL fragment; unsmuggle_url() is the inverse.
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use. """
    sdata = compat_urllib_parse.urlencode(
        {u'__youtubedl_smuggle': json.dumps(data)})
    return url + u'#' + sdata
# Inverse of smuggle_url(): split off the fragment payload, if any.
# NOTE(review): the final "return url, data" appears elided in this view.
def unsmuggle_url(smug_url, default=None):
    # Style nit: `'#__youtubedl_smuggle' not in smug_url` would be more idiomatic.
    if not '#__youtubedl_smuggle' in smug_url:
        return smug_url, default
    url, _, sdata = smug_url.rpartition(u'#')
    jsond = compat_parse_qs(sdata)[u'__youtubedl_smuggle'][0]
    data = json.loads(jsond)
# Human-readable byte count, e.g. 1536 -> "1.50KiB".
# NOTE(review): the None/zero guards and the float conversion appear elided
# in this view. Also note the parameter shadows the `bytes` builtin.
def format_bytes(bytes):
    if type(bytes) is str:
        exponent = int(math.log(bytes, 1024.0))
    suffix = [u'B', u'KiB', u'MiB', u'GiB', u'TiB', u'PiB', u'EiB', u'ZiB', u'YiB'][exponent]
    converted = float(bytes) / float(1024 ** exponent)
    return u'%.2f%s' % (converted, suffix)
# Determine the terminal width from $COLUMNS or stty, best-effort.
# NOTE(review): the COLUMNS parsing, the try/except and the stty command
# list appear elided in this view.
def get_term_width():
    columns = compat_getenv('COLUMNS', None)
        sp = subprocess.Popen(
            stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        out, err = sp.communicate()
        return int(out.split()[1])
# NOTE(review): the ENGLISH_NAMES list header and the try/except around the
# index lookup appear elided in this view.
def month_by_name(name):
    """ Return the number of a month by (locale-independently) English name """
        u'January', u'February', u'March', u'April', u'May', u'June',
        u'July', u'August', u'September', u'October', u'November', u'December']
        # 1-based month number.
        return ENGLISH_NAMES.index(name) + 1
# Escape bare ampersands in an XML string, leaving existing entities and
# numeric character references untouched.
# NOTE(review): the surrounding re.sub( call and the replacement argument
# appear elided in this view.
def fix_xml_ampersands(xml_str):
    """Replace all the '&' by '&amp;' in XML"""
        r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
# Set the process title (visible in ps) via prctl(PR_SET_NAME) on glibc.
# NOTE(review): the try/except headers around the libc load and the prctl
# call appear elided in this view.
def setproctitle(title):
    assert isinstance(title, compat_str)
        libc = ctypes.cdll.LoadLibrary("libc.so.6")
    title_bytes = title.encode('utf-8')
    buf = ctypes.create_string_buffer(len(title_bytes))
    buf.value = title_bytes
        # 15 == PR_SET_NAME
        libc.prctl(15, buf, 0, 0, 0)
    except AttributeError:
        return # Strange libc, just skip this
# Strip a prefix from s when present.
# NOTE(review): the fallback "return s" appears elided in this view.
def remove_start(s, start):
    if s.startswith(start):
        return s[len(start):]
# Strip a suffix from s when present.
# NOTE(review): the endswith() guard and the fallback return appear elided
# in this view.
def remove_end(s, end):
        return s[:-len(end)]
def url_basename(url):
    """Return the last path component of url ('' when the path ends in '/')."""
    parsed_path = compat_urlparse.urlparse(url).path
    parts = parsed_path.strip(u'/').split(u'/')
    return parts[-1]
# A Request subclass that issues HTTP HEAD instead of GET.
# NOTE(review): the "return" of get_method() appears elided in this view.
class HEADRequest(compat_urllib_request.Request):
    def get_method(self):
# Convert v to an int scaled by invscale/scale, or default when v is None;
# optionally read the value from attribute get_attr first.
# NOTE(review): the guards surrounding the getattr lookup appear elided in
# this view.
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
            v = getattr(v, get_attr, None)
    return default if v is None else (int(v) * invscale // scale)
def str_or_none(v, default=None):
    """Coerce v to compat_str, returning default when v is None."""
    if v is None:
        return default
    return compat_str(v)
# Relaxed int parser: strips thousands separators ('.', ',') and '+'.
# NOTE(review): the None guard and the final int() conversion appear elided
# in this view.
def str_to_int(int_str):
    """ A more relaxed version of int_or_none """
    int_str = re.sub(r'[,\.\+]', u'', int_str)
def float_or_none(v, scale=1, invscale=1, default=None):
    """Convert v to a float scaled by invscale/scale, or return default
    when v is None."""
    if v is None:
        return default
    return float(v) * invscale / scale
# Parse a human duration string ("1:02:03", "4 min 5 s", ...) to seconds.
# NOTE(review): the None guard, the re.match( header, the no-match return
# and the group guards appear elided in this view.
def parse_duration(s):
        r'(?i)(?:(?:(?P<hours>[0-9]+)\s*(?:[:h]|hours?)\s*)?(?P<mins>[0-9]+)\s*(?:[:m]|mins?|minutes?)\s*)?(?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*(?:s|secs?|seconds?)?$', s)
    res = int(m.group('secs'))
        res += int(m.group('mins')) * 60
        if m.group('hours'):
            res += int(m.group('hours')) * 60 * 60
        res += float(m.group('ms'))
def prepend_extension(filename, ext):
    """Insert ext in front of the real extension:
    ('video.mp4', 'temp') -> 'video.temp.mp4'."""
    stem, real_ext = os.path.splitext(filename)
    return u'%s.%s%s' % (stem, ext, real_ext)
# NOTE(review): the try/except (OSError -> return False) and the final
# "return exe" appear elided in this view.
# NOTE(review): `args=[]` is a mutable default — harmless as long as it is
# never mutated; worth confirming.
def check_executable(exe, args=[]):
    """ Checks if the given binary is installed somewhere in PATH, and returns its name.
    args can be a list of arguments for a short output (like -version) """
        subprocess.Popen([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
# NOTE(review): the try/except around Popen and the final conditional
# return appear elided in this view.
# NOTE(review): in the default version_re the class '[0-9._-a-zA-Z]'
# contains the range '_-a' — probably '[-0-9._a-zA-Z]' was intended;
# confirm before relying on it.
def get_exe_version(exe, args=['--version'],
                    version_re=r'version\s+([0-9._-a-zA-Z]+)',
                    unrecognized=u'present'):
    """ Returns the version of the specified executable,
    or False if the executable is not present """
        out, err = subprocess.Popen(
            stdout=subprocess.PIPE, stderr=subprocess.STDOUT).communicate()
    firstline = out.partition(b'\n')[0].decode('ascii', 'ignore')
    m = re.search(version_re, firstline)
# Abstract base for lazily-fetched, page-backed sequences; subclasses are
# expected to implement getslice().
# NOTE(review): the "def __len__(self):" line appears elided in this view.
class PagedList(object):
        # This is only useful for tests
        return len(self.getslice())
# PagedList backed by a pagefunc called on demand, page by page.
# NOTE(review): several scaffolding lines (res initialisation, continue/
# break statements, the startv/endv assignment headers and the final
# return) appear elided in this view.
class OnDemandPagedList(PagedList):
    def __init__(self, pagefunc, pagesize):
        self._pagefunc = pagefunc
        self._pagesize = pagesize
    def getslice(self, start=0, end=None):
        for pagenum in itertools.count(start // self._pagesize):
            firstid = pagenum * self._pagesize
            nextfirstid = pagenum * self._pagesize + self._pagesize
            if start >= nextfirstid:
            page_results = list(self._pagefunc(pagenum))
                start % self._pagesize
                if firstid <= start < nextfirstid
                ((end - 1) % self._pagesize) + 1
                if (end is not None and firstid <= end <= nextfirstid)
            if startv != 0 or endv is not None:
                page_results = page_results[startv:endv]
            res.extend(page_results)
            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            if len(page_results) + startv < self._pagesize:
            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == nextfirstid:
# PagedList variant where the total page count is known in advance.
# NOTE(review): several scaffolding lines (res initialisation, the end_page
# min( header, if guards, res.extend and the final return) appear elided
# in this view.
class InAdvancePagedList(PagedList):
    def __init__(self, pagefunc, pagecount, pagesize):
        self._pagefunc = pagefunc
        self._pagecount = pagecount
        self._pagesize = pagesize
    def getslice(self, start=0, end=None):
        start_page = start // self._pagesize
            self._pagecount if end is None else (end // self._pagesize + 1))
        skip_elems = start - start_page * self._pagesize
        only_more = None if end is None else end - start
        for pagenum in range(start_page, end_page):
            page = list(self._pagefunc(pagenum))
                page = page[skip_elems:]
            if only_more is not None:
                if len(page) < only_more:
                    only_more -= len(page)
                    page = page[:only_more]
# Decode uppercase \UXXXXXXXX escape sequences found in s.
# NOTE(review): the "return re.sub(" header and the trailing ", s)"
# argument appear elided in this view.
def uppercase_escape(s):
    unicode_escape = codecs.getdecoder('unicode_escape')
        r'\\U[0-9a-fA-F]{8}',
        lambda m: unicode_escape(m.group(0))[0],
def escape_rfc3986(s):
    """Percent-encode the characters that RFC 3986 does not allow verbatim."""
    needs_bytes = sys.version_info < (3, 0) and isinstance(s, unicode)
    if needs_bytes:
        # Python 2's quote() expects a byte string.
        s = s.encode('utf-8')
    return compat_urllib_parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")
# NOTE(review): the trailing ").geturl()" that rebuilds the URL string
# appears elided in this view.
def escape_url(url):
    """Escape URL as suggested by RFC 3986"""
    url_parsed = compat_urllib_parse_urlparse(url)
    return url_parsed._replace(
        path=escape_rfc3986(url_parsed.path),
        params=escape_rfc3986(url_parsed.params),
        query=escape_rfc3986(url_parsed.query),
        fragment=escape_rfc3986(url_parsed.fragment)
    # Probe whether struct accepts text format specs.
    # NOTE(review): the "try:"/"except TypeError:"/"else:" scaffolding
    # appears elided in this view.
    struct.pack(u'!I', 0)
    # In Python 2.6 (and some 2.7 versions), struct requires a bytes argument
    def struct_pack(spec, *args):
        if isinstance(spec, compat_str):
            spec = spec.encode('ascii')
        return struct.pack(spec, *args)
    def struct_unpack(spec, *args):
        if isinstance(spec, compat_str):
            spec = spec.encode('ascii')
        return struct.unpack(spec, *args)
    struct_pack = struct.pack
    struct_unpack = struct.unpack
# Read a batch file of URLs, dropping BOMs, comments and blank lines.
# NOTE(review): the inner fixup(url) def header and its returns appear
# elided in this view.
def read_batch_urls(batch_fd):
        if not isinstance(url, compat_str):
            url = url.decode('utf-8', 'replace')
        # NOTE(review): this matches a UTF-8 BOM as Latin-1 code points;
        # after a utf-8 decode the BOM would be u'\ufeff' instead — confirm
        # which form actually reaches this point.
        BOM_UTF8 = u'\xef\xbb\xbf'
        if url.startswith(BOM_UTF8):
            url = url[len(BOM_UTF8):]
        if url.startswith(('#', ';', ']')):
    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]
def urlencode_postdata(*args, **kargs):
    """URL-encode POST data and return it as ASCII bytes, as urllib expects."""
    encoded = compat_urllib_parse.urlencode(*args, **kargs)
    return encoded.encode('ascii')
    # Element.iter only exists on Python >= 2.7 / 3.2; emulate it otherwise.
    # NOTE(review): the introducing "try:" line appears elided in this view.
    etree_iter = xml.etree.ElementTree.Element.iter
except AttributeError: # Python <=2.6
    etree_iter = lambda n: n.findall('.//*')
    # NOTE(review): the enclosing "def parse_xml(s):" line and the final
    # "return tree" appear elided in this view.
    class TreeBuilder(xml.etree.ElementTree.TreeBuilder):
        def doctype(self, name, pubid, system):
            pass # Ignore doctypes
    parser = xml.etree.ElementTree.XMLParser(target=TreeBuilder())
    kwargs = {'parser': parser} if sys.version_info >= (2, 7) else {}
    tree = xml.etree.ElementTree.XML(s.encode('utf-8'), **kwargs)
    # Fix up XML parser in Python 2.x
    if sys.version_info < (3, 0):
        for n in etree_iter(tree):
            if n.text is not None:
                if not isinstance(n.text, compat_str):
                    n.text = n.text.decode('utf-8')
# Parse an age limit: either a bare number ("18", "16+") or a US rating
# looked up in US_RATINGS.
# NOTE(review): the None guard appears elided in this view.
def parse_age_limit(s):
    m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
    return int(m.group('age')) if m else US_RATINGS.get(s, None)
def strip_jsonp(code):
    """Remove a JSONP wrapper ("callback({...});") and return the payload."""
    jsonp_wrapper = re.compile(r'(?s)^[a-zA-Z0-9_]+\s*\(\s*(.*)\);?\s*?\s*$')
    return jsonp_wrapper.sub(r'\1', code)
# Convert JavaScript object notation to strict JSON, best effort.
# NOTE(review): the inner fix_kv(m) helper header, its returns, the
# replacement-map body and the final return appear elided in this view.
def js_to_json(code):
        if v in ('true', 'false', 'null'):
        if v.startswith('"'):
        if v.startswith("'"):
            v = re.sub(r"\\\\|\\'|\"", lambda m: {
    res = re.sub(r'''(?x)
        "(?:[^"\\]*(?:\\\\|\\")?)*"|
        '(?:[^'\\]*(?:\\\\|\\')?)*'|
        [a-zA-Z_][a-zA-Z_0-9]*
    # Drop trailing commas before a closing bracket.
    res = re.sub(r',(\s*\])', lambda m: m.group(1), res)
# Return a function mapping a quality id to its numeric rank in quality_ids.
# NOTE(review): the inner helper definition, its error handling and the
# final return appear elided in this view.
def qualities(quality_ids):
    """ Get a numeric quality value out of a list of possible values """
            return quality_ids.index(qid)
# Default output filename template: "<title>-<id>.<ext>".
DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
# Truncate s to at most `length` characters, ending with an ellipsis.
# NOTE(review): the ELLIPSES constant, the None/short-string guards and the
# assertion appear elided in this view.
def limit_length(s, length):
    """ Add ellipses to overly long strings """
        return s[:length - len(ELLIPSES)] + ELLIPSES
def version_tuple(v):
    """Split a dotted version string into a list of integer components."""
    return list(map(int, v.split('.')))
# Compare two dotted version strings; assume_new decides the answer when a
# version is missing or unparsable.
# NOTE(review): the guards and the try/except around the comparison appear
# elided in this view.
def is_outdated_version(version, limit, assume_new=True):
        return not assume_new
        return version_tuple(version) < version_tuple(limit)
        return not assume_new