2 # -*- coding: utf-8 -*-
4 from __future__ import unicode_literals
32 import xml.etree.ElementTree
42 compat_socket_create_connection,
46 compat_urllib_parse_urlparse,
47 compat_urllib_request,
53 # This is not clearly defined otherwise
54 compiled_regex_type = type(re.compile(''))
57 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20150101 Firefox/20.0 (Chrome)',
58 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
59 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
60 'Accept-Encoding': 'gzip, deflate',
61 'Accept-Language': 'en-us,en;q=0.5',
# Locale-independent English month names; used by month_by_name and
# month_by_abbreviation so date parsing does not depend on the user's locale.
ENGLISH_MONTH_NAMES = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December']
70 def preferredencoding():
71 """Get preferred encoding.
73 Returns the best encoding scheme for the system, based on
74 locale.getpreferredencoding() and some further tweaks.
77 pref = locale.getpreferredencoding()
85 def write_json_file(obj, fn):
86 """ Encode obj as JSON and write it to fn, atomically if possible """
88 fn = encodeFilename(fn)
89 if sys.version_info < (3, 0) and sys.platform != 'win32':
90 encoding = get_filesystem_encoding()
91 # os.path.basename returns a bytes object, but NamedTemporaryFile
92 # will fail if the filename contains non ascii characters unless we
93 # use a unicode object
94 path_basename = lambda f: os.path.basename(fn).decode(encoding)
95 # the same for os.path.dirname
96 path_dirname = lambda f: os.path.dirname(fn).decode(encoding)
98 path_basename = os.path.basename
99 path_dirname = os.path.dirname
103 'prefix': path_basename(fn) + '.',
104 'dir': path_dirname(fn),
108 # In Python 2.x, json.dump expects a bytestream.
109 # In Python 3.x, it writes to a character stream
110 if sys.version_info < (3, 0):
118 tf = tempfile.NamedTemporaryFile(**compat_kwargs(args))
123 if sys.platform == 'win32':
124 # Need to remove existing file on Windows, else os.rename raises
125 # WindowsError or FileExistsError.
130 os.rename(tf.name, fn)
139 if sys.version_info >= (2, 7):
140 def find_xpath_attr(node, xpath, key, val):
141 """ Find the xpath xpath[@key=val] """
142 assert re.match(r'^[a-zA-Z-]+$', key)
143 assert re.match(r'^[a-zA-Z0-9@\s:._-]*$', val)
144 expr = xpath + "[@%s='%s']" % (key, val)
145 return node.find(expr)
147 def find_xpath_attr(node, xpath, key, val):
148 # Here comes the crazy part: In 2.6, if the xpath is a unicode,
149 # .//node does not match if a node is a direct child of . !
150 if isinstance(xpath, compat_str):
151 xpath = xpath.encode('ascii')
153 for f in node.findall(xpath):
154 if f.attrib.get(key) == val:
158 # On python2.6 the xml.etree.ElementTree.Element methods don't support
159 # the namespace parameter
162 def xpath_with_ns(path, ns_map):
163 components = [c.split(':') for c in path.split('/')]
167 replaced.append(c[0])
170 replaced.append('{%s}%s' % (ns_map[ns], tag))
171 return '/'.join(replaced)
174 def xpath_text(node, xpath, name=None, fatal=False):
175 if sys.version_info < (2, 7): # Crazy 2.6
176 xpath = xpath.encode('ascii')
179 if n is None or n.text is None:
181 name = xpath if name is None else name
182 raise ExtractorError('Could not find XML element %s' % name)
def get_element_by_id(id, html):
    """Return the content of the tag with the specified ID in the passed HTML document"""
    # NOTE: `id` shadows the builtin, but renaming it would break callers that
    # pass it by keyword, so the public signature stays unchanged.
    return get_element_by_attribute("id", id, html)
193 def get_element_by_attribute(attribute, value, html):
194 """Return the content of the tag with the specified attribute in the passed HTML document"""
196 m = re.search(r'''(?xs)
198 (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
200 (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
204 ''' % (re.escape(attribute), re.escape(value)), html)
208 res = m.group('content')
210 if res.startswith('"') or res.startswith("'"):
213 return unescapeHTML(res)
216 def clean_html(html):
217 """Clean an HTML snippet into a readable string"""
219 if html is None: # Convenience for sanitizing descriptions etc.
223 html = html.replace('\n', ' ')
224 html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
225 html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
227 html = re.sub('<.*?>', '', html)
228 # Replace html entities
229 html = unescapeHTML(html)
233 def sanitize_open(filename, open_mode):
234 """Try to open the given filename, and slightly tweak it if this fails.
236 Attempts to open the given filename. If this fails, it tries to change
237 the filename slightly, step by step, until it's either able to open it
238 or it fails and raises a final exception, like the standard open()
241 It returns the tuple (stream, definitive_file_name).
245 if sys.platform == 'win32':
247 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
248 return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
249 stream = open(encodeFilename(filename), open_mode)
250 return (stream, filename)
251 except (IOError, OSError) as err:
252 if err.errno in (errno.EACCES,):
255 # In case of error, try to remove win32 forbidden chars
256 alt_filename = sanitize_path(filename)
257 if alt_filename == filename:
260 # An exception here should be caught in the caller
261 stream = open(encodeFilename(alt_filename), open_mode)
262 return (stream, alt_filename)
265 def timeconvert(timestr):
266 """Convert RFC 2822 defined time string into system timestamp"""
268 timetuple = email.utils.parsedate_tz(timestr)
269 if timetuple is not None:
270 timestamp = email.utils.mktime_tz(timetuple)
274 def sanitize_filename(s, restricted=False, is_id=False):
275 """Sanitizes a string so it could be used as part of a filename.
276 If restricted is set, use a stricter subset of allowed characters.
277 Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
279 def replace_insane(char):
280 if char == '?' or ord(char) < 32 or ord(char) == 127:
283 return '' if restricted else '\''
285 return '_-' if restricted else ' -'
286 elif char in '\\/|*<>':
288 if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
290 if restricted and ord(char) > 127:
295 s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)
296 result = ''.join(map(replace_insane, s))
298 while '__' in result:
299 result = result.replace('__', '_')
300 result = result.strip('_')
301 # Common case of "Foreign band name - English song title"
302 if restricted and result.startswith('-_'):
304 if result.startswith('-'):
305 result = '_' + result[len('-'):]
306 result = result.lstrip('.')
312 def sanitize_path(s):
313 """Sanitizes and normalizes path on Windows"""
314 if sys.platform != 'win32':
316 drive_or_unc, _ = os.path.splitdrive(s)
317 if sys.version_info < (2, 7) and not drive_or_unc:
318 drive_or_unc, _ = os.path.splitunc(s)
319 norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
323 path_part if path_part in ['.', '..'] else re.sub('(?:[/<>:"\\|\\\\?\\*]|\.$)', '#', path_part)
324 for path_part in norm_path]
326 sanitized_path.insert(0, drive_or_unc + os.path.sep)
327 return os.path.join(*sanitized_path)
def sanitize_url_path_consecutive_slashes(url):
    """Collapses consecutive slashes in URLs' path"""
    parts = list(compat_urlparse.urlparse(url))
    # Index 2 of the 6-tuple returned by urlparse is the path component.
    parts[2] = re.sub(r'/{2,}', '/', parts[2])
    return compat_urlparse.urlunparse(parts)
337 def orderedSet(iterable):
338 """ Remove all duplicates from the input iterable """
346 def _htmlentity_transform(entity):
347 """Transforms an HTML entity to a character."""
348 # Known non-numeric HTML entity
349 if entity in compat_html_entities.name2codepoint:
350 return compat_chr(compat_html_entities.name2codepoint[entity])
352 mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
354 numstr = mobj.group(1)
355 if numstr.startswith('x'):
357 numstr = '0%s' % numstr
360 return compat_chr(int(numstr, base))
362 # Unknown entity in name, return its literal representation
363 return ('&%s;' % entity)
369 assert type(s) == compat_str
372 r'&([^;]+);', lambda m: _htmlentity_transform(m.group(1)), s)
375 def get_subprocess_encoding():
376 if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
377 # For subprocess calls, encode with locale encoding
378 # Refer to http://stackoverflow.com/a/9951851/35070
379 encoding = preferredencoding()
381 encoding = sys.getfilesystemencoding()
387 def encodeFilename(s, for_subprocess=False):
389 @param s The name of the file
392 assert type(s) == compat_str
394 # Python 3 has a Unicode API
395 if sys.version_info >= (3, 0):
398 # Pass '' directly to use Unicode APIs on Windows 2000 and up
399 # (Detecting Windows NT 4 is tricky because 'major >= 4' would
400 # match Windows 9x series as well. Besides, NT 4 is obsolete.)
401 if not for_subprocess and sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
404 return s.encode(get_subprocess_encoding(), 'ignore')
407 def decodeFilename(b, for_subprocess=False):
409 if sys.version_info >= (3, 0):
412 if not isinstance(b, bytes):
415 return b.decode(get_subprocess_encoding(), 'ignore')
def encodeArgument(s):
    # Normalize a command-line argument to bytes suitable for subprocess use.
    if not isinstance(s, compat_str):
        # Legacy code that uses byte strings
        # Uncomment the following line after fixing all post processors
        # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
        s = s.decode('ascii')
    # for_subprocess=True makes encodeFilename use get_subprocess_encoding()
    return encodeFilename(s, True)
def decodeArgument(b):
    # Inverse of encodeArgument: decode using the subprocess encoding rules.
    return decodeFilename(b, True)
431 def decodeOption(optval):
434 if isinstance(optval, bytes):
435 optval = optval.decode(preferredencoding())
437 assert isinstance(optval, compat_str)
441 def formatSeconds(secs):
443 return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
445 return '%d:%02d' % (secs // 60, secs % 60)
450 def make_HTTPS_handler(params, **kwargs):
451 opts_no_check_certificate = params.get('nocheckcertificate', False)
452 if hasattr(ssl, 'create_default_context'): # Python >= 3.4 or 2.7.9
453 context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH)
454 if opts_no_check_certificate:
455 context.check_hostname = False
456 context.verify_mode = ssl.CERT_NONE
458 return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
461 # (create_default_context present but HTTPSHandler has no context=)
464 if sys.version_info < (3, 2):
465 return YoutubeDLHTTPSHandler(params, **kwargs)
467 context = ssl.SSLContext(ssl.PROTOCOL_TLSv1)
468 context.verify_mode = (ssl.CERT_NONE
469 if opts_no_check_certificate
470 else ssl.CERT_REQUIRED)
471 context.set_default_verify_paths()
472 return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
475 def bug_reports_message():
476 if ytdl_is_updateable():
477 update_cmd = 'type youtube-dl -U to update'
479 update_cmd = 'see https://yt-dl.org/update on how to update'
480 msg = '; please report this issue on https://yt-dl.org/bug .'
481 msg += ' Make sure you are using the latest version; %s.' % update_cmd
482 msg += ' Be sure to call youtube-dl with the --verbose flag and include its complete output.'
486 class ExtractorError(Exception):
487 """Error during info extraction."""
489 def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
490 """ tb, if given, is the original traceback (so that it can be printed out).
491 If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
494 if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
496 if video_id is not None:
497 msg = video_id + ': ' + msg
499 msg += ' (caused by %r)' % cause
501 msg += bug_reports_message()
502 super(ExtractorError, self).__init__(msg)
505 self.exc_info = sys.exc_info() # preserve original exception
507 self.video_id = video_id
509 def format_traceback(self):
510 if self.traceback is None:
512 return ''.join(traceback.format_tb(self.traceback))
class UnsupportedError(ExtractorError):
    """Raised when no extractor matches the given URL; always an expected error."""

    def __init__(self, url):
        message = 'Unsupported URL: %s' % url
        super(UnsupportedError, self).__init__(message, expected=True)
522 class RegexNotFoundError(ExtractorError):
523 """Error when a regex didn't match"""
527 class DownloadError(Exception):
528 """Download Error exception.
530 This exception may be thrown by FileDownloader objects if they are not
531 configured to continue on errors. They will contain the appropriate
535 def __init__(self, msg, exc_info=None):
536 """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
537 super(DownloadError, self).__init__(msg)
538 self.exc_info = exc_info
541 class SameFileError(Exception):
542 """Same File exception.
544 This exception will be thrown by FileDownloader objects if they detect
545 multiple files would have to be downloaded to the same file on disk.
550 class PostProcessingError(Exception):
551 """Post Processing exception.
553 This exception may be raised by PostProcessor's .run() method to
554 indicate an error in the postprocessing task.
557 def __init__(self, msg):
561 class MaxDownloadsReached(Exception):
562 """ --max-downloads limit has been reached. """
566 class UnavailableVideoError(Exception):
567 """Unavailable Format exception.
569 This exception will be thrown when a video is requested
570 in a format that is not available for that video.
575 class ContentTooShortError(Exception):
576 """Content Too Short exception.
578 This exception may be raised by FileDownloader objects when a file they
579 download is too small for what the server announced first, indicating
580 the connection was probably interrupted.
    def __init__(self, downloaded, expected):
        # downloaded: number of bytes actually received
        # expected: number of bytes the server announced
        self.downloaded = downloaded
        self.expected = expected
591 def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
592 hc = http_class(*args, **kwargs)
593 source_address = ydl_handler._params.get('source_address')
594 if source_address is not None:
595 sa = (source_address, 0)
596 if hasattr(hc, 'source_address'): # Python 2.7+
597 hc.source_address = sa
599 def _hc_connect(self, *args, **kwargs):
600 sock = compat_socket_create_connection(
601 (self.host, self.port), self.timeout, sa)
603 self.sock = ssl.wrap_socket(
604 sock, self.key_file, self.cert_file,
605 ssl_version=ssl.PROTOCOL_TLSv1)
608 hc.connect = functools.partial(_hc_connect, hc)
613 class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
614 """Handler for HTTP requests and responses.
616 This class, when installed with an OpenerDirector, automatically adds
617 the standard headers to every HTTP request and handles gzipped and
618 deflated responses from web servers. If compression is to be avoided in
619 a particular request, the original request in the program code only has
620 to include the HTTP header "Youtubedl-No-Compression", which will be
621 removed before making the real request.
623 Part of this code was copied from:
625 http://techknack.net/python-urllib2-handlers/
627 Andrew Rowls, the author of that code, agreed to release it to the
    def __init__(self, params, *args, **kwargs):
        """Store the youtube-dl params dict and initialize the base HTTPHandler."""
        compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
        self._params = params
635 def http_open(self, req):
636 return self.do_open(functools.partial(
637 _create_http_connection, self, compat_http_client.HTTPConnection, False),
643 return zlib.decompress(data, -zlib.MAX_WBITS)
645 return zlib.decompress(data)
648 def addinfourl_wrapper(stream, headers, url, code):
649 if hasattr(compat_urllib_request.addinfourl, 'getcode'):
650 return compat_urllib_request.addinfourl(stream, headers, url, code)
651 ret = compat_urllib_request.addinfourl(stream, headers, url)
655 def http_request(self, req):
656 for h, v in std_headers.items():
657 # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
658 # The dict keys are capitalized because of this bug by urllib
659 if h.capitalize() not in req.headers:
661 if 'Youtubedl-no-compression' in req.headers:
662 if 'Accept-encoding' in req.headers:
663 del req.headers['Accept-encoding']
664 del req.headers['Youtubedl-no-compression']
666 if sys.version_info < (2, 7) and '#' in req.get_full_url():
667 # Python 2.6 is brain-dead when it comes to fragments
668 req._Request__original = req._Request__original.partition('#')[0]
669 req._Request__r_type = req._Request__r_type.partition('#')[0]
673 def http_response(self, req, resp):
676 if resp.headers.get('Content-encoding', '') == 'gzip':
677 content = resp.read()
678 gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
680 uncompressed = io.BytesIO(gz.read())
681 except IOError as original_ioerror:
                # There may be junk at the end of the file
683 # See http://stackoverflow.com/q/4928560/35070 for details
684 for i in range(1, 1024):
686 gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
687 uncompressed = io.BytesIO(gz.read())
692 raise original_ioerror
693 resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
694 resp.msg = old_resp.msg
696 if resp.headers.get('Content-encoding', '') == 'deflate':
697 gz = io.BytesIO(self.deflate(resp.read()))
698 resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
699 resp.msg = old_resp.msg
702 https_request = http_request
703 https_response = http_response
706 class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
    def __init__(self, params, https_conn_class=None, *args, **kwargs):
        """Initialize the base HTTPSHandler; https_conn_class defaults to
        compat_http_client.HTTPSConnection when not given."""
        compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
        self._https_conn_class = https_conn_class or compat_http_client.HTTPSConnection
        self._params = params
712 def https_open(self, req):
714 if hasattr(self, '_context'): # python > 2.6
715 kwargs['context'] = self._context
716 if hasattr(self, '_check_hostname'): # python 3.x
717 kwargs['check_hostname'] = self._check_hostname
718 return self.do_open(functools.partial(
719 _create_http_connection, self, self._https_conn_class, True),
723 def parse_iso8601(date_str, delimiter='T', timezone=None):
724 """ Return a UNIX timestamp from the given date """
731 r'(\.[0-9]+)?(?:Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
734 timezone = datetime.timedelta()
736 date_str = date_str[:-len(m.group(0))]
737 if not m.group('sign'):
738 timezone = datetime.timedelta()
740 sign = 1 if m.group('sign') == '+' else -1
741 timezone = datetime.timedelta(
742 hours=sign * int(m.group('hours')),
743 minutes=sign * int(m.group('minutes')))
744 date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
745 dt = datetime.datetime.strptime(date_str, date_format) - timezone
746 return calendar.timegm(dt.timetuple())
749 def unified_strdate(date_str, day_first=True):
750 """Return a string with the date in the format YYYYMMDD"""
756 date_str = date_str.replace(',', ' ')
757 # %z (UTC offset) is only supported in python>=3.2
758 if not re.match(r'^[0-9]{1,2}-[0-9]{1,2}-[0-9]{4}$', date_str):
759 date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str)
760 # Remove AM/PM + timezone
761 date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
763 format_expressions = [
768 '%b %dst %Y %I:%M%p',
769 '%b %dnd %Y %I:%M%p',
770 '%b %dth %Y %I:%M%p',
776 '%Y-%m-%d %H:%M:%S.%f',
779 '%Y-%m-%dT%H:%M:%SZ',
780 '%Y-%m-%dT%H:%M:%S.%fZ',
781 '%Y-%m-%dT%H:%M:%S.%f0Z',
783 '%Y-%m-%dT%H:%M:%S.%f',
787 format_expressions.extend([
795 format_expressions.extend([
802 for expression in format_expressions:
804 upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
807 if upload_date is None:
808 timetuple = email.utils.parsedate_tz(date_str)
810 upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
814 def determine_ext(url, default_ext='unknown_video'):
817 guess = url.partition('?')[0].rpartition('.')[2]
818 if re.match(r'^[A-Za-z0-9]+$', guess):
def subtitles_filename(filename, sub_lang, sub_format):
    """Build the subtitle file name: <base>.<sub_lang>.<sub_format>,
    where <base> is *filename* with its last extension stripped."""
    base = filename.rsplit('.', 1)[0]
    return '.'.join((base, sub_lang, sub_format))
828 def date_from_str(date_str):
830 Return a datetime object from a string in the format YYYYMMDD or
831 (now|today)[+-][0-9](day|week|month|year)(s)?"""
832 today = datetime.date.today()
833 if date_str in ('now', 'today'):
835 if date_str == 'yesterday':
836 return today - datetime.timedelta(days=1)
837 match = re.match('(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
838 if match is not None:
839 sign = match.group('sign')
840 time = int(match.group('time'))
843 unit = match.group('unit')
    # A bad approximation?
852 delta = datetime.timedelta(**{unit: time})
854 return datetime.datetime.strptime(date_str, "%Y%m%d").date()
857 def hyphenate_date(date_str):
859 Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
860 match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
861 if match is not None:
862 return '-'.join(match.groups())
867 class DateRange(object):
868 """Represents a time interval between two dates"""
870 def __init__(self, start=None, end=None):
871 """start and end must be strings in the format accepted by date"""
872 if start is not None:
873 self.start = date_from_str(start)
875 self.start = datetime.datetime.min.date()
877 self.end = date_from_str(end)
879 self.end = datetime.datetime.max.date()
880 if self.start > self.end:
881 raise ValueError('Date range: "%s" , the start date must be before the end date' % self)
885 """Returns a range that only contains the given day"""
888 def __contains__(self, date):
889 """Check if the date is in the range"""
890 if not isinstance(date, datetime.date):
891 date = date_from_str(date)
892 return self.start <= date <= self.end
895 return '%s - %s' % (self.start.isoformat(), self.end.isoformat())
899 """ Returns the platform name as a compat_str """
900 res = platform.platform()
901 if isinstance(res, bytes):
902 res = res.decode(preferredencoding())
904 assert isinstance(res, compat_str)
908 def _windows_write_string(s, out):
909 """ Returns True if the string was written using special methods,
910 False if it has yet to be written out."""
911 # Adapted from http://stackoverflow.com/a/3259271/35070
914 import ctypes.wintypes
922 fileno = out.fileno()
923 except AttributeError:
924 # If the output stream doesn't have a fileno, it's virtual
926 except io.UnsupportedOperation:
927 # Some strange Windows pseudo files?
929 if fileno not in WIN_OUTPUT_IDS:
932 GetStdHandle = ctypes.WINFUNCTYPE(
933 ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
934 (b"GetStdHandle", ctypes.windll.kernel32))
935 h = GetStdHandle(WIN_OUTPUT_IDS[fileno])
937 WriteConsoleW = ctypes.WINFUNCTYPE(
938 ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
939 ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
940 ctypes.wintypes.LPVOID)((b"WriteConsoleW", ctypes.windll.kernel32))
941 written = ctypes.wintypes.DWORD(0)
943 GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)((b"GetFileType", ctypes.windll.kernel32))
944 FILE_TYPE_CHAR = 0x0002
945 FILE_TYPE_REMOTE = 0x8000
946 GetConsoleMode = ctypes.WINFUNCTYPE(
947 ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
948 ctypes.POINTER(ctypes.wintypes.DWORD))(
949 (b"GetConsoleMode", ctypes.windll.kernel32))
950 INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value
952 def not_a_console(handle):
953 if handle == INVALID_HANDLE_VALUE or handle is None:
955 return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR or
956 GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)
961 def next_nonbmp_pos(s):
963 return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
964 except StopIteration:
968 count = min(next_nonbmp_pos(s), 1024)
971 h, s, count if count else 2, ctypes.byref(written), None)
973 raise OSError('Failed to write string')
974 if not count: # We just wrote a non-BMP character
975 assert written.value == 2
978 assert written.value > 0
979 s = s[written.value:]
983 def write_string(s, out=None, encoding=None):
986 assert type(s) == compat_str
988 if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
989 if _windows_write_string(s, out):
992 if ('b' in getattr(out, 'mode', '') or
993 sys.version_info[0] < 3): # Python 2 lies about mode of sys.stderr
994 byt = s.encode(encoding or preferredencoding(), 'ignore')
996 elif hasattr(out, 'buffer'):
997 enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
998 byt = s.encode(enc, 'ignore')
999 out.buffer.write(byt)
1005 def bytes_to_intlist(bs):
1008 if isinstance(bs[0], int): # Python 3
1011 return [ord(c) for c in bs]
1014 def intlist_to_bytes(xs):
1017 return struct_pack('%dB' % len(xs), *xs)
1020 # Cross-platform file locking
1021 if sys.platform == 'win32':
1022 import ctypes.wintypes
1025 class OVERLAPPED(ctypes.Structure):
1027 ('Internal', ctypes.wintypes.LPVOID),
1028 ('InternalHigh', ctypes.wintypes.LPVOID),
1029 ('Offset', ctypes.wintypes.DWORD),
1030 ('OffsetHigh', ctypes.wintypes.DWORD),
1031 ('hEvent', ctypes.wintypes.HANDLE),
1034 kernel32 = ctypes.windll.kernel32
1035 LockFileEx = kernel32.LockFileEx
1036 LockFileEx.argtypes = [
1037 ctypes.wintypes.HANDLE, # hFile
1038 ctypes.wintypes.DWORD, # dwFlags
1039 ctypes.wintypes.DWORD, # dwReserved
1040 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
1041 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
1042 ctypes.POINTER(OVERLAPPED) # Overlapped
1044 LockFileEx.restype = ctypes.wintypes.BOOL
1045 UnlockFileEx = kernel32.UnlockFileEx
1046 UnlockFileEx.argtypes = [
1047 ctypes.wintypes.HANDLE, # hFile
1048 ctypes.wintypes.DWORD, # dwReserved
1049 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
1050 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
1051 ctypes.POINTER(OVERLAPPED) # Overlapped
1053 UnlockFileEx.restype = ctypes.wintypes.BOOL
1054 whole_low = 0xffffffff
1055 whole_high = 0x7fffffff
    def _lock_file(f, exclusive):
        # LockFileEx needs an OVERLAPPED struct carrying the byte offset at
        # which the lock region starts; lock the whole file from offset 0.
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        # Keep the pointer alive on the file object so the matching
        # _unlock_file call can reuse it.
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)
        handle = msvcrt.get_osfhandle(f.fileno())
        # 0x2 is LOCKFILE_EXCLUSIVE_LOCK; 0x0 requests a shared lock.
        if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
                          whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Locking file failed: %r' % ctypes.FormatError())
    def _unlock_file(f):
        # Must be called after _lock_file: reuses the OVERLAPPED pointer
        # stored on the file object there.
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        if not UnlockFileEx(handle, 0,
                            whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())
    def _lock_file(f, exclusive):
        # POSIX advisory locking: exclusive -> LOCK_EX, shared -> LOCK_SH.
        fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)

    def _unlock_file(f):
        # Release the advisory lock taken by _lock_file.
        fcntl.flock(f, fcntl.LOCK_UN)
1085 class locked_file(object):
1086 def __init__(self, filename, mode, encoding=None):
1087 assert mode in ['r', 'a', 'w']
1088 self.f = io.open(filename, mode, encoding=encoding)
1091 def __enter__(self):
1092 exclusive = self.mode != 'r'
1094 _lock_file(self.f, exclusive)
1100 def __exit__(self, etype, value, traceback):
1102 _unlock_file(self.f)
    def write(self, *args):
        # Delegate to the underlying file object.
        return self.f.write(*args)
    def read(self, *args):
        # Delegate to the underlying file object.
        return self.f.read(*args)
def get_filesystem_encoding():
    """Return sys.getfilesystemencoding(), falling back to 'utf-8' when the
    interpreter reports None."""
    enc = sys.getfilesystemencoding()
    if enc is None:
        return 'utf-8'
    return enc
1121 def shell_quote(args):
1123 encoding = get_filesystem_encoding()
1125 if isinstance(a, bytes):
1126 # We may get a filename encoded with 'encodeFilename'
1127 a = a.decode(encoding)
1128 quoted_args.append(pipes.quote(a))
1129 return ' '.join(quoted_args)
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use. """

    payload = compat_urllib_parse.urlencode(
        {'__youtubedl_smuggle': json.dumps(data)})
    return '#'.join((url, payload))
1140 def unsmuggle_url(smug_url, default=None):
1141 if '#__youtubedl_smuggle' not in smug_url:
1142 return smug_url, default
1143 url, _, sdata = smug_url.rpartition('#')
1144 jsond = compat_parse_qs(sdata)['__youtubedl_smuggle'][0]
1145 data = json.loads(jsond)
1149 def format_bytes(bytes):
1152 if type(bytes) is str:
1153 bytes = float(bytes)
1157 exponent = int(math.log(bytes, 1024.0))
1158 suffix = ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB'][exponent]
1159 converted = float(bytes) / float(1024 ** exponent)
1160 return '%.2f%s' % (converted, suffix)
1163 def parse_filesize(s):
    # The lower-case forms are of course incorrect and unofficial,
1168 # but we support those too
1206 units_re = '|'.join(re.escape(u) for u in _UNIT_TABLE)
1208 r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)' % units_re, s)
1212 num_str = m.group('num').replace(',', '.')
1213 mult = _UNIT_TABLE[m.group('unit')]
1214 return int(float(num_str) * mult)
1217 def month_by_name(name):
1218 """ Return the number of a month by (locale-independently) English name """
1221 return ENGLISH_MONTH_NAMES.index(name) + 1
1226 def month_by_abbreviation(abbrev):
1227 """ Return the number of a month by (locale-independently) English
1231 return [s[:3] for s in ENGLISH_MONTH_NAMES].index(abbrev) + 1
1236 def fix_xml_ampersands(xml_str):
1237 """Replace all the '&' by '&' in XML"""
1239 r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
1244 def setproctitle(title):
1245 assert isinstance(title, compat_str)
1247 libc = ctypes.cdll.LoadLibrary("libc.so.6")
1250 title_bytes = title.encode('utf-8')
1251 buf = ctypes.create_string_buffer(len(title_bytes))
1252 buf.value = title_bytes
1254 libc.prctl(15, buf, 0, 0, 0)
1255 except AttributeError:
1256 return # Strange libc, just skip this
1259 def remove_start(s, start):
1260 if s.startswith(start):
1261 return s[len(start):]
1265 def remove_end(s, end):
1267 return s[:-len(end)]
def url_basename(url):
    """Return the final path component of *url* (query/fragment excluded by urlparse)."""
    parsed_path = compat_urlparse.urlparse(url).path
    segments = parsed_path.strip('/').split('/')
    return segments[-1]
1276 class HEADRequest(compat_urllib_request.Request):
1277 def get_method(self):
1281 def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
1284 v = getattr(v, get_attr, None)
1287 return default if v is None else (int(v) * invscale // scale)
def str_or_none(v, default=None):
    """Stringify *v* via compat_str; return *default* when v is None."""
    if v is None:
        return default
    return compat_str(v)
1294 def str_to_int(int_str):
1295 """ A more relaxed version of int_or_none """
1298 int_str = re.sub(r'[,\.\+]', '', int_str)
def float_or_none(v, scale=1, invscale=1, default=None):
    """Convert *v* to float scaled by invscale/scale; return *default* when v is None."""
    if v is None:
        return default
    return float(v) * invscale / scale
1306 def parse_duration(s):
1307 if not isinstance(s, compat_basestring):
1315 (?P<only_mins>[0-9.]+)\s*(?:mins?|minutes?)\s*|
1316 (?P<only_hours>[0-9.]+)\s*(?:hours?)|
1318 \s*(?P<hours_reversed>[0-9]+)\s*(?:[:h]|hours?)\s*(?P<mins_reversed>[0-9]+)\s*(?:[:m]|mins?|minutes?)\s*|
1321 (?:(?P<days>[0-9]+)\s*(?:[:d]|days?)\s*)?
1322 (?P<hours>[0-9]+)\s*(?:[:h]|hours?)\s*
1324 (?P<mins>[0-9]+)\s*(?:[:m]|mins?|minutes?)\s*
1326 (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*(?:s|secs?|seconds?)?
1331 if m.group('only_mins'):
1332 return float_or_none(m.group('only_mins'), invscale=60)
1333 if m.group('only_hours'):
1334 return float_or_none(m.group('only_hours'), invscale=60 * 60)
1336 res += int(m.group('secs'))
1337 if m.group('mins_reversed'):
1338 res += int(m.group('mins_reversed')) * 60
1340 res += int(m.group('mins')) * 60
1341 if m.group('hours'):
1342 res += int(m.group('hours')) * 60 * 60
1343 if m.group('hours_reversed'):
1344 res += int(m.group('hours_reversed')) * 60 * 60
1346 res += int(m.group('days')) * 24 * 60 * 60
1348 res += float(m.group('ms'))
1352 def prepend_extension(filename, ext, expected_real_ext=None):
1353 name, real_ext = os.path.splitext(filename)
1355 '{0}.{1}{2}'.format(name, ext, real_ext)
1356 if not expected_real_ext or real_ext[1:] == expected_real_ext
1357 else '{0}.{1}'.format(filename, ext))
1360 def replace_extension(filename, ext, expected_real_ext=None):
1361 name, real_ext = os.path.splitext(filename)
1362 return '{0}.{1}'.format(
1363 name if not expected_real_ext or real_ext[1:] == expected_real_ext else filename,
def check_executable(exe, args=[]):
    """ Checks if the given binary is installed somewhere in PATH, and returns its name.
    args can be a list of arguments for a short output (like -version) """
    # NOTE: args=[] default is safe here — the list is never mutated, and
    # changing the default would alter the public signature.
    try:
        # We only care whether the spawn succeeds; output is discarded.
        subprocess.Popen([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
    except OSError:
        # Binary not found (or not executable) — restored error path that
        # was elided in this view.
        return False
    return exe
def get_exe_version(exe, args=['--version'],
                    version_re=None, unrecognized='present'):
    """ Returns the version of the specified executable,
    or False if the executable is not present """
    # NOTE(review): the argv line of this Popen call and the surrounding
    # try/except OSError are elided in this view; code kept byte-identical.
    out, _ = subprocess.Popen(
        stdout=subprocess.PIPE, stderr=subprocess.STDOUT).communicate()
    if isinstance(out, bytes):  # Python 2.x
        out = out.decode('ascii', 'ignore')
    # Delegate the actual parsing of the captured output.
    return detect_exe_version(out, version_re, unrecognized)
def detect_exe_version(output, version_re=None, unrecognized='present'):
    """Extract a version string from an executable's --version output.

    Returns the first capture group of *version_re* (default: the token
    following the word 'version'), or *unrecognized* when nothing matches.
    """
    assert isinstance(output, compat_str)
    if version_re is None:
        version_re = r'version\s+([-0-9._a-zA-Z]+)'
    m = re.search(version_re, output)
    # Restored the result branch that was elided in this view: without it
    # the function always returned None.
    if m:
        return m.group(1)
    else:
        return unrecognized
class PagedList(object):
    """Abstract base class for lazily paged result lists.

    Subclasses must implement getslice(start, end) returning a list of
    entries in that index window.
    """

    def getslice(self, start=0, end=None):
        # Explicit abstract hook: a clearer failure than AttributeError
        # when a subclass forgets to override it.
        raise NotImplementedError('This method must be implemented by subclasses')

    def __len__(self):
        # This is only useful for tests
        # (the `def __len__` header was elided in this view; restored).
        return len(self.getslice())
class OnDemandPagedList(PagedList):
    # Paged list that fetches pages lazily via pagefunc(pagenum).
    # NOTE(review): several interior lines of getslice() (the result-list
    # setup, the `continue`, the startv/endv assignment wrappers, the
    # `break`s and the final return) are elided in this view; code kept
    # byte-identical.
    def __init__(self, pagefunc, pagesize):
        self._pagefunc = pagefunc  # callable: pagenum -> iterable of entries
        self._pagesize = pagesize  # entries per page
    def getslice(self, start=0, end=None):
        # Walk pages starting from the one that contains index `start`.
        for pagenum in itertools.count(start // self._pagesize):
            firstid = pagenum * self._pagesize
            nextfirstid = pagenum * self._pagesize + self._pagesize
            # Page lies entirely before the requested window.
            if start >= nextfirstid:
            page_results = list(self._pagefunc(pagenum))
            # Offset of `start` within this page, when it falls inside it.
                start % self._pagesize
                if firstid <= start < nextfirstid
            # End offset within this page, when `end` falls inside it.
                ((end - 1) % self._pagesize) + 1
                if (end is not None and firstid <= end <= nextfirstid)
            if startv != 0 or endv is not None:
                page_results = page_results[startv:endv]
            res.extend(page_results)
            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            if len(page_results) + startv < self._pagesize:
            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == nextfirstid:
class InAdvancePagedList(PagedList):
    # Paged list where the total page count is known up front.
    # NOTE(review): a few interior lines of getslice() (result-list setup,
    # the end_page computation wrapper, the skip_elems guard and the loop
    # tail/return) are elided in this view; code kept byte-identical.
    def __init__(self, pagefunc, pagecount, pagesize):
        self._pagefunc = pagefunc    # callable: pagenum -> iterable of entries
        self._pagecount = pagecount  # total number of available pages
        self._pagesize = pagesize    # entries per page
    def getslice(self, start=0, end=None):
        # First page containing index `start`.
        start_page = start // self._pagesize
        # Last page to fetch (exclusive); capped by the known page count.
            self._pagecount if end is None else (end // self._pagesize + 1))
        # Leading entries of the first page that fall before `start`.
        skip_elems = start - start_page * self._pagesize
        # Remaining number of entries to emit, or None for "all".
        only_more = None if end is None else end - start
        for pagenum in range(start_page, end_page):
            page = list(self._pagefunc(pagenum))
                page = page[skip_elems:]
            if only_more is not None:
                if len(page) < only_more:
                    only_more -= len(page)
                    # NOTE(review): the `else:` branch separating this
                    # truncation from the decrement above is elided here.
                    page = page[:only_more]
def uppercase_escape(s):
    """Decode uppercase \\UXXXXXXXX escape sequences found in *s*.

    Only 8-hex-digit \\U escapes are rewritten; all other text (including
    lowercase \\u escapes) is left untouched.
    """
    unicode_escape = codecs.getdecoder('unicode_escape')
    # Restored the `return re.sub(` wrapper and trailing `s)` argument that
    # were elided in this view.
    return re.sub(
        r'\\U[0-9a-fA-F]{8}',
        lambda m: unicode_escape(m.group(0))[0],
        s)
def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986"""
    # Python 2's quote() cannot handle unicode input, so pre-encode there.
    needs_bytes = sys.version_info < (3, 0) and isinstance(s, compat_str)
    if needs_bytes:
        s = s.encode('utf-8')
    safe_chars = b"%/;:@&=+$,!~*'()?#[]"
    return compat_urllib_parse.quote(s, safe_chars)
def escape_url(url):
    """Escape URL as suggested by RFC 3986"""
    url_parsed = compat_urllib_parse_urlparse(url)
    # Each component is escaped separately so that structural separators
    # (/, ?, #) survive untouched; .geturl() reassembles the result (the
    # `.geturl()` call was elided in this view — restored).
    return url_parsed._replace(
        path=escape_rfc3986(url_parsed.path),
        params=escape_rfc3986(url_parsed.params),
        query=escape_rfc3986(url_parsed.query),
        fragment=escape_rfc3986(url_parsed.fragment)
    ).geturl()
# Feature probe: does struct.pack accept a text (unicode) format string?
# NOTE(review): the try/except/else scaffolding around this probe is elided
# in this view; code kept byte-identical.
struct.pack('!I', 0)
# In Python 2.6 (and some 2.7 versions), struct requires a bytes argument
def struct_pack(spec, *args):
    # Re-encode a text format spec to ASCII bytes before delegating.
    if isinstance(spec, compat_str):
        spec = spec.encode('ascii')
    return struct.pack(spec, *args)
def struct_unpack(spec, *args):
    # Same ASCII re-encoding shim for unpack.
    if isinstance(spec, compat_str):
        spec = spec.encode('ascii')
    return struct.unpack(spec, *args)
# Interpreters that accept str specs natively just alias the stdlib functions.
struct_pack = struct.pack
struct_unpack = struct.unpack
def read_batch_urls(batch_fd):
    """Read URLs from a batch file object, one per line.

    Decodes byte lines as UTF-8, strips a UTF-8 BOM, skips blank lines and
    comment lines starting with '#', ';' or ']'.  Closes *batch_fd*.
    """
    # Restored the inner `def fixup` header, the whitespace strip and the
    # two return statements that were elided in this view.
    def fixup(url):
        if not isinstance(url, compat_str):
            url = url.decode('utf-8', 'replace')
        BOM_UTF8 = '\xef\xbb\xbf'
        if url.startswith(BOM_UTF8):
            url = url[len(BOM_UTF8):]
        url = url.strip()
        if url.startswith(('#', ';', ']')):
            # Comment line — filtered out by the falsy value below.
            return False
        return url

    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]
def urlencode_postdata(*args, **kargs):
    """URL-encode POST data and return it as ASCII bytes."""
    encoded = compat_urllib_parse.urlencode(*args, **kargs)
    return encoded.encode('ascii')
# Compatibility shim: Element.iter() exists on Python >= 2.7; fall back to
# findall('.//*') on older interpreters.
# NOTE(review): the opening `try:` line is elided in this view; code kept
# byte-identical.
    etree_iter = xml.etree.ElementTree.Element.iter
except AttributeError:  # Python <=2.6
    etree_iter = lambda n: n.findall('.//*')
# NOTE(review): these lines are the interior of parse_xml(s); the enclosing
# `def` line and the trailing `return tree` are elided in this view, so the
# reference to `s` below is the (unseen) function parameter.  Code kept
# byte-identical.
class TreeBuilder(xml.etree.ElementTree.TreeBuilder):
    # Builder that swallows DOCTYPE declarations instead of raising.
    def doctype(self, name, pubid, system):
        pass  # Ignore doctypes

parser = xml.etree.ElementTree.XMLParser(target=TreeBuilder())
# The `parser` keyword of ET.XML() is only supported on Python >= 2.7.
kwargs = {'parser': parser} if sys.version_info >= (2, 7) else {}
tree = xml.etree.ElementTree.XML(s.encode('utf-8'), **kwargs)
# Fix up XML parser in Python 2.x
if sys.version_info < (3, 0):
    for n in etree_iter(tree):
        if n.text is not None:
            if not isinstance(n.text, compat_str):
                n.text = n.text.decode('utf-8')
def parse_age_limit(s):
    """Parse an age limit like '18' or '18+' into an int.

    Non-numeric values fall back to the file-level US_RATINGS mapping;
    returns None for None input or unknown values.
    """
    # Restored the None guard that was elided in this view — re.match(None)
    # would raise TypeError.
    if s is None:
        return None
    m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
    return int(m.group('age')) if m else US_RATINGS.get(s, None)
def strip_jsonp(code):
    """Strip a JSONP wrapper, returning the bare JSON payload.

    'callback({...});' -> '{...}'.  Input without a recognizable wrapper is
    returned unchanged.
    """
    # Restored the `return re.sub(` wrapper that was elided in this view.
    return re.sub(
        r'(?s)^[a-zA-Z0-9_]+\s*\(\s*(.*)\);?\s*?(?://[^\n]*)*$', r'\1', code)
def js_to_json(code):
    # Rewrite a JavaScript object literal into valid JSON (single-quoted
    # strings, bare identifiers, trailing commas).
    # NOTE(review): the inner fix-up callback's `def` line, its return
    # statements, the escape-mapping dict body and the closing of the big
    # verbose regex are elided in this view; code kept byte-identical.
    if v in ('true', 'false', 'null'):
    if v.startswith('"'):
    if v.startswith("'"):
    # Normalize backslash escapes and embedded double quotes via a mapping.
    v = re.sub(r"\\\\|\\'|\"", lambda m: {
    # Apply the fixer to every string literal / bare identifier in `code`.
    res = re.sub(r'''(?x)
    "(?:[^"\\]*(?:\\\\|\\['"nu]))*[^"\\]*"|
    '(?:[^'\\]*(?:\\\\|\\['"nu]))*[^'\\]*'|
    [a-zA-Z_][.a-zA-Z_0-9]*
    res = re.sub(r',(\s*[\]}])', lambda m: m.group(1), res)
def qualities(quality_ids):
    """ Get a numeric quality value out of a list of possible values """
    # Restored the inner closure and error fallback elided in this view:
    # unknown quality ids rank below every known one.
    def q(qid):
        try:
            return quality_ids.index(qid)
        except ValueError:
            return -1
    return q
# Default output filename template, e.g. "Some Title-abc123.mp4".
DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
def limit_length(s, length):
    """ Add ellipses to overly long strings """
    # Restored the guards and returns elided in this view.
    if s is None:
        return None
    ELLIPSES = '...'
    if len(s) > length:
        # Truncate so the result, including the ellipses, fits in `length`.
        return s[:length - len(ELLIPSES)] + ELLIPSES
    return s
def version_tuple(v):
    """Split a version string like '2015.01.01' on '.' and '-' into an int tuple."""
    parts = re.split(r'[-.]', v)
    return tuple(int(part) for part in parts)
def is_outdated_version(version, limit, assume_new=True):
    """Return True when *version* is strictly older than *limit*.

    For an empty or unparsable *version*, returns ``not assume_new``
    (i.e. by default an unknown version is treated as new).
    """
    # Restored the guard and try/except scaffolding elided in this view.
    if not version:
        return not assume_new
    try:
        return version_tuple(version) < version_tuple(limit)
    except ValueError:
        # Non-numeric component — fall back to the assumption.
        return not assume_new
def ytdl_is_updateable():
    """ Returns if youtube-dl can be updated with -U """
    from zipimport import zipimporter

    running_from_zip = isinstance(globals().get('__loader__'), zipimporter)
    running_frozen = hasattr(sys, 'frozen')
    return running_from_zip or running_frozen
def args_to_str(args):
    # Get a short string representation for a subprocess command
    quoted = [shlex_quote(arg) for arg in args]
    return ' '.join(quoted)
def mimetype2ext(mt):
    """Map a MIME type to a file extension.

    Uses the subtype (the part after '/'), with explicit overrides for
    subtypes that do not equal the conventional extension.
    """
    _, _, res = mt.rpartition('/')

    # Restored the `return { ... }.get(res, res)` structure elided in this
    # view.  NOTE(review): additional mapping entries may have been elided
    # along with it — only the visible one is reproduced here.
    return {
        'x-mp4-fragmented': 'mp4',
    }.get(res, res)
def urlhandle_detect_ext(url_handle):
    # Guess a file extension for a urllib response object.
    # NOTE(review): the try/if scaffolding between these lines is elided in
    # this view; code kept byte-identical.
    getheader = lambda h: url_handle.headers[h]
    except AttributeError:  # Python < 3
        getheader = url_handle.info().getheader
    # Prefer an explicit filename from Content-Disposition ...
    cd = getheader('Content-Disposition')
    m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
    e = determine_ext(m.group('filename'), default_ext=None)
    # ... otherwise fall back to mapping the Content-Type.
    return mimetype2ext(getheader('Content-Type'))
def age_restricted(content_limit, age_limit):
    """ Returns True iff the content should be blocked """
    # Restored the `return False` after the first guard, elided in this view.
    if age_limit is None:  # No limit set
        return False
    if content_limit is None:
        return False  # Content available for everyone
    # Blocked when the viewer's allowed age is below the content's limit.
    return age_limit < content_limit
def is_html(first_bytes):
    """ Detect whether a file contains HTML by examining its first bytes. """
    # Restored the list brackets, `break` and for/else fallback that were
    # elided in this view.  Longer BOMs must be checked before shorter
    # prefixes (utf-32-le before utf-16-le, both start with \xff\xfe).
    BOMS = [
        (b'\xef\xbb\xbf', 'utf-8'),
        (b'\x00\x00\xfe\xff', 'utf-32-be'),
        (b'\xff\xfe\x00\x00', 'utf-32-le'),
        (b'\xff\xfe', 'utf-16-le'),
        (b'\xfe\xff', 'utf-16-be'),
    ]
    for bom, enc in BOMS:
        if first_bytes.startswith(bom):
            s = first_bytes[len(bom):].decode(enc, 'replace')
            break
    else:
        # No BOM — assume UTF-8 with lenient decoding.
        s = first_bytes.decode('utf-8', 'replace')

    # Truthy match object when the text starts (after whitespace) with '<'.
    return re.match(r'^\s*<', s)
def determine_protocol(info_dict):
    # Classify the download protocol for an info dict: an explicit
    # 'protocol' key wins, otherwise it is inferred from the URL.
    # NOTE(review): the return statements of several branches and the m3u8/
    # f4m extension checks are elided in this view; code kept byte-identical.
    protocol = info_dict.get('protocol')
    if protocol is not None:
    url = info_dict['url']
    if url.startswith('rtmp'):
    elif url.startswith('mms'):
    elif url.startswith('rtsp'):
    ext = determine_ext(url)
    # Fall back to the URL scheme (http, https, ftp, ...).
    return compat_urllib_parse_urlparse(url).scheme
def render_table(header_row, data):
    """ Render a list of rows, each as a list of values """
    rows = [header_row] + data
    # Widest cell per column determines that column's field width.
    widths = [max(len(compat_str(cell)) for cell in column) for column in zip(*rows)]
    # Left-justify every column but the last (which is left ragged).
    specs = ('%-' + compat_str(width + 1) + 's' for width in widths[:-1])
    line_format = ' '.join(specs) + '%s'
    return '\n'.join(line_format % tuple(row) for row in rows)
def _match_one(filter_part, dct):
    # Evaluate a single --match-filter clause (e.g. "duration > 60" or
    # "!is_live") against dict `dct`.
    # NOTE(review): the operator-table bodies, several `if m:` headers, the
    # try/except around int() and the `raise ValueError(` lines are elided
    # in this view; code kept byte-identical.
    COMPARISON_OPERATORS = {
    operator_rex = re.compile(r'''(?x)\s*
    \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
    (?P<intval>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)|
    (?P<strval>(?![0-9.])[a-z0-9A-Z]*)
    ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
    m = operator_rex.search(filter_part)
    op = COMPARISON_OPERATORS[m.group('op')]
    if m.group('strval') is not None:
        # String comparisons only make sense for equality operators.
        if m.group('op') not in ('=', '!='):
            'Operator %s does not support string values!' % m.group('op'))
        comparison_value = m.group('strval')
        # Numeric operand: try a plain integer first ...
        comparison_value = int(m.group('intval'))
        # ... then filesize-style suffixes (e.g. 500K, 1.2MiB).
        comparison_value = parse_filesize(m.group('intval'))
        if comparison_value is None:
            comparison_value = parse_filesize(m.group('intval') + 'B')
        if comparison_value is None:
            'Invalid integer value %r in filter part %r' % (
                m.group('intval'), filter_part))
    actual_value = dct.get(m.group('key'))
    if actual_value is None:
        # A trailing '?' on the operator makes missing keys pass the filter.
        return m.group('none_inclusive')
    return op(actual_value, comparison_value)
    # (Elided here: the `UNARY_OPERATORS = {` table header.)
    '': lambda v: v is not None,
    '!': lambda v: v is None,
    operator_rex = re.compile(r'''(?x)\s*
    (?P<op>%s)\s*(?P<key>[a-z_]+)
    ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
    m = operator_rex.search(filter_part)
    op = UNARY_OPERATORS[m.group('op')]
    actual_value = dct.get(m.group('key'))
    return op(actual_value)
    raise ValueError('Invalid filter part %r' % filter_part)
def match_str(filter_str, dct):
    """ Filter a dictionary with a simple string syntax. Returns True (=passes filter) or false """
    # Clauses are '&'-separated; every clause must accept the dict.
    # (Restored the `return all(` wrapper elided in this view.)
    return all(
        _match_one(filter_part, dct) for filter_part in filter_str.split('&'))
def match_filter_func(filter_str):
    """Build a --match-filter callback from *filter_str*.

    The returned function yields None when a video passes the filter, or a
    human-readable skip message otherwise.
    """
    def _match_func(info_dict):
        if match_str(filter_str, info_dict):
            # Restored the `return None` / `else:` / `return _match_func`
            # lines elided in this view.
            return None
        else:
            video_title = info_dict.get('title', info_dict.get('id', 'video'))
            return '%s does not pass filter %s, skipping ..' % (video_title, filter_str)
    return _match_func
def parse_dfxp_time_expr(time_expr):
    """Parse a DFXP/TTML time expression into seconds.

    Supports plain offsets ('12', '12.5s') and clock time ('HH:MM:SS.sss').
    Returns None for empty input or unrecognized formats.
    """
    # Restored the empty-input guard and the `if mobj:` headers elided in
    # this view.
    if not time_expr:
        return

    mobj = re.match(r'^(?P<time_offset>\d+(?:\.\d+)?)s?$', time_expr)
    if mobj:
        return float(mobj.group('time_offset'))

    mobj = re.match(r'^(\d+):(\d\d):(\d\d(?:\.\d+)?)$', time_expr)
    if mobj:
        return 3600 * int(mobj.group(1)) + 60 * int(mobj.group(2)) + float(mobj.group(3))
def format_srt_time(seconds):
    """Format a duration in seconds as an SRT timestamp: HH:MM:SS,mmm."""
    total_minutes, secs = divmod(seconds, 60)
    hours, minutes = divmod(total_minutes, 60)
    # Fractional part of the seconds, expressed in milliseconds.
    millisecs = (secs - int(secs)) * 1000
    return '%02d:%02d:%02d,%03d' % (hours, minutes, secs, millisecs)
def dfxp2srt(dfxp_data):
    # Convert DFXP/TTML subtitle XML into SRT-formatted text.
    # NOTE(review): several interior lines (the child-iteration loop header,
    # the output-list setup, `return out` in parse_node and the final join)
    # are elided in this view; code kept byte-identical.
    _x = functools.partial(xpath_with_ns, ns_map={'ttml': 'http://www.w3.org/ns/ttml'})
    def parse_node(node):
        # Flatten one node into plain text: <br/> becomes a newline,
        # <span> recurses, anything else is serialized verbatim.
        str_or_empty = functools.partial(str_or_none, default='')
        out = str_or_empty(node.text)
        if child.tag == _x('ttml:br'):
            out += '\n' + str_or_empty(child.tail)
        elif child.tag == _x('ttml:span'):
            out += str_or_empty(parse_node(child))
            out += str_or_empty(xml.etree.ElementTree.tostring(child))
    dfxp = xml.etree.ElementTree.fromstring(dfxp_data.encode('utf-8'))
    paras = dfxp.findall(_x('.//ttml:p'))
    # Each <p> element becomes one numbered SRT cue.
    for para, index in zip(paras, itertools.count(1)):
        out.append('%d\n%s --> %s\n%s\n\n' % (
        format_srt_time(parse_dfxp_time_expr(para.attrib.get('begin'))),
        format_srt_time(parse_dfxp_time_expr(para.attrib.get('end'))),
class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
    # ProxyHandler variant that honours a per-request 'Ytdl-request-proxy'
    # header in addition to the global proxy mapping.
    # NOTE(review): one interior line of proxy_open (presumably assigning
    # the request-level proxy — confirm against upstream) is elided in this
    # view; code kept byte-identical.
    def __init__(self, proxies=None):
        # Set default handlers
        for type in ('http', 'https'):
            # type/meth are bound as lambda defaults so each scheme gets its
            # own values (avoids the late-binding closure pitfall).
            setattr(self, '%s_open' % type,
                lambda r, proxy='__noproxy__', type=type, meth=self.proxy_open:
                meth(r, proxy, type))
        return compat_urllib_request.ProxyHandler.__init__(self, proxies)

    def proxy_open(self, req, proxy, type):
        # Internal header carrying a per-request proxy override; removed so
        # it is never sent on the wire.
        req_proxy = req.headers.get('Ytdl-request-proxy')
        if req_proxy is not None:
            del req.headers['Ytdl-request-proxy']

        if proxy == '__noproxy__':
            return None  # No Proxy
        return compat_urllib_request.ProxyHandler.proxy_open(
            self, req, proxy, type)