X-Git-Url: http://git.cielonegro.org/gitweb.cgi?a=blobdiff_plain;f=youtube_dl%2Futils.py;h=c9702fd93f59862fa5bb261f38b5f7ccb6977116;hb=51fb4995a5242c0edca09167cf8c4b050cf5a186;hp=4532b737be1bb564097079a4baedef0c02932bb4;hpb=ed0291d1533600b21903cb98f070791a20e47433;p=youtube-dl.git diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 4532b737b..c9702fd93 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -14,8 +14,8 @@ import email.utils import errno import functools import gzip -import itertools import io +import itertools import json import locale import math @@ -24,9 +24,8 @@ import os import pipes import platform import re -import ssl import socket -import struct +import ssl import subprocess import sys import tempfile @@ -53,9 +52,22 @@ from .compat import ( compat_urlparse, compat_xpath, shlex_quote, + struct_pack, +) + +from .socks import ( + ProxyType, + sockssocket, ) +def register_socks_protocols(): + # "Register" SOCKS protocols + for scheme in ('socks', 'socks4', 'socks4a', 'socks5'): + if scheme not in compat_urlparse.uses_netloc: + compat_urlparse.uses_netloc.append(scheme) + + # This is not clearly defined otherwise compiled_regex_type = type(re.compile('')) @@ -89,6 +101,11 @@ KNOWN_EXTENSIONS = ( 'wav', 'f4f', 'f4m', 'm3u8', 'smil') +# needed for sanitizing filenames in restricted mode +ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõöøùúûüýþÿ', + itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOUUUUYP', ['ss'], + 'aaaaaa', ['ae'], 'ceeeeiiiionoooooouuuuypy'))) + def preferredencoding(): """Get preferred encoding. @@ -251,9 +268,9 @@ def get_element_by_attribute(attribute, value, html): m = re.search(r'''(?xs) <([a-zA-Z0-9:._-]+) - (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*? + (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'))*? \s+%s=['"]?%s['"]? - (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*? + (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'))*? \s*> (?P.*?) @@ -365,6 +382,8 @@ def sanitize_filename(s, restricted=False, is_id=False): Set is_id if this is not an arbitrary string, but an ID that should be kept if possible """ def replace_insane(char): + if restricted and char in ACCENT_CHARS: + return ACCENT_CHARS[char] if char == '?' or ord(char) < 32 or ord(char) == 127: return '' elif char == '"': @@ -745,8 +764,15 @@ class YoutubeDLHandler(compat_urllib_request.HTTPHandler): self._params = params def http_open(self, req): + conn_class = compat_http_client.HTTPConnection + + socks_proxy = req.headers.get('Ytdl-socks-proxy') + if socks_proxy: + conn_class = make_socks_conn_class(conn_class, socks_proxy) + del req.headers['Ytdl-socks-proxy'] + return self.do_open(functools.partial( - _create_http_connection, self, compat_http_client.HTTPConnection, False), + _create_http_connection, self, conn_class, False), req) @staticmethod @@ -778,12 +804,7 @@ class YoutubeDLHandler(compat_urllib_request.HTTPHandler): # Substitute URL if any change after escaping if url != url_escaped: - req_type = HEADRequest if req.get_method() == 'HEAD' else compat_urllib_request.Request - new_req = req_type( - url_escaped, data=req.data, headers=req.headers, - origin_req_host=req.origin_req_host, unverifiable=req.unverifiable) - new_req.timeout = req.timeout - req = new_req + req = update_Request(req, url=url_escaped) for h, v in std_headers.items(): # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275 @@ -847,6 +868,43 @@ class YoutubeDLHandler(compat_urllib_request.HTTPHandler): https_response = http_response +def make_socks_conn_class(base_class, socks_proxy): + assert issubclass(base_class, ( + compat_http_client.HTTPConnection, compat_http_client.HTTPSConnection)) + + url_components = compat_urlparse.urlparse(socks_proxy) + if url_components.scheme.lower() == 'socks5': + socks_type = ProxyType.SOCKS5 + elif url_components.scheme.lower() in ('socks', 'socks4'): + socks_type = ProxyType.SOCKS4 + elif url_components.scheme.lower() == 'socks4a': + socks_type = ProxyType.SOCKS4A + + proxy_args = ( + socks_type, + url_components.hostname, url_components.port or 1080, + True, # Remote DNS + url_components.username, url_components.password + ) + + class SocksConnection(base_class): + def connect(self): + self.sock = sockssocket() + self.sock.setproxy(*proxy_args) + if type(self.timeout) in (int, float): + self.sock.settimeout(self.timeout) + self.sock.connect((self.host, self.port)) + + if isinstance(self, compat_http_client.HTTPSConnection): + if hasattr(self, '_context'): # Python > 2.6 + self.sock = self._context.wrap_socket( + self.sock, server_hostname=self.host) + else: + self.sock = ssl.wrap_socket(self.sock) + + return SocksConnection + + class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler): def __init__(self, params, https_conn_class=None, *args, **kwargs): compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs) @@ -855,12 +913,20 @@ class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler): def https_open(self, req): kwargs = {} + conn_class = self._https_conn_class + if hasattr(self, '_context'): # python > 2.6 kwargs['context'] = self._context if hasattr(self, '_check_hostname'): # python 3.x kwargs['check_hostname'] = self._check_hostname + + socks_proxy = req.headers.get('Ytdl-socks-proxy') + if socks_proxy: + conn_class = make_socks_conn_class(conn_class, socks_proxy) + del req.headers['Ytdl-socks-proxy'] + return self.do_open(functools.partial( - _create_http_connection, self, self._https_conn_class, True), + _create_http_connection, self, conn_class, True), req, **kwargs) @@ -1545,44 +1611,46 @@ def parse_duration(s): s = s.strip() - m = re.match( - r'''(?ix)(?:P?T)? - (?: - (?P[0-9.]+)\s*(?:mins?\.?|minutes?)\s*| - (?P[0-9.]+)\s*(?:hours?)| - - \s*(?P[0-9]+)\s*(?:[:h]|hours?)\s*(?P[0-9]+)\s*(?:[:m]|mins?\.?|minutes?)\s*| - (?: + days, hours, mins, secs, ms = [None] * 5 + m = re.match(r'(?:(?:(?:(?P[0-9]+):)?(?P[0-9]+):)?(?P[0-9]+):)?(?P[0-9]+)(?P\.[0-9]+)?$', s) + if m: + days, hours, mins, secs, ms = m.groups() + else: + m = re.match( + r'''(?ix)(?:P?T)? (?: - (?:(?P[0-9]+)\s*(?:[:d]|days?)\s*)? - (?P[0-9]+)\s*(?:[:h]|hours?)\s* + (?P[0-9]+)\s*d(?:ays?)?\s* )? - (?P[0-9]+)\s*(?:[:m]|mins?|minutes?)\s* - )? - (?P[0-9]+)(?P\.[0-9]+)?\s*(?:s|secs?|seconds?)? - )$''', s) - if not m: - return None - res = 0 - if m.group('only_mins'): - return float_or_none(m.group('only_mins'), invscale=60) - if m.group('only_hours'): - return float_or_none(m.group('only_hours'), invscale=60 * 60) - if m.group('secs'): - res += int(m.group('secs')) - if m.group('mins_reversed'): - res += int(m.group('mins_reversed')) * 60 - if m.group('mins'): - res += int(m.group('mins')) * 60 - if m.group('hours'): - res += int(m.group('hours')) * 60 * 60 - if m.group('hours_reversed'): - res += int(m.group('hours_reversed')) * 60 * 60 - if m.group('days'): - res += int(m.group('days')) * 24 * 60 * 60 - if m.group('ms'): - res += float(m.group('ms')) - return res + (?: + (?P[0-9]+)\s*h(?:ours?)?\s* + )? + (?: + (?P[0-9]+)\s*m(?:in(?:ute)?s?)?\s* + )? + (?: + (?P[0-9]+)(?P\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s* + )?$''', s) + if m: + days, hours, mins, secs, ms = m.groups() + else: + m = re.match(r'(?i)(?:(?P[0-9.]+)\s*(?:hours?)|(?P[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)$', s) + if m: + hours, mins = m.groups() + else: + return None + + duration = 0 + if secs: + duration += float(secs) + if mins: + duration += float(mins) * 60 + if hours: + duration += float(hours) * 60 * 60 + if days: + duration += float(days) * 24 * 60 * 60 + if ms: + duration += float(ms) + return duration def prepend_extension(filename, ext, expected_real_ext=None): @@ -1757,24 +1825,6 @@ def escape_url(url): fragment=escape_rfc3986(url_parsed.fragment) ).geturl() -try: - struct.pack('!I', 0) -except TypeError: - # In Python 2.6 and 2.7.x < 2.7.7, struct requires a bytes argument - # See https://bugs.python.org/issue19099 - def struct_pack(spec, *args): - if isinstance(spec, compat_str): - spec = spec.encode('ascii') - return struct.pack(spec, *args) - - def struct_unpack(spec, *args): - if isinstance(spec, compat_str): - spec = spec.encode('ascii') - return struct.unpack(spec, *args) -else: - struct_pack = struct.pack - struct_unpack = struct.unpack - def read_batch_urls(batch_fd): def fixup(url): @@ -1797,6 +1847,8 @@ def urlencode_postdata(*args, **kargs): def update_url_query(url, query): + if not query: + return url parsed_url = compat_urlparse.urlparse(url) qs = compat_parse_qs(parsed_url.query) qs.update(query) @@ -1936,6 +1988,9 @@ def error_to_compat_str(err): def mimetype2ext(mt): + if mt is None: + return None + ext = { 'audio/mp4': 'm4a', }.get(mt) @@ -2136,6 +2191,7 @@ def dfxp2srt(dfxp_data): _x = functools.partial(xpath_with_ns, ns_map={ 'ttml': 'http://www.w3.org/ns/ttml', 'ttaf1': 'http://www.w3.org/2006/10/ttaf1', + 'ttaf1_0604': 'http://www.w3.org/2006/04/ttaf1', }) class TTMLPElementParser(object): @@ -2162,7 +2218,7 @@ def dfxp2srt(dfxp_data): dfxp = compat_etree_fromstring(dfxp_data.encode('utf-8')) out = [] - paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall(_x('.//ttaf1:p')) or dfxp.findall('.//p') + paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall(_x('.//ttaf1:p')) or dfxp.findall(_x('.//ttaf1_0604:p')) or dfxp.findall('.//p') if not paras: raise ValueError('Invalid dfxp/TTML subtitle') @@ -2691,6 +2747,10 @@ class PerRequestProxyHandler(compat_urllib_request.ProxyHandler): if proxy == '__noproxy__': return None # No Proxy + if compat_urlparse.urlparse(proxy).scheme.lower() in ('socks', 'socks4', 'socks4a', 'socks5'): + req.add_header('Ytdl-socks-proxy', proxy) + # youtube-dl's http/https handlers do wrapping the socket with socks + return None return compat_urllib_request.ProxyHandler.proxy_open( self, req, proxy, type)