X-Git-Url: http://git.cielonegro.org/gitweb.cgi?a=blobdiff_plain;f=youtube_dl%2Futils.py;h=a5df62bf81ce0336ff4d641f8bcc93a1c27ff1c8;hb=0f00efed4c06fefcd4da7294cb3c92bccf081eaa;hp=4ace22c2fc232ecacef491fd6ac6ecbd0ca3df01;hpb=33d94a6c999ae784be7529aaaea42adadeab0c27;p=youtube-dl.git diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 4ace22c2f..a5df62bf8 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -2,21 +2,64 @@ # -*- coding: utf-8 -*- import gzip -import htmlentitydefs -import HTMLParser +import io import locale import os import re import sys import zlib -import urllib2 import email.utils import json try: - import cStringIO as StringIO -except ImportError: - import StringIO + import urllib.request as compat_urllib_request +except ImportError: # Python 2 + import urllib2 as compat_urllib_request + +try: + import urllib.error as compat_urllib_error +except ImportError: # Python 2 + import urllib2 as compat_urllib_error + +try: + import urllib.parse as compat_urllib_parse +except ImportError: # Python 2 + import urllib as compat_urllib_parse + +try: + import http.cookiejar as compat_cookiejar +except ImportError: # Python 2 + import cookielib as compat_cookiejar + +try: + import html.entities as compat_html_entities +except ImportError: # Python 2 + import htmlentitydefs as compat_html_entities + +try: + import html.parser as compat_html_parser +except ImportError: # Python 2 + import HTMLParser as compat_html_parser + +try: + import http.client as compat_http_client +except ImportError: # Python 2 + import httplib as compat_http_client + +try: + from urllib.parse import parse_qs as compat_parse_qs +except ImportError: # Python 2 + from urlparse import parse_qs as compat_parse_qs + +try: + compat_str = unicode # Python 2 +except NameError: + compat_str = str + +try: + compat_chr = unichr # Python 2 +except NameError: + compat_chr = chr std_headers = { 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0', @@ -25,31 +68,30 @@ std_headers = { 'Accept-Encoding': 'gzip, deflate', 'Accept-Language': 'en-us,en;q=0.5', } - -try: - compat_str = unicode # Python 2 -except NameError: - compat_str = str - def preferredencoding(): """Get preferred encoding. Returns the best encoding scheme for the system, based on locale.getpreferredencoding() and some further tweaks. """ - def yield_preferredencoding(): - try: - pref = locale.getpreferredencoding() - u'TEST'.encode(pref) - except: - pref = 'UTF-8' - while True: - yield pref - return yield_preferredencoding().next() + try: + pref = locale.getpreferredencoding() + u'TEST'.encode(pref) + except: + pref = 'UTF-8' + + return pref +if sys.version_info < (3,0): + def compat_print(s): + print(s.encode(preferredencoding(), 'xmlcharrefreplace')) +else: + def compat_print(s): + assert type(s) == type(u'') + print(s) def htmlentity_transform(matchobj): - """Transforms an HTML entity to a Unicode character. + """Transforms an HTML entity to a character. This function receives a match object and is intended to be used with the re.sub() function. @@ -57,11 +99,10 @@ def htmlentity_transform(matchobj): entity = matchobj.group(1) # Known non-numeric HTML entity - if entity in htmlentitydefs.name2codepoint: - return unichr(htmlentitydefs.name2codepoint[entity]) + if entity in compat_html_entities.name2codepoint: + return compat_chr(compat_html_entities.name2codepoint[entity]) - # Unicode character - mobj = re.match(ur'(?u)#(x?\d+)', entity) + mobj = re.match(u'(?u)#(x?\\d+)', entity) if mobj is not None: numstr = mobj.group(1) if numstr.startswith(u'x'): @@ -69,13 +110,13 @@ def htmlentity_transform(matchobj): numstr = u'0%s' % numstr else: base = 10 - return unichr(long(numstr, base)) + return compat_chr(int(numstr, base)) # Unknown entity in name, return its literal representation return (u'&%s;' % entity) -HTMLParser.locatestarttagend = re.compile(r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re.VERBOSE) # backport bugfix -class IDParser(HTMLParser.HTMLParser): +compat_html_parser.locatestarttagend = re.compile(r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re.VERBOSE) # backport bugfix +class IDParser(compat_html_parser.HTMLParser): """Modified HTMLParser that isolates a tag with the specified id""" def __init__(self, id): self.id = id @@ -85,11 +126,11 @@ class IDParser(HTMLParser.HTMLParser): self.html = None self.watch_startpos = False self.error_count = 0 - HTMLParser.HTMLParser.__init__(self) + compat_html_parser.HTMLParser.__init__(self) def error(self, message): if self.error_count > 10 or self.started: - raise HTMLParser.HTMLParseError(message, self.getpos()) + raise compat_html_parser.HTMLParseError(message, self.getpos()) self.rawdata = '\n'.join(self.html.split('\n')[self.getpos()[0]:]) # skip one line self.error_count += 1 self.goahead(1) @@ -128,8 +169,10 @@ class IDParser(HTMLParser.HTMLParser): handle_decl = handle_pi = unknown_decl = find_startpos def get_result(self): - if self.result == None: return None - if len(self.result) != 3: return None + if self.result is None: + return None + if len(self.result) != 3: + return None lines = self.html.split('\n') lines = lines[self.result[1][0]-1:self.result[2][0]] lines[0] = lines[0][self.result[1][1]:] @@ -143,7 +186,7 @@ def get_element_by_id(id, html): parser = IDParser(id) try: parser.loads(html) - except HTMLParser.HTMLParseError: + except compat_html_parser.HTMLParseError: pass return parser.get_result() @@ -178,9 +221,9 @@ def sanitize_open(filename, open_mode): return (sys.stdout, filename) stream = open(encodeFilename(filename), open_mode) return (stream, filename) - except (IOError, OSError), err: + except (IOError, OSError) as err: # In case of error, try to remove win32 forbidden chars - filename = re.sub(ur'[/<>:"\|\?\*]', u'#', filename) + filename = re.sub(u'[/<>:"\\|\\\\?\\*]', u'#', filename) # An exception here should be caught in the caller stream = open(encodeFilename(filename), open_mode) @@ -208,7 +251,7 @@ def sanitize_filename(s, restricted=False): return '_-' if restricted else ' -' elif char in '\\/|*<>': return '_' - if restricted and (char in '&\'' or char.isspace()): + if restricted and (char in '!&\'' or char.isspace()): return '_' if restricted and ord(char) > 127: return '_' @@ -235,20 +278,24 @@ def orderedSet(iterable): def unescapeHTML(s): """ - @param s a string (of type unicode) + @param s a string """ assert type(s) == type(u'') - result = re.sub(ur'(?u)&(.+?);', htmlentity_transform, s) + result = re.sub(u'(?u)&(.+?);', htmlentity_transform, s) return result def encodeFilename(s): """ - @param s The name of the file (of type unicode) + @param s The name of the file """ assert type(s) == type(u'') + # Python 3 has a Unicode API + if sys.version_info >= (3, 0): + return s + if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5: # Pass u'' directly to use Unicode APIs on Windows 2000 and up # (Detecting Windows NT 4 is tricky because 'major >= 4' would @@ -316,12 +363,12 @@ class ContentTooShortError(Exception): class Trouble(Exception): """Trouble helper exception - + This is an exception to be handled with FileDownloader.trouble """ -class YoutubeDLHandler(urllib2.HTTPHandler): +class YoutubeDLHandler(compat_urllib_request.HTTPHandler): """Handler for HTTP requests and responses. This class, when installed with an OpenerDirector, automatically adds @@ -348,9 +395,9 @@ class YoutubeDLHandler(urllib2.HTTPHandler): @staticmethod def addinfourl_wrapper(stream, headers, url, code): - if hasattr(urllib2.addinfourl, 'getcode'): - return urllib2.addinfourl(stream, headers, url, code) - ret = urllib2.addinfourl(stream, headers, url) + if hasattr(compat_urllib_request.addinfourl, 'getcode'): + return compat_urllib_request.addinfourl(stream, headers, url, code) + ret = compat_urllib_request.addinfourl(stream, headers, url) ret.code = code return ret @@ -369,12 +416,12 @@ class YoutubeDLHandler(urllib2.HTTPHandler): old_resp = resp # gzip if resp.headers.get('Content-encoding', '') == 'gzip': - gz = gzip.GzipFile(fileobj=StringIO.StringIO(resp.read()), mode='r') + gz = gzip.GzipFile(fileobj=io.BytesIO(resp.read()), mode='r') resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code) resp.msg = old_resp.msg # deflate if resp.headers.get('Content-encoding', '') == 'deflate': - gz = StringIO.StringIO(self.deflate(resp.read())) + gz = io.BytesIO(self.deflate(resp.read())) resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code) resp.msg = old_resp.msg return resp