X-Git-Url: http://git.cielonegro.org/gitweb.cgi?a=blobdiff_plain;ds=sidebyside;f=youtube-dl;h=2cddafcf54bb9bb2478928d9675080394cc57e0d;hb=af6a92f4c954d8f0e6628076f751d6ac9935a6d6;hp=bf6973480940cb5d3aaacbb571b2b8d92106b24f;hpb=79e75f66c88ee1de6018e518bcc1a33cd279f697;p=youtube-dl.git diff --git a/youtube-dl b/youtube-dl index bf6973480..2cddafcf5 100755 --- a/youtube-dl +++ b/youtube-dl @@ -18,7 +18,7 @@ import time import urllib import urllib2 -std_headers = { +std_headers = { 'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.5) Gecko/2008120122 Firefox/3.0.5', 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7', 'Accept': 'text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5', @@ -435,6 +435,29 @@ class YoutubeIE(InfoExtractor): def suitable(url): return (re.match(YoutubeIE._VALID_URL, url) is not None) + @staticmethod + def htmlentity_transform(matchobj): + """Transforms an HTML entity to a Unicode character.""" + entity = matchobj.group(1) + + # Known non-numeric HTML entity + if entity in htmlentitydefs.name2codepoint: + return unichr(htmlentitydefs.name2codepoint[entity]) + + # Unicode character + mobj = re.match(ur'(?u)#(x?\d+)', entity) + if mobj is not None: + numstr = mobj.group(1) + if numstr.startswith(u'x'): + base = 16 + numstr = u'0%s' % numstr + else: + base = 10 + return unichr(long(numstr, base)) + + # Unknown entity in name, return its literal representation + return (u'&%s;' % entity) + def report_lang(self): """Report attempt to set language.""" self.to_stdout(u'[youtube] Setting language') @@ -458,7 +481,7 @@ class YoutubeIE(InfoExtractor): def report_video_url(self, video_id, video_real_url): """Report extracted video URL.""" self.to_stdout(u'[youtube] %s: URL: %s' % (video_id, video_real_url)) - + def _real_initialize(self): if self._downloader is None: return @@ -585,7 +608,7 @@ class YoutubeIE(InfoExtractor): self.to_stderr(u'ERROR: unable to extract video title') return [None] video_title = mobj.group(1).decode('utf-8') - video_title = re.sub(ur'(?u)&(.+?);', lambda x: unichr(htmlentitydefs.name2codepoint[x.group(1)]), video_title) + video_title = re.sub(ur'(?u)&(.+?);', self.htmlentity_transform, video_title) video_title = video_title.replace(os.sep, u'%') # simplified title @@ -729,7 +752,7 @@ class YoutubeSearchIE(InfoExtractor): _MORE_PAGES_INDICATOR = r'>Next' _youtube_ie = None - def __init__(self, youtube_ie, downloader=None): + def __init__(self, youtube_ie, downloader=None): InfoExtractor.__init__(self, downloader) self._youtube_ie = youtube_ie @@ -752,11 +775,11 @@ class YoutubeSearchIE(InfoExtractor): prefix, query = query.split(':') prefix = prefix[8:] - if prefix == '': + if prefix == '': return self._download_n_results(query, 1) - elif prefix == 'all': + elif prefix == 'all': return self._download_n_results(query, -1) - else: + else: try: n = int(prefix) if n <= 0: