X-Git-Url: http://git.cielonegro.org/gitweb.cgi?a=blobdiff_plain;f=youtube_dl%2FInfoExtractors.py;h=0fc39163ee2a74131ed84442c68f388a72fad0f1;hb=f36cd076850faf4b2859a168fcb740dfccb9eed6;hp=4314f14022b650cb965707ee5c90e07fbecd67c4;hpb=d11d05d07acdd11a93b02d750852dea4ae32be3b;p=youtube-dl.git diff --git a/youtube_dl/InfoExtractors.py b/youtube_dl/InfoExtractors.py index 4314f1402..0fc39163e 100644 --- a/youtube_dl/InfoExtractors.py +++ b/youtube_dl/InfoExtractors.py @@ -12,28 +12,14 @@ import time import urllib import urllib2 import email.utils +import xml.etree.ElementTree +from urlparse import parse_qs try: import cStringIO as StringIO except ImportError: import StringIO -# parse_qs was moved from the cgi module to the urlparse module recently. -try: - from urlparse import parse_qs -except ImportError: - from cgi import parse_qs - -try: - import lxml.etree -except ImportError: - pass # Handled below - -try: - import xml.etree.ElementTree -except ImportError: # Python<2.5: Not officially supported, but let it slip - warnings.warn('xml.etree.ElementTree support is missing. Consider upgrading to Python >= 2.5 if you get related errors.') - from utils import * @@ -53,7 +39,6 @@ class InfoExtractor(object): url: Final video URL. uploader: Nickname of the video uploader. title: Literal title. - stitle: Simplified title. ext: Video filename extension. format: Video format. player_url: SWF Player URL (may be None). @@ -117,8 +102,8 @@ class YoutubeIE(InfoExtractor): _NEXT_URL_RE = r'[\?&]next_url=([^&]+)' _NETRC_MACHINE = 'youtube' # Listed in order of quality - _available_formats = ['38', '37', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13'] - _available_formats_prefer_free = ['38', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13'] + _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13'] + _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13'] _video_extensions = { '13': '3gp', '17': 'mp4', @@ -129,6 +114,7 @@ class YoutubeIE(InfoExtractor): '43': 'webm', '44': 'webm', '45': 'webm', + '46': 'webm', } _video_dimensions = { '5': '240x400', @@ -144,6 +130,7 @@ class YoutubeIE(InfoExtractor): '43': '360x640', '44': '480x854', '45': '720x1280', + '46': '1080x1920', } IE_NAME = u'youtube' @@ -193,8 +180,8 @@ class YoutubeIE(InfoExtractor): end = start + float(dur) start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000) end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000) - caption = re.sub(ur'(?u)&(.+?);', htmlentity_transform, caption) - caption = re.sub(ur'(?u)&(.+?);', htmlentity_transform, caption) # double cycle, inentional + caption = unescapeHTML(caption) + caption = unescapeHTML(caption) # double cycle, inentional srt += str(n) + '\n' srt += start + ' --> ' + end + '\n' srt += caption + '\n\n' @@ -339,10 +326,6 @@ class YoutubeIE(InfoExtractor): return video_title = urllib.unquote_plus(video_info['title'][0]) video_title = video_title.decode('utf-8') - video_title = sanitize_title(video_title) - - # simplified title - simple_title = simplify_title(video_title) # thumbnail image if 'thumbnail_url' not in video_info: @@ -364,49 +347,39 @@ class YoutubeIE(InfoExtractor): pass # description - try: - lxml.etree - except NameError: - video_description = u'No description available.' - mobj = re.search(r'', video_webpage) - if mobj is not None: - video_description = mobj.group(1).decode('utf-8') - else: - html_parser = lxml.etree.HTMLParser(encoding='utf-8') - vwebpage_doc = lxml.etree.parse(StringIO.StringIO(video_webpage), html_parser) - video_description = u''.join(vwebpage_doc.xpath('id("eow-description")//text()')) - # TODO use another parser + video_description = get_element_by_id("eow-description", video_webpage.decode('utf8')) + if video_description: video_description = clean_html(video_description) + else: video_description = '' # closed captions video_subtitles = None if self._downloader.params.get('writesubtitles', False): - self.report_video_subtitles_download(video_id) - request = urllib2.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id) try: - srt_list = urllib2.urlopen(request).read() - except (urllib2.URLError, httplib.HTTPException, socket.error), err: - self._downloader.trouble(u'WARNING: unable to download video subtitles: %s' % str(err)) - else: + self.report_video_subtitles_download(video_id) + request = urllib2.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id) + try: + srt_list = urllib2.urlopen(request).read() + except (urllib2.URLError, httplib.HTTPException, socket.error), err: + raise Trouble(u'WARNING: unable to download video subtitles: %s' % str(err)) srt_lang_list = re.findall(r'lang_code="([\w\-]+)"', srt_list) - if srt_lang_list: - if self._downloader.params.get('subtitleslang', False): - srt_lang = self._downloader.params.get('subtitleslang') - elif 'en' in srt_lang_list: - srt_lang = 'en' - else: - srt_lang = srt_lang_list[0] - if not srt_lang in srt_lang_list: - self._downloader.trouble(u'WARNING: no closed captions found in the specified language') - else: - request = urllib2.Request('http://video.google.com/timedtext?hl=en&lang=%s&v=%s' % (srt_lang, video_id)) - try: - srt_xml = urllib2.urlopen(request).read() - except (urllib2.URLError, httplib.HTTPException, socket.error), err: - self._downloader.trouble(u'WARNING: unable to download video subtitles: %s' % str(err)) - else: - video_subtitles = self._closed_captions_xml_to_srt(srt_xml.decode('utf-8')) + if not srt_lang_list: + raise Trouble(u'WARNING: video has no closed captions') + if self._downloader.params.get('subtitleslang', False): + srt_lang = self._downloader.params.get('subtitleslang') + elif 'en' in srt_lang_list: + srt_lang = 'en' else: - self._downloader.trouble(u'WARNING: video has no closed captions') + srt_lang = srt_lang_list[0] + if not srt_lang in srt_lang_list: + raise Trouble(u'WARNING: no closed captions found in the specified language') + request = urllib2.Request('http://video.google.com/timedtext?hl=en&lang=%s&v=%s' % (srt_lang, video_id)) + try: + srt_xml = urllib2.urlopen(request).read() + except (urllib2.URLError, httplib.HTTPException, socket.error), err: + raise Trouble(u'WARNING: unable to download video subtitles: %s' % str(err)) + video_subtitles = self._closed_captions_xml_to_srt(srt_xml.decode('utf-8')) + except Trouble as trouble: + self._downloader.trouble(trouble[0]) # token video_token = urllib.unquote_plus(video_info['token'][0]) @@ -469,7 +442,6 @@ class YoutubeIE(InfoExtractor): 'uploader': video_uploader.decode('utf-8'), 'upload_date': upload_date, 'title': video_title, - 'stitle': simple_title, 'ext': video_extension.decode('utf-8'), 'format': (format_param is None and u'NA' or format_param.decode('utf-8')), 'thumbnail': video_thumbnail.decode('utf-8'), @@ -545,8 +517,6 @@ class MetacafeIE(InfoExtractor): self._downloader.download(['http://www.youtube.com/watch?v=%s' % mobj2.group(1)]) return - simple_title = mobj.group(2).decode('utf-8') - # Retrieve video webpage to extract further information request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id) try: @@ -592,7 +562,6 @@ class MetacafeIE(InfoExtractor): self._downloader.trouble(u'ERROR: unable to extract title') return video_title = mobj.group(1).decode('utf-8') - video_title = sanitize_title(video_title) mobj = re.search(r'(?ms)By:\s*(.+?)<', webpage) if mobj is None: @@ -606,7 +575,6 @@ class MetacafeIE(InfoExtractor): 'uploader': video_uploader.decode('utf-8'), 'upload_date': u'NA', 'title': video_title, - 'stitle': simple_title, 'ext': video_extension.decode('utf-8'), 'format': u'NA', 'player_url': None, @@ -673,8 +641,6 @@ class DailymotionIE(InfoExtractor): self._downloader.trouble(u'ERROR: unable to extract title') return video_title = unescapeHTML(mobj.group('title').decode('utf-8')) - video_title = sanitize_title(video_title) - simple_title = simplify_title(video_title) mobj = re.search(r'(?im)[^<]+?]+?>([^<]+?)', webpage) if mobj is None: @@ -688,7 +654,6 @@ class DailymotionIE(InfoExtractor): 'uploader': video_uploader.decode('utf-8'), 'upload_date': u'NA', 'title': video_title, - 'stitle': simple_title, 'ext': video_extension.decode('utf-8'), 'format': u'NA', 'player_url': None, @@ -752,8 +717,6 @@ class GoogleIE(InfoExtractor): self._downloader.trouble(u'ERROR: unable to extract title') return video_title = mobj.group(1).decode('utf-8') - video_title = sanitize_title(video_title) - simple_title = simplify_title(video_title) # Extract video description mobj = re.search(r'([^<]*)', webpage) @@ -786,7 +749,6 @@ class GoogleIE(InfoExtractor): 'uploader': u'NA', 'upload_date': u'NA', 'title': video_title, - 'stitle': simple_title, 'ext': video_extension.decode('utf-8'), 'format': u'NA', 'player_url': None, @@ -845,8 +807,6 @@ class PhotobucketIE(InfoExtractor): self._downloader.trouble(u'ERROR: unable to extract title') return video_title = mobj.group(1).decode('utf-8') - video_title = sanitize_title(video_title) - simple_title = simplify_title(video_title) video_uploader = mobj.group(2).decode('utf-8') @@ -856,7 +816,6 @@ class PhotobucketIE(InfoExtractor): 'uploader': video_uploader, 'upload_date': u'NA', 'title': video_title, - 'stitle': simple_title, 'ext': video_extension.decode('utf-8'), 'format': u'NA', 'player_url': None, @@ -934,7 +893,6 @@ class YahooIE(InfoExtractor): self._downloader.trouble(u'ERROR: unable to extract video title') return video_title = mobj.group(1).decode('utf-8') - simple_title = simplify_title(video_title) mobj = re.search(r'

(.*)

', webpage) if mobj is None: @@ -992,7 +950,7 @@ class YahooIE(InfoExtractor): self._downloader.trouble(u'ERROR: Unable to extract media URL') return video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8') - video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url) + video_url = unescapeHTML(video_url) return [{ 'id': video_id.decode('utf-8'), @@ -1000,7 +958,6 @@ class YahooIE(InfoExtractor): 'uploader': video_uploader, 'upload_date': u'NA', 'title': video_title, - 'stitle': simple_title, 'ext': video_extension.decode('utf-8'), 'thumbnail': video_thumbnail.decode('utf-8'), 'description': video_description, @@ -1060,7 +1017,6 @@ class VimeoIE(InfoExtractor): # Extract title video_title = config["video"]["title"] - simple_title = simplify_title(video_title) # Extract uploader video_uploader = config["video"]["owner"]["name"] @@ -1069,18 +1025,9 @@ class VimeoIE(InfoExtractor): video_thumbnail = config["video"]["thumbnail"] # Extract video description - try: - lxml.etree - except NameError: - video_description = u'No description available.' - mobj = re.search(r'', webpage, re.MULTILINE) - if mobj is not None: - video_description = mobj.group(1) - else: - html_parser = lxml.etree.HTMLParser() - vwebpage_doc = lxml.etree.parse(StringIO.StringIO(webpage), html_parser) - video_description = u''.join(vwebpage_doc.xpath('id("description")//text()')).strip() - # TODO use another parser + video_description = get_element_by_id("description", webpage.decode('utf8')) + if video_description: video_description = clean_html(video_description) + else: video_description = '' # Extract upload date video_upload_date = u'NA' @@ -1115,7 +1062,6 @@ class VimeoIE(InfoExtractor): 'uploader': video_uploader, 'upload_date': video_upload_date, 'title': video_title, - 'stitle': simple_title, 'ext': video_extension, 'thumbnail': video_thumbnail, 'description': video_description, @@ -1250,8 +1196,6 @@ class GenericIE(InfoExtractor): self._downloader.trouble(u'ERROR: unable to extract title') return video_title = mobj.group(1).decode('utf-8') - video_title = sanitize_title(video_title) - simple_title = simplify_title(video_title) # video uploader is domain name mobj = re.match(r'(?:https?://)?([^/]*)/.*', url) @@ -1266,7 +1210,6 @@ class GenericIE(InfoExtractor): 'uploader': video_uploader, 'upload_date': u'NA', 'title': video_title, - 'stitle': simple_title, 'ext': video_extension.decode('utf-8'), 'format': u'NA', 'player_url': None, @@ -1731,7 +1674,6 @@ class DepositFilesIE(InfoExtractor): 'uploader': u'NA', 'upload_date': u'NA', 'title': file_title, - 'stitle': file_title, 'ext': file_extension.decode('utf-8'), 'format': u'NA', 'player_url': None, @@ -1876,9 +1818,6 @@ class FacebookIE(InfoExtractor): return video_title = video_info['title'] video_title = video_title.decode('utf-8') - video_title = sanitize_title(video_title) - - simple_title = simplify_title(video_title) # thumbnail image if 'thumbnail' not in video_info: @@ -1939,7 +1878,6 @@ class FacebookIE(InfoExtractor): 'uploader': video_uploader.decode('utf-8'), 'upload_date': upload_date, 'title': video_title, - 'stitle': simple_title, 'ext': video_extension.decode('utf-8'), 'format': (format_param is None and u'NA' or format_param.decode('utf-8')), 'thumbnail': video_thumbnail.decode('utf-8'), @@ -1989,7 +1927,6 @@ class BlipTVIE(InfoExtractor): 'id': title, 'url': url, 'title': title, - 'stitle': simplify_title(title), 'ext': ext, 'urlhandle': urlh } @@ -2009,21 +1946,20 @@ class BlipTVIE(InfoExtractor): data = json_data['Post'] else: data = json_data - + upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d') video_url = data['media']['url'] umobj = re.match(self._URL_EXT, video_url) if umobj is None: raise ValueError('Can not determine filename extension') ext = umobj.group(1) - + info = { 'id': data['item_id'], 'url': video_url, 'uploader': data['display_name'], 'upload_date': upload_date, 'title': data['title'], - 'stitle': simplify_title(data['title']), 'ext': ext, 'format': data['media']['mimeType'], 'thumbnail': data['thumbnailUrl'], @@ -2085,9 +2021,6 @@ class MyVideoIE(InfoExtractor): return video_title = mobj.group(1) - video_title = sanitize_title(video_title) - - simple_title = simplify_title(video_title) return [{ 'id': video_id, @@ -2095,7 +2028,6 @@ class MyVideoIE(InfoExtractor): 'uploader': u'NA', 'upload_date': u'NA', 'title': video_title, - 'stitle': simple_title, 'ext': u'flv', 'format': u'NA', 'player_url': None, @@ -2109,7 +2041,7 @@ class ComedyCentralIE(InfoExtractor): def report_extraction(self, episode_id): self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id) - + def report_config_download(self, episode_id): self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id) @@ -2222,7 +2154,6 @@ class ComedyCentralIE(InfoExtractor): 'uploader': showId, 'upload_date': officialDate, 'title': effTitle, - 'stitle': simplify_title(effTitle), 'ext': 'mp4', 'format': format, 'thumbnail': None, @@ -2248,8 +2179,6 @@ class EscapistIE(InfoExtractor): self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName) def _real_extract(self, url): - htmlParser = HTMLParser.HTMLParser() - mobj = re.match(self._VALID_URL, url) if mobj is None: self._downloader.trouble(u'ERROR: invalid URL: %s' % url) @@ -2259,17 +2188,18 @@ class EscapistIE(InfoExtractor): self.report_extraction(showName) try: - webPage = urllib2.urlopen(url).read() + webPageBytes = urllib2.urlopen(url).read() except (urllib2.URLError, httplib.HTTPException, socket.error), err: self._downloader.trouble(u'ERROR: unable to download webpage: ' + unicode(err)) return + webPage = webPageBytes.decode('utf-8') descMatch = re.search('([^<]+)', coursepage) if m: @@ -2822,8 +2735,6 @@ class StanfordOpenClassroomIE(InfoExtractor): return results else: # Root page - unescapeHTML = HTMLParser.HTMLParser().unescape - info = { 'id': 'Stanford OpenClassroom', 'type': 'playlist', @@ -2838,7 +2749,6 @@ class StanfordOpenClassroomIE(InfoExtractor): return info['title'] = info['id'] - info['stitle'] = simplify_title(info['title']) links = orderedSet(re.findall('', rootpage)) info['list'] = [ @@ -2937,7 +2847,6 @@ class MTVIE(InfoExtractor): 'url': video_url, 'uploader': performer, 'title': video_title, - 'stitle': simplify_title(video_title), 'ext': ext, 'format': format, }