X-Git-Url: http://git.cielonegro.org/gitweb.cgi?a=blobdiff_plain;ds=sidebyside;f=youtube_dl%2FInfoExtractors.py;h=ee8783ffd6bd3ad5ce72f4c93216e25f415ea07b;hb=2c288bda4235bed6927d88d9bf53ecaec18f7904;hp=01201a145404deadfb74066d47a7296bf3ad2b5b;hpb=3fe294e4ef96317c61398707ed65a9e3f1c281c4;p=youtube-dl.git diff --git a/youtube_dl/InfoExtractors.py b/youtube_dl/InfoExtractors.py index 01201a145..ee8783ffd 100644 --- a/youtube_dl/InfoExtractors.py +++ b/youtube_dl/InfoExtractors.py @@ -39,7 +39,6 @@ class InfoExtractor(object): url: Final video URL. uploader: Nickname of the video uploader. title: Literal title. - stitle: Simplified title. ext: Video filename extension. format: Video format. player_url: SWF Player URL (may be None). @@ -327,10 +326,6 @@ class YoutubeIE(InfoExtractor): return video_title = urllib.unquote_plus(video_info['title'][0]) video_title = video_title.decode('utf-8') - video_title = sanitize_title(video_title) - - # simplified title - simple_title = simplify_title(video_title) # thumbnail image if 'thumbnail_url' not in video_info: @@ -359,33 +354,32 @@ class YoutubeIE(InfoExtractor): # closed captions video_subtitles = None if self._downloader.params.get('writesubtitles', False): - self.report_video_subtitles_download(video_id) - request = urllib2.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id) try: - srt_list = urllib2.urlopen(request).read() - except (urllib2.URLError, httplib.HTTPException, socket.error), err: - self._downloader.trouble(u'WARNING: unable to download video subtitles: %s' % str(err)) - else: + self.report_video_subtitles_download(video_id) + request = urllib2.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id) + try: + srt_list = urllib2.urlopen(request).read() + except (urllib2.URLError, httplib.HTTPException, socket.error), err: + raise Trouble(u'WARNING: unable to download video subtitles: %s' % str(err)) srt_lang_list = re.findall(r'lang_code="([\w\-]+)"', srt_list) - if srt_lang_list: - if self._downloader.params.get('subtitleslang', False): - srt_lang = self._downloader.params.get('subtitleslang') - elif 'en' in srt_lang_list: - srt_lang = 'en' - else: - srt_lang = srt_lang_list[0] - if not srt_lang in srt_lang_list: - self._downloader.trouble(u'WARNING: no closed captions found in the specified language') - else: - request = urllib2.Request('http://video.google.com/timedtext?hl=en&lang=%s&v=%s' % (srt_lang, video_id)) - try: - srt_xml = urllib2.urlopen(request).read() - except (urllib2.URLError, httplib.HTTPException, socket.error), err: - self._downloader.trouble(u'WARNING: unable to download video subtitles: %s' % str(err)) - else: - video_subtitles = self._closed_captions_xml_to_srt(srt_xml.decode('utf-8')) + if not srt_lang_list: + raise Trouble(u'WARNING: video has no closed captions') + if self._downloader.params.get('subtitleslang', False): + srt_lang = self._downloader.params.get('subtitleslang') + elif 'en' in srt_lang_list: + srt_lang = 'en' else: - self._downloader.trouble(u'WARNING: video has no closed captions') + srt_lang = srt_lang_list[0] + if not srt_lang in srt_lang_list: + raise Trouble(u'WARNING: no closed captions found in the specified language') + request = urllib2.Request('http://video.google.com/timedtext?hl=en&lang=%s&v=%s' % (srt_lang, video_id)) + try: + srt_xml = urllib2.urlopen(request).read() + except (urllib2.URLError, httplib.HTTPException, socket.error), err: + raise Trouble(u'WARNING: unable to download video subtitles: %s' % str(err)) + video_subtitles = self._closed_captions_xml_to_srt(srt_xml.decode('utf-8')) + except Trouble as trouble: + self._downloader.trouble(trouble[0]) # token video_token = urllib.unquote_plus(video_info['token'][0]) @@ -448,7 +442,6 @@ class YoutubeIE(InfoExtractor): 'uploader': video_uploader.decode('utf-8'), 'upload_date': upload_date, 'title': video_title, - 'stitle': simple_title, 'ext': video_extension.decode('utf-8'), 'format': (format_param is None and u'NA' or format_param.decode('utf-8')), 'thumbnail': video_thumbnail.decode('utf-8'), @@ -524,8 +517,6 @@ class MetacafeIE(InfoExtractor): self._downloader.download(['http://www.youtube.com/watch?v=%s' % mobj2.group(1)]) return - simple_title = mobj.group(2).decode('utf-8') - # Retrieve video webpage to extract further information request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id) try: @@ -571,7 +562,6 @@ class MetacafeIE(InfoExtractor): self._downloader.trouble(u'ERROR: unable to extract title') return video_title = mobj.group(1).decode('utf-8') - video_title = sanitize_title(video_title) mobj = re.search(r'(?ms)By:\s*(.+?)<', webpage) if mobj is None: @@ -585,7 +575,6 @@ class MetacafeIE(InfoExtractor): 'uploader': video_uploader.decode('utf-8'), 'upload_date': u'NA', 'title': video_title, - 'stitle': simple_title, 'ext': video_extension.decode('utf-8'), 'format': u'NA', 'player_url': None, @@ -652,8 +641,6 @@ class DailymotionIE(InfoExtractor): self._downloader.trouble(u'ERROR: unable to extract title') return video_title = unescapeHTML(mobj.group('title').decode('utf-8')) - video_title = sanitize_title(video_title) - simple_title = simplify_title(video_title) mobj = re.search(r'(?im)[^<]+?]+?>([^<]+?)', webpage) if mobj is None: @@ -667,7 +654,6 @@ class DailymotionIE(InfoExtractor): 'uploader': video_uploader.decode('utf-8'), 'upload_date': u'NA', 'title': video_title, - 'stitle': simple_title, 'ext': video_extension.decode('utf-8'), 'format': u'NA', 'player_url': None, @@ -731,8 +717,6 @@ class GoogleIE(InfoExtractor): self._downloader.trouble(u'ERROR: unable to extract title') return video_title = mobj.group(1).decode('utf-8') - video_title = sanitize_title(video_title) - simple_title = simplify_title(video_title) # Extract video description mobj = re.search(r'([^<]*)', webpage) @@ -765,7 +749,6 @@ class GoogleIE(InfoExtractor): 'uploader': u'NA', 'upload_date': u'NA', 'title': video_title, - 'stitle': simple_title, 'ext': video_extension.decode('utf-8'), 'format': u'NA', 'player_url': None, @@ -824,8 +807,6 @@ class PhotobucketIE(InfoExtractor): self._downloader.trouble(u'ERROR: unable to extract title') return video_title = mobj.group(1).decode('utf-8') - video_title = sanitize_title(video_title) - simple_title = simplify_title(video_title) video_uploader = mobj.group(2).decode('utf-8') @@ -835,7 +816,6 @@ class PhotobucketIE(InfoExtractor): 'uploader': video_uploader, 'upload_date': u'NA', 'title': video_title, - 'stitle': simple_title, 'ext': video_extension.decode('utf-8'), 'format': u'NA', 'player_url': None, @@ -913,7 +893,6 @@ class YahooIE(InfoExtractor): self._downloader.trouble(u'ERROR: unable to extract video title') return video_title = mobj.group(1).decode('utf-8') - simple_title = simplify_title(video_title) mobj = re.search(r'

(.*)

', webpage) if mobj is None: @@ -979,7 +958,6 @@ class YahooIE(InfoExtractor): 'uploader': video_uploader, 'upload_date': u'NA', 'title': video_title, - 'stitle': simple_title, 'ext': video_extension.decode('utf-8'), 'thumbnail': video_thumbnail.decode('utf-8'), 'description': video_description, @@ -1039,7 +1017,6 @@ class VimeoIE(InfoExtractor): # Extract title video_title = config["video"]["title"] - simple_title = simplify_title(video_title) # Extract uploader video_uploader = config["video"]["owner"]["name"] @@ -1085,7 +1062,6 @@ class VimeoIE(InfoExtractor): 'uploader': video_uploader, 'upload_date': video_upload_date, 'title': video_title, - 'stitle': simple_title, 'ext': video_extension, 'thumbnail': video_thumbnail, 'description': video_description, @@ -1220,8 +1196,6 @@ class GenericIE(InfoExtractor): self._downloader.trouble(u'ERROR: unable to extract title') return video_title = mobj.group(1).decode('utf-8') - video_title = sanitize_title(video_title) - simple_title = simplify_title(video_title) # video uploader is domain name mobj = re.match(r'(?:https?://)?([^/]*)/.*', url) @@ -1236,7 +1210,6 @@ class GenericIE(InfoExtractor): 'uploader': video_uploader, 'upload_date': u'NA', 'title': video_title, - 'stitle': simple_title, 'ext': video_extension.decode('utf-8'), 'format': u'NA', 'player_url': None, @@ -1701,7 +1674,6 @@ class DepositFilesIE(InfoExtractor): 'uploader': u'NA', 'upload_date': u'NA', 'title': file_title, - 'stitle': file_title, 'ext': file_extension.decode('utf-8'), 'format': u'NA', 'player_url': None, @@ -1846,9 +1818,6 @@ class FacebookIE(InfoExtractor): return video_title = video_info['title'] video_title = video_title.decode('utf-8') - video_title = sanitize_title(video_title) - - simple_title = simplify_title(video_title) # thumbnail image if 'thumbnail' not in video_info: @@ -1909,7 +1878,6 @@ class FacebookIE(InfoExtractor): 'uploader': video_uploader.decode('utf-8'), 'upload_date': upload_date, 'title': video_title, - 'stitle': simple_title, 'ext': video_extension.decode('utf-8'), 'format': (format_param is None and u'NA' or format_param.decode('utf-8')), 'thumbnail': video_thumbnail.decode('utf-8'), @@ -1959,7 +1927,6 @@ class BlipTVIE(InfoExtractor): 'id': title, 'url': url, 'title': title, - 'stitle': simplify_title(title), 'ext': ext, 'urlhandle': urlh } @@ -1993,7 +1960,6 @@ class BlipTVIE(InfoExtractor): 'uploader': data['display_name'], 'upload_date': upload_date, 'title': data['title'], - 'stitle': simplify_title(data['title']), 'ext': ext, 'format': data['media']['mimeType'], 'thumbnail': data['thumbnailUrl'], @@ -2055,9 +2021,6 @@ class MyVideoIE(InfoExtractor): return video_title = mobj.group(1) - video_title = sanitize_title(video_title) - - simple_title = simplify_title(video_title) return [{ 'id': video_id, @@ -2065,7 +2028,6 @@ class MyVideoIE(InfoExtractor): 'uploader': u'NA', 'upload_date': u'NA', 'title': video_title, - 'stitle': simple_title, 'ext': u'flv', 'format': u'NA', 'player_url': None, @@ -2192,7 +2154,6 @@ class ComedyCentralIE(InfoExtractor): 'uploader': showId, 'upload_date': officialDate, 'title': effTitle, - 'stitle': simplify_title(effTitle), 'ext': 'mp4', 'format': format, 'thumbnail': None, @@ -2266,7 +2227,6 @@ class EscapistIE(InfoExtractor): 'uploader': showName, 'upload_date': None, 'title': showName, - 'stitle': simplify_title(showName), 'ext': 'flv', 'format': 'flv', 'thumbnail': imgUrl, @@ -2330,7 +2290,6 @@ class CollegeHumorIE(InfoExtractor): videoNode = mdoc.findall('./video')[0] info['description'] = videoNode.findall('./description')[0].text info['title'] = videoNode.findall('./caption')[0].text - info['stitle'] = simplify_title(info['title']) info['url'] = videoNode.findall('./file')[0].text info['thumbnail'] = videoNode.findall('./thumbnail')[0].text info['ext'] = info['url'].rpartition('.')[2] @@ -2404,7 +2363,6 @@ class XVideosIE(InfoExtractor): 'uploader': None, 'upload_date': None, 'title': video_title, - 'stitle': simplify_title(video_title), 'ext': 'flv', 'format': 'flv', 'thumbnail': video_thumbnail, @@ -2448,7 +2406,7 @@ class SoundcloudIE(InfoExtractor): uploader = mobj.group(1).decode('utf-8') # extract simple title (uploader + slug of song title) slug_title = mobj.group(2).decode('utf-8') - simple_title = uploader + '-' + slug_title + simple_title = uploader + u'-' + slug_title self.report_webpage('%s/%s' % (uploader, slug_title)) @@ -2470,7 +2428,9 @@ class SoundcloudIE(InfoExtractor): # extract unsimplified title mobj = re.search('"title":"(.*?)",', webpage) if mobj: - title = mobj.group(1) + title = mobj.group(1).decode('utf-8') + else: + title = simple_title # construct media url (with uid/token) mediaURL = "http://media.soundcloud.com/stream/%s?stream_token=%s" @@ -2499,8 +2459,7 @@ class SoundcloudIE(InfoExtractor): 'url': mediaURL, 'uploader': uploader.decode('utf-8'), 'upload_date': upload_date, - 'title': simple_title.decode('utf-8'), - 'stitle': simple_title.decode('utf-8'), + 'title': title, 'ext': u'mp3', 'format': u'NA', 'player_url': None, @@ -2570,7 +2529,6 @@ class InfoQIE(InfoExtractor): 'uploader': None, 'upload_date': None, 'title': video_title, - 'stitle': simplify_title(video_title), 'ext': extension, 'format': extension, # Extension is always(?) mp4, but seems to be flv 'thumbnail': None, @@ -2686,7 +2644,6 @@ class MixcloudIE(InfoExtractor): 'uploader': uploader.decode('utf-8'), 'upload_date': u'NA', 'title': json_data['name'], - 'stitle': simplify_title(json_data['name']), 'ext': file_url.split('.')[-1].decode('utf-8'), 'format': (format_param is None and u'NA' or format_param.decode('utf-8')), 'thumbnail': json_data['thumbnail_url'], @@ -2718,7 +2675,7 @@ class StanfordOpenClassroomIE(InfoExtractor): course = mobj.group('course') video = mobj.group('video') info = { - 'id': simplify_title(course + '_' + video), + 'id': course + '_' + video, } self.report_extraction(info['id']) @@ -2736,14 +2693,13 @@ class StanfordOpenClassroomIE(InfoExtractor): except IndexError: self._downloader.trouble(u'\nERROR: Invalid metadata XML file') return - info['stitle'] = simplify_title(info['title']) info['ext'] = info['url'].rpartition('.')[2] info['format'] = info['ext'] return [info] elif mobj.group('course'): # A course page course = mobj.group('course') info = { - 'id': simplify_title(course), + 'id': course, 'type': 'playlist', } @@ -2759,7 +2715,6 @@ class StanfordOpenClassroomIE(InfoExtractor): info['title'] = unescapeHTML(m.group(1)) else: info['title'] = info['id'] - info['stitle'] = simplify_title(info['title']) m = re.search('([^<]+)', coursepage) if m: @@ -2793,7 +2748,6 @@ class StanfordOpenClassroomIE(InfoExtractor): return info['title'] = info['id'] - info['stitle'] = simplify_title(info['title']) links = orderedSet(re.findall('', rootpage)) info['list'] = [ @@ -2892,7 +2846,6 @@ class MTVIE(InfoExtractor): 'url': video_url, 'uploader': performer, 'title': video_title, - 'stitle': simplify_title(video_title), 'ext': ext, 'format': format, }