X-Git-Url: http://git.cielonegro.org/gitweb.cgi?a=blobdiff_plain;f=youtube_dl%2F__init__.py;h=5a595901ccc1e8d36a9b24c00966104e05515ebf;hb=6af22cf0efe393825534e19f82e3282a53625d19;hp=a1871ca1c2a17920de98ca0e13c4f188fa396c84;hpb=4a34b7252e9909619fe2ba3ba01c2f9471681a81;p=youtube-dl.git diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index a1871ca1c..5a595901c 100755 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -15,6 +15,7 @@ __authors__ = ( 'Kevin Ngo', 'Ori Avtalion', 'shizeeg', + 'Filippo Valsorda', ) __license__ = 'Public Domain' @@ -490,6 +491,8 @@ class FileDownloader(object): updatetime: Use the Last-modified header to set output file timestamps. writedescription: Write the video description to a .description file writeinfojson: Write the video description to a .info.json file + writesubtitles: Write the video subtitles to a .srt file + subtitleslang: Language of the subtitles to download """ params = None @@ -681,6 +684,10 @@ class FileDownloader(object): """ Report that the description file is being written """ self.to_screen(u'[info] Writing video description to: ' + descfn) + def report_writesubtitles(self, srtfn): + """ Report that the subtitles file is being written """ + self.to_screen(u'[info] Writing video subtitles to: ' + srtfn) + def report_writeinfojson(self, infofn): """ Report that the metadata file has been written """ self.to_screen(u'[info] Video description metadata as JSON to: ' + infofn) @@ -808,6 +815,21 @@ class FileDownloader(object): except (OSError, IOError): self.trouble(u'ERROR: Cannot write description file ' + descfn) return + + if self.params.get('writesubtitles', False) and 'subtitles' in info_dict and info_dict['subtitles']: + # subtitles download errors are already managed as troubles in relevant IE + # that way it will silently go on when used with unsupporting IE + try: + srtfn = filename.rsplit('.', 1)[0] + u'.srt' + self.report_writesubtitles(srtfn) + srtfile = open(_encodeFilename(srtfn), 'wb') + try: + srtfile.write(info_dict['subtitles'].encode('utf-8')) + finally: + srtfile.close() + except (OSError, IOError): + self.trouble(u'ERROR: Cannot write subtitles file ' + descfn) + return if self.params.get('writeinfojson', False): infofn = filename + u'.info.json' @@ -1206,6 +1228,10 @@ class YoutubeIE(InfoExtractor): """Report attempt to download video info webpage.""" self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id) + def report_video_subtitles_download(self, video_id): + """Report attempt to download video info webpage.""" + self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id) + def report_information_extraction(self, video_id): """Report attempt to extract video information.""" self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id) @@ -1218,6 +1244,23 @@ class YoutubeIE(InfoExtractor): """Indicate the download will use the RTMP protocol.""" self._downloader.to_screen(u'[youtube] RTMP download detected') + def _closed_captions_xml_to_srt(self, xml_string): + srt = '' + texts = re.findall(r'([^<]+)', xml_string, re.MULTILINE) + # TODO parse xml instead of regex + for n, (start, dur_tag, dur, caption) in enumerate(texts): + if not dur: dur = '4' + start = float(start) + end = start + float(dur) + start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000) + end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000) + caption = re.sub(ur'(?u)&(.+?);', htmlentity_transform, caption) + caption = re.sub(ur'(?u)&(.+?);', htmlentity_transform, caption) # double cycle, inentional + srt += str(n) + '\n' + srt += start + ' --> ' + end + '\n' + srt += caption + '\n\n' + return srt + def _print_formats(self, formats): print 'Available formats:' for x in formats: @@ -1389,6 +1432,37 @@ class YoutubeIE(InfoExtractor): vwebpage_doc = lxml.etree.parse(StringIO.StringIO(video_webpage), html_parser) video_description = u''.join(vwebpage_doc.xpath('id("eow-description")//text()')) # TODO use another parser + + # closed captions + video_subtitles = None + if self._downloader.params.get('writesubtitles', False): + self.report_video_subtitles_download(video_id) + request = urllib2.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id) + try: + srt_list = urllib2.urlopen(request).read() + except (urllib2.URLError, httplib.HTTPException, socket.error), err: + self._downloader.trouble(u'WARNING: unable to download video subtitles: %s' % str(err)) + else: + srt_lang_list = re.findall(r'lang_code="([\w\-]+)"', srt_list) + if srt_lang_list: + if self._downloader.params.get('subtitleslang', False): + srt_lang = self._downloader.params.get('subtitleslang') + elif 'en' in srt_lang_list: + srt_lang = 'en' + else: + srt_lang = srt_lang_list[0] + if not srt_lang in srt_lang_list: + self._downloader.trouble(u'WARNING: no closed captions found in the specified language') + else: + request = urllib2.Request('http://video.google.com/timedtext?hl=en&lang=%s&v=%s' % (srt_lang, video_id)) + try: + srt_xml = urllib2.urlopen(request).read() + except (urllib2.URLError, httplib.HTTPException, socket.error), err: + self._downloader.trouble(u'WARNING: unable to download video subtitles: %s' % str(err)) + else: + video_subtitles = self._closed_captions_xml_to_srt(srt_xml.decode('utf-8')) + else: + self._downloader.trouble(u'WARNING: video has no closed captions') # token video_token = urllib.unquote_plus(video_info['token'][0]) @@ -1461,6 +1535,7 @@ class YoutubeIE(InfoExtractor): 'thumbnail': video_thumbnail.decode('utf-8'), 'description': video_description, 'player_url': player_url, + 'subtitles': video_subtitles }) except UnavailableVideoError, err: self._downloader.trouble(u'\nERROR: unable to download video') @@ -2166,7 +2241,67 @@ class GenericIE(InfoExtractor): """Report information extraction.""" self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id) + def report_following_redirect(self, new_url): + """Report information extraction.""" + self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url) + + def _test_redirect(self, url): + """Check if it is a redirect, like url shorteners, in case restart chain.""" + class HeadRequest(urllib2.Request): + def get_method(self): + return "HEAD" + + class HEADRedirectHandler(urllib2.HTTPRedirectHandler): + """ + Subclass the HTTPRedirectHandler to make it use our + HeadRequest also on the redirected URL + """ + def redirect_request(self, req, fp, code, msg, headers, newurl): + if code in (301, 302, 303, 307): + newurl = newurl.replace(' ', '%20') + newheaders = dict((k,v) for k,v in req.headers.items() + if k.lower() not in ("content-length", "content-type")) + return HeadRequest(newurl, + headers=newheaders, + origin_req_host=req.get_origin_req_host(), + unverifiable=True) + else: + raise urllib2.HTTPError(req.get_full_url(), code, msg, headers, fp) + + class HTTPMethodFallback(urllib2.BaseHandler): + """ + Fallback to GET if HEAD is not allowed (405 HTTP error) + """ + def http_error_405(self, req, fp, code, msg, headers): + fp.read() + fp.close() + + newheaders = dict((k,v) for k,v in req.headers.items() + if k.lower() not in ("content-length", "content-type")) + return self.parent.open(urllib2.Request(req.get_full_url(), + headers=newheaders, + origin_req_host=req.get_origin_req_host(), + unverifiable=True)) + + # Build our opener + opener = urllib2.OpenerDirector() + for handler in [urllib2.HTTPHandler, urllib2.HTTPDefaultErrorHandler, + HTTPMethodFallback, HEADRedirectHandler, + urllib2.HTTPErrorProcessor, urllib2.HTTPSHandler]: + opener.add_handler(handler()) + + response = opener.open(HeadRequest(url)) + new_url = response.geturl() + + if url == new_url: return False + + self.report_following_redirect(new_url) + self._downloader.download([new_url]) + return True + def _real_extract(self, url): + if self._test_redirect(url): return + # At this point we have a new video self._downloader.increment_downloads() @@ -2507,7 +2642,7 @@ class YoutubePlaylistIE(InfoExtractor): _VALID_URL = r'(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL)?([0-9A-Za-z-_]+)(?:/.*?/([0-9A-Za-z_-]+))?.*' _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en' - _VIDEO_INDICATOR = r'/watch\?v=(.+?)&' + _VIDEO_INDICATOR_TEMPLATE = r'/watch\?v=(.+?)&list=PL%s&' _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*' _youtube_ie = None IE_NAME = u'youtube:playlist' @@ -2559,7 +2694,7 @@ class YoutubePlaylistIE(InfoExtractor): # Extract video identifiers ids_in_page = [] - for mobj in re.finditer(self._VIDEO_INDICATOR, page): + for mobj in re.finditer(self._VIDEO_INDICATOR_TEMPLATE % playlist_id, page): if mobj.group(1) not in ids_in_page: ids_in_page.append(mobj.group(1)) video_ids.extend(ids_in_page) @@ -2570,7 +2705,10 @@ class YoutubePlaylistIE(InfoExtractor): playliststart = self._downloader.params.get('playliststart', 1) - 1 playlistend = self._downloader.params.get('playlistend', -1) - video_ids = video_ids[playliststart:playlistend] + if playlistend == -1: + video_ids = video_ids[playliststart:] + else: + video_ids = video_ids[playliststart:playlistend] for id in video_ids: self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id) @@ -4316,6 +4454,12 @@ def parseOpts(): action='store', dest='format_limit', metavar='FORMAT', help='highest quality format to download') video_format.add_option('-F', '--list-formats', action='store_true', dest='listformats', help='list all available formats (currently youtube only)') + video_format.add_option('--write-srt', + action='store_true', dest='writesubtitles', + help='write video closed captions to a .srt file (currently youtube only)', default=False) + video_format.add_option('--srt-lang', + action='store', dest='subtitleslang', metavar='LANG', + help='language of the closed captions to download (optional) use IETF language tags like \'en\'') verbosity.add_option('-q', '--quiet', @@ -4580,6 +4724,8 @@ def _real_main(): 'updatetime': opts.updatetime, 'writedescription': opts.writedescription, 'writeinfojson': opts.writeinfojson, + 'writesubtitles': opts.writesubtitles, + 'subtitleslang': opts.subtitleslang, 'matchtitle': opts.matchtitle, 'rejecttitle': opts.rejecttitle, 'max_downloads': opts.max_downloads,