X-Git-Url: http://git.cielonegro.org/gitweb.cgi?a=blobdiff_plain;ds=sidebyside;f=youtube_dl%2F__init__.py;h=752d762ebc05fe45b7bd8a475d06ac19114a81b0;hb=c6f45d431482503e86f72a6f4f63d1090a312b28;hp=a1871ca1c2a17920de98ca0e13c4f188fa396c84;hpb=4a34b7252e9909619fe2ba3ba01c2f9471681a81;p=youtube-dl.git

diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py
index a1871ca1c..752d762eb 100755
--- a/youtube_dl/__init__.py
+++ b/youtube_dl/__init__.py
@@ -15,6 +15,7 @@ __authors__ = (
     'Kevin Ngo',
     'Ori Avtalion',
     'shizeeg',
+    'Filippo Valsorda',
     )
 
 __license__ = 'Public Domain'
@@ -66,11 +67,6 @@ try:
 except ImportError:
     from cgi import parse_qs
 
-try:
-    import lxml.etree
-except ImportError:
-    pass # Handled below
-
 try:
     import xml.etree.ElementTree
 except ImportError: # Python<2.5: Not officially supported, but let it slip
@@ -197,6 +193,69 @@ except ImportError: # Python <2.6, use trivialjson (https://github.com/phihag/tr
             raise ValueError('Extra data at end of input (index ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]) + ')')
         return res
 
+
+class IDParser(HTMLParser.HTMLParser):
+    """Modified HTMLParser that isolates a tag with the specified id"""
+    def __init__(self, id):
+        self.id = id
+        self.result = None
+        self.started = False
+        self.depth = {}
+        self.html = None
+        self.watch_startpos = False
+        HTMLParser.HTMLParser.__init__(self)
+
+    def loads(self, html):
+        self.html = html
+        self.feed(html)
+        self.close()
+
+    def handle_starttag(self, tag, attrs):
+        attrs = dict(attrs)
+        if self.started:
+            self.find_startpos(None)
+        if 'id' in attrs and attrs['id'] == self.id:
+            self.result = [tag]
+            self.started = True
+            self.watch_startpos = True
+        if self.started:
+            if not tag in self.depth: self.depth[tag] = 0
+            self.depth[tag] += 1
+
+    def handle_endtag(self, tag):
+        if self.started:
+            if tag in self.depth: self.depth[tag] -= 1
+            if self.depth[self.result[0]] == 0:
+                self.started = False
+                self.result.append(self.getpos())
+
+    def find_startpos(self, x):
+        """Needed to put the start position of the result (self.result[1])
+        after the opening tag with the requested id"""
+        if self.watch_startpos:
+            self.watch_startpos = False
+            self.result.append(self.getpos())
+    handle_entityref = handle_charref = handle_data = handle_comment = \
+        handle_decl = handle_pi = unknown_decl = find_startpos
+
+    def get_result(self):
+        if self.result == None: return None
+        if len(self.result) != 3: return None
+        lines = self.html.split('\n')
+        lines = lines[self.result[1][0]-1:self.result[2][0]]
+        lines[0] = lines[0][self.result[1][1]:]
+        if len(lines) == 1:
+            lines[-1] = lines[-1][:self.result[2][1]-self.result[1][1]]
+        lines[-1] = lines[-1][:self.result[2][1]]
+        return '\n'.join(lines).strip()
+
+def get_element_by_id(id, html):
+    """Return the content of the tag with the specified id in the passed HTML document"""
+    parser = IDParser(id)
+    parser.loads(html)
+    return parser.get_result()
+
+
 def preferredencoding():
     """Get preferred encoding.
 
@@ -241,6 +300,18 @@ def htmlentity_transform(matchobj):
     return (u'&%s;' % entity)
 
 
+def clean_html(html):
+    """Clean an HTML snippet into a readable string"""
+    # Newline vs <br />
+    html = html.replace('\n', ' ')
+    html = re.sub('\s*<\s*br\s*/?\s*>\s*', '\n', html)
+    # Strip html tags
+    html = re.sub('<.*?>', '', html)
+    # Replace html entities
+    html = re.sub(ur'(?u)&(.+?);', htmlentity_transform, html)
+    return html
+
+
 def sanitize_title(utitle):
     """Sanitizes a video title so it could be used as part of a filename."""
     utitle = re.sub(ur'(?u)&(.+?);', htmlentity_transform, utitle)
@@ -490,6 +561,8 @@ class FileDownloader(object):
     updatetime:       Use the Last-modified header to set output file timestamps.
     writedescription: Write the video description to a .description file
     writeinfojson:    Write the video description to a .info.json file
+    writesubtitles:   Write the video subtitles to a .srt file
+    subtitleslang:    Language of the subtitles to download
     """
 
     params = None
@@ -681,6 +754,10 @@ class FileDownloader(object):
         """ Report that the description file is being written """
         self.to_screen(u'[info] Writing video description to: ' + descfn)
 
+    def report_writesubtitles(self, srtfn):
+        """ Report that the subtitles file is being written """
+        self.to_screen(u'[info] Writing video subtitles to: ' + srtfn)
+
     def report_writeinfojson(self, infofn):
         """ Report that the metadata file has been written """
         self.to_screen(u'[info] Video description metadata as JSON to: ' + infofn)
@@ -808,6 +885,21 @@ class FileDownloader(object):
             except (OSError, IOError):
                 self.trouble(u'ERROR: Cannot write description file ' + descfn)
                 return
+
+        if self.params.get('writesubtitles', False) and 'subtitles' in info_dict and info_dict['subtitles']:
+            # subtitle download errors are already reported as troubles in the relevant IE,
+            # so this silently continues when used with an IE that has no subtitle support
+            try:
+                srtfn = filename.rsplit('.', 1)[0] + u'.srt'
+                self.report_writesubtitles(srtfn)
+                srtfile = open(_encodeFilename(srtfn), 'wb')
+                try:
+                    srtfile.write(info_dict['subtitles'].encode('utf-8'))
+                finally:
+                    srtfile.close()
+            except (OSError, IOError):
+                self.trouble(u'ERROR: Cannot write subtitles file ' + srtfn)
+                return
 
         if self.params.get('writeinfojson', False):
             infofn = filename + u'.info.json'
@@ -1206,6 +1298,10 @@ class YoutubeIE(InfoExtractor):
         """Report attempt to download video info webpage."""
         self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)
 
+    def report_video_subtitles_download(self, video_id):
+        """Report attempt to download video subtitles."""
+        self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id)
+
     def report_information_extraction(self, video_id):
         """Report attempt to extract video information."""
         self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)
@@ -1218,6 +1314,23 @@ class YoutubeIE(InfoExtractor):
         """Indicate the download will use the RTMP protocol."""
         self._downloader.to_screen(u'[youtube] RTMP download detected')
 
+    def _closed_captions_xml_to_srt(self, xml_string):
+        srt = ''
+        texts = re.findall(r'<text start="([\d\.]+)"( dur="([\d\.]+)")?>([^<]+)</text>', xml_string, re.MULTILINE)
+        # TODO parse xml instead of regex
+        for n, (start, dur_tag, dur, caption) in enumerate(texts):
+            if not dur: dur = '4'  # captions with no explicit duration default to 4 seconds
+            start = float(start)
+            end = start + float(dur)
+            start = "%02i:%02i:%02i,%03i" % (start/(60*60), start/60%60, start%60, start%1*1000)
+            end = "%02i:%02i:%02i,%03i" % (end/(60*60), end/60%60, end%60, end%1*1000)
+            caption = re.sub(ur'(?u)&(.+?);', htmlentity_transform, caption)
+            caption = re.sub(ur'(?u)&(.+?);', htmlentity_transform, caption) # double cycle, intentional
+            srt += str(n) + '\n'
+            srt += start + ' --> ' + end + '\n'
+            srt += caption + '\n\n'
+        return srt
+
     def _print_formats(self, formats):
         print 'Available formats:'
         for x in formats:
@@ -1377,18 +1490,40 @@ class YoutubeIE(InfoExtractor):
             pass
 
         # description
-        try:
-            lxml.etree
-        except NameError:
-            video_description = u'No description available.'
-            mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', video_webpage)
-            if mobj is not None:
-                video_description = mobj.group(1).decode('utf-8')
-        else:
-            html_parser = lxml.etree.HTMLParser(encoding='utf-8')
-            vwebpage_doc = lxml.etree.parse(StringIO.StringIO(video_webpage), html_parser)
-            video_description = u''.join(vwebpage_doc.xpath('id("eow-description")//text()'))
-            # TODO use another parser
+        video_description = get_element_by_id("eow-description", video_webpage)
+        if video_description: video_description = clean_html(video_description.decode('utf8'))
+        else: video_description = ''
+
+        # closed captions
+        video_subtitles = None
+        if self._downloader.params.get('writesubtitles', False):
+            self.report_video_subtitles_download(video_id)
+            request = urllib2.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
+            try:
+                srt_list = urllib2.urlopen(request).read()
+            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
+                self._downloader.trouble(u'WARNING: unable to download video subtitles: %s' % str(err))
+            else:
+                srt_lang_list = re.findall(r'lang_code="([\w\-]+)"', srt_list)
+                if srt_lang_list:
+                    if self._downloader.params.get('subtitleslang', False):
+                        srt_lang = self._downloader.params.get('subtitleslang')
+                    elif 'en' in srt_lang_list:
+                        srt_lang = 'en'
+                    else:
+                        srt_lang = srt_lang_list[0]
+                    if not srt_lang in srt_lang_list:
+                        self._downloader.trouble(u'WARNING: no closed captions found in the specified language')
+                    else:
+                        request = urllib2.Request('http://video.google.com/timedtext?hl=en&lang=%s&v=%s' % (srt_lang, video_id))
+                        try:
+                            srt_xml = urllib2.urlopen(request).read()
+                        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
+                            self._downloader.trouble(u'WARNING: unable to download video subtitles: %s' % str(err))
+                        else:
+                            video_subtitles = self._closed_captions_xml_to_srt(srt_xml.decode('utf-8'))
+                else:
+                    self._downloader.trouble(u'WARNING: video has no closed captions')
 
         # token
         video_token = urllib.unquote_plus(video_info['token'][0])
@@ -1461,6 +1596,7 @@ class YoutubeIE(InfoExtractor):
                 'thumbnail':   video_thumbnail.decode('utf-8'),
                 'description': video_description,
                 'player_url':  player_url,
+                'subtitles':   video_subtitles
             })
         except UnavailableVideoError, err:
             self._downloader.trouble(u'\nERROR: unable to download video')
@@ -2090,18 +2226,9 @@ class VimeoIE(InfoExtractor):
         video_thumbnail = config["video"]["thumbnail"]
 
         # Extract video description
-        try:
-            lxml.etree
-        except NameError:
-            video_description = u'No description available.'
-            mobj = re.search(r'<meta name="description" content="(.*?)" />', webpage, re.MULTILINE)
-            if mobj is not None:
-                video_description = mobj.group(1)
-        else:
-            html_parser = lxml.etree.HTMLParser()
-            vwebpage_doc = lxml.etree.parse(StringIO.StringIO(webpage), html_parser)
-            video_description = u''.join(vwebpage_doc.xpath('id("description")//text()')).strip()
-            # TODO use another parser
+        video_description = get_element_by_id("description", webpage)
+        if video_description: video_description = clean_html(video_description.decode('utf8'))
+        else: video_description = ''
 
         # Extract upload date
         video_upload_date = u'NA'
@@ -2507,7 +2634,7 @@ class YoutubePlaylistIE(InfoExtractor):
     _VALID_URL = r'(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL)?([0-9A-Za-z-_]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
     _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
-    _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
+    _VIDEO_INDICATOR_TEMPLATE = r'/watch\?v=(.+?)&list=PL%s&'
     _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
     _youtube_ie = None
     IE_NAME = u'youtube:playlist'
@@ -2559,7 +2686,7 @@ class YoutubePlaylistIE(InfoExtractor):
 
             # Extract video identifiers
             ids_in_page = []
-            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
+            for mobj in re.finditer(self._VIDEO_INDICATOR_TEMPLATE % playlist_id, page):
                 if mobj.group(1) not in ids_in_page:
                     ids_in_page.append(mobj.group(1))
             video_ids.extend(ids_in_page)
@@ -2570,7 +2697,10 @@ class YoutubePlaylistIE(InfoExtractor):
 
         playliststart = self._downloader.params.get('playliststart', 1) - 1
         playlistend = self._downloader.params.get('playlistend', -1)
-        video_ids = video_ids[playliststart:playlistend]
+        if playlistend == -1:
+            video_ids = video_ids[playliststart:]
+        else:
+            video_ids = video_ids[playliststart:playlistend]
 
         for id in video_ids:
             self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
@@ -3265,8 +3395,6 @@ class EscapistIE(InfoExtractor):
         self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)
 
     def _real_extract(self, url):
-        htmlParser = HTMLParser.HTMLParser()
-
         mobj = re.match(self._VALID_URL, url)
         if mobj is None:
             self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
@@ -3282,11 +3410,11 @@ class EscapistIE(InfoExtractor):
             return
 
         descMatch = re.search('
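
Notes on the patch (the sketches below are illustrative only, not part of the diff).

The IDParser/get_element_by_id/clean_html helpers replace the optional lxml
dependency with a stdlib-only HTMLParser subclass. A minimal sketch of how the
two helpers compose in the new description extraction, assuming both are
importable; the sample HTML is made up (real pages are byte strings and get
.decode('utf8') first, as in the patched extractors):

    # Hypothetical input, ASCII-only so no decode step is needed here.
    html = '<div><div id="eow-description">Line one<br />&amp; line two</div></div>'

    description = get_element_by_id('eow-description', html)  # inner HTML of the div
    if description:
        # <br /> becomes a newline, remaining tags are stripped, entities decoded
        description = clean_html(description)
    else:
        description = ''
    assert description == 'Line one\n& line two'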
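The timestamp formatting in _closed_captions_xml_to_srt relies on %i truncating
its float argument, so hours, minutes, seconds and milliseconds fall out of
plain division and modulo. A standalone worked example of the same expression
(the helper name here is ours, not the patch's):

    def to_srt_timestamp(t):
        # t is an offset in seconds, e.g. 3671.25:
        #   t/(60*60) -> 1.019...  (%02i truncates to 01)
        #   t/60%60   -> 1.1875    (-> 01)
        #   t%60      -> 11.25     (-> 11)
        #   t%1*1000  -> 250.0     (-> 250)
        return "%02i:%02i:%02i,%03i" % (t/(60*60), t/60%60, t%60, t%1*1000)

    assert to_srt_timestamp(3671.25) == '01:01:11,250'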
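The "double cycle, intentional" comment reflects that the caption XML can
arrive with doubly escaped entities, so a single decoding pass may still leave
an entity behind. A small illustration, assuming htmlentity_transform from this
file and a made-up caption string:

    caption = 'it&amp;#39;s'  # hypothetical doubly escaped apostrophe
    caption = re.sub(ur'(?u)&(.+?);', htmlentity_transform, caption)  # -> 'it&#39;s'
    caption = re.sub(ur'(?u)&(.+?);', htmlentity_transform, caption)  # -> u"it's"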
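The YoutubePlaylistIE slicing change fixes an off-by-one: playlistend defaults
to -1 as a "no limit" sentinel, but used directly as a slice bound it silently
drops the last video. A sketch with hypothetical IDs:

    video_ids = ['a', 'b', 'c', 'd']
    playliststart, playlistend = 0, -1   # defaults: from the start, no end limit

    # Old behaviour: -1 is treated as a slice index and 'd' is lost.
    assert video_ids[playliststart:playlistend] == ['a', 'b', 'c']

    # Patched behaviour: -1 means "to the end".
    if playlistend == -1:
        selected = video_ids[playliststart:]
    else:
        selected = video_ids[playliststart:playlistend]
    assert selected == ['a', 'b', 'c', 'd']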