X-Git-Url: http://git.cielonegro.org/gitweb.cgi?a=blobdiff_plain;f=youtube_dl%2FInfoExtractors.py;h=59f65aca37108945ac8e1c9aca0dbe9db227225b;hb=5c6760193199530da1e66a1e412b58e238786f51;hp=86cc7c7484334baddc076091189dee1053661b11;hpb=f5a290eed949b7726a8d745960bbe9c6b8b7de52;p=youtube-dl.git
diff --git a/youtube_dl/InfoExtractors.py b/youtube_dl/InfoExtractors.py
index 86cc7c748..59f65aca3 100755
--- a/youtube_dl/InfoExtractors.py
+++ b/youtube_dl/InfoExtractors.py
@@ -222,6 +222,16 @@ class InfoExtractor(object):
u'please report this issue on GitHub.' % _name)
return None
+ def _html_search_regex(self, pattern, string, name, default=None, fatal=True, flags=0):
+ """
+ Like _search_regex, but strips HTML tags and unescapes entities.
+ """
+ res = self._search_regex(pattern, string, name, default, fatal, flags)
+ if res:
+ return clean_html(res).strip()
+ else:
+ return res
+
class SearchInfoExtractor(InfoExtractor):
"""
Base class for paged search queries extractors.
@@ -1399,6 +1409,9 @@ class GenericIE(InfoExtractor):
if mobj is None:
# Broaden the search a little bit: JWPlayer JS loader
mobj = re.search(r'[^A-Za-z0-9]?file:\s*["\'](http[^\'"&]*)', webpage)
+ if mobj is None:
+ # Try to find twitter cards info
+ mobj = re.search(r'(.*)', webpage)
- if mobj is None:
- raise ExtractorError(u'Unable to extract title')
- video_title = mobj.group(1)
+ video_title = self._html_search_regex(r'
(.*)',
+ webpage, u'video title')
# video uploader is domain name
- mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
- if mobj is None:
- raise ExtractorError(u'Unable to extract title')
- video_uploader = mobj.group(1)
+ video_uploader = self._search_regex(r'(?:https?://)?([^/]*)/.*',
+ url, u'video uploader')
return [{
'id': video_id,
@@ -1450,7 +1459,6 @@ class YoutubeSearchIE(SearchInfoExtractor):
def report_download_page(self, query, pagenum):
"""Report attempt to download search page with given number."""
- query = query.decode(preferredencoding())
self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
def _get_n_results(self, query, n):
@@ -1568,7 +1576,7 @@ class YoutubePlaylistIE(InfoExtractor):
|
((?:PL|EC|UU)[0-9A-Za-z-_]{10,})
)"""
- _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json'
+ _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json&safeSearch=none'
_MAX_RESULTS = 50
IE_NAME = u'youtube:playlist'
@@ -1923,9 +1931,8 @@ class FacebookIE(InfoExtractor):
video_duration = int(video_data['video_duration'])
thumbnail = video_data['thumbnail_src']
- video_title = self._search_regex('',
+ video_title = self._html_search_regex('',
webpage, u'title')
- video_title = unescapeHTML(video_title)
info = {
'id': video_id,
@@ -2087,7 +2094,7 @@ class MyVideoIE(InfoExtractor):
self.report_extraction(video_id)
video_url = mobj.group(1) + '.flv'
- video_title = self._search_regex('([^<]+)',
+ video_title = self._html_search_regex('([^<]+)',
webpage, u'title')
video_ext = self._search_regex('[.](.+?)$', video_url, u'extension')
@@ -2169,7 +2176,7 @@ class MyVideoIE(InfoExtractor):
video_swfobj = self._search_regex('swfobject.embedSWF\(\'(.+?)\'', webpage, u'swfobj')
video_swfobj = compat_urllib_parse.unquote(video_swfobj)
- video_title = self._search_regex("(.*?)
",
+ video_title = self._html_search_regex("(.*?)
",
webpage, u'title')
return [{
@@ -2368,25 +2375,25 @@ class EscapistIE(InfoExtractor):
showName = mobj.group('showname')
videoId = mobj.group('episode')
- self.report_extraction(showName)
- webpage = self._download_webpage(url, showName)
+ self.report_extraction(videoId)
+ webpage = self._download_webpage(url, videoId)
- videoDesc = self._search_regex('(.*?)\s+-\s+XVID',
+ video_title = self._html_search_regex(r'(.*?)\s+-\s+XVID',
webpage, u'title')
# Extract video thumbnail
@@ -2665,7 +2672,7 @@ class InfoQIE(InfoExtractor):
webpage, u'title')
# Extract description
- video_description = self._search_regex(r'',
+ video_description = self._html_search_regex(r'',
webpage, u'description', fatal=False)
video_filename = video_url.split('/')[-1]
@@ -2837,12 +2844,10 @@ class StanfordOpenClassroomIE(InfoExtractor):
note='Downloading course info page',
errnote='Unable to download course info page')
- info['title'] = self._search_regex('([^<]+)
', coursepage, 'title', default=info['id'])
- info['title'] = unescapeHTML(info['title'])
+ info['title'] = self._html_search_regex('([^<]+)
', coursepage, 'title', default=info['id'])
- info['description'] = self._search_regex('([^<]+)',
+ info['description'] = self._html_search_regex('([^<]+)',
coursepage, u'description', fatal=False)
- if info['description']: info['description'] = unescapeHTML(info['description'])
links = orderedSet(re.findall('', coursepage))
info['list'] = [
@@ -2903,15 +2908,13 @@ class MTVIE(InfoExtractor):
webpage = self._download_webpage(url, video_id)
- song_name = self._search_regex(r'',
+ song_name = self._html_search_regex(r'',
webpage, u'song name', fatal=False)
- if song_name: song_name = unescapeHTML(song_name)
- video_title = self._search_regex(r'',
+ video_title = self._html_search_regex(r'',
webpage, u'title')
- video_title = unescapeHTML(video_title)
- mtvn_uri = self._search_regex(r'',
+ mtvn_uri = self._html_search_regex(r'',
webpage, u'mtvn_uri', fatal=False)
content_id = self._search_regex(r'MTVN.Player.defaultPlaylistId = ([0-9]+);',
@@ -3067,7 +3070,7 @@ class XNXXIE(InfoExtractor):
webpage, u'video URL')
video_url = compat_urllib_parse.unquote(video_url)
- video_title = self._search_regex(self.VIDEO_TITLE_RE,
+ video_title = self._html_search_regex(self.VIDEO_TITLE_RE,
webpage, u'title')
video_thumbnail = self._search_regex(self.VIDEO_THUMB_RE,
@@ -3108,7 +3111,7 @@ class GooglePlusIE(InfoExtractor):
self.report_extraction(video_id)
# Extract update date
- upload_date = self._search_regex('title="Timestamp">(.*?)',
+ upload_date = self._html_search_regex('title="Timestamp">(.*?)',
webpage, u'upload date', fatal=False)
if upload_date:
# Convert timestring to a format suitable for filename
@@ -3116,12 +3119,12 @@ class GooglePlusIE(InfoExtractor):
upload_date = upload_date.strftime('%Y%m%d')
# Extract uploader
- uploader = self._search_regex(r'rel\="author".*?>(.*?)',
+ uploader = self._html_search_regex(r'rel\="author".*?>(.*?)',
webpage, u'uploader', fatal=False)
# Extract title
# Get the first line for title
- video_title = self._search_regex(r'Date: (.*?)', webpage, 'upload_date', fatal=False)
+ # uploader_date = self._html_search_regex(r'Date: (.*?)', webpage, 'upload_date', fatal=False)
- description = self._search_regex(r'', webpage, 'description', fatal=False)
+ description = self._html_search_regex(r'', webpage, 'description', fatal=False)
info = {
'id': shortened_video_id,
@@ -3337,17 +3340,14 @@ class FunnyOrDieIE(InfoExtractor):
video_id = mobj.group('id')
webpage = self._download_webpage(url, video_id)
- video_url = self._search_regex(r'