X-Git-Url: http://git.cielonegro.org/gitweb.cgi?a=blobdiff_plain;f=youtube_dl%2FInfoExtractors.py;h=59f65aca37108945ac8e1c9aca0dbe9db227225b;hb=5c6760193199530da1e66a1e412b58e238786f51;hp=bd6fce3b6f684142b56dc6bd94efb58b5df1a030;hpb=be95cac157a75da1a0fa512b36eb90bc2c28cc96;p=youtube-dl.git diff --git a/youtube_dl/InfoExtractors.py b/youtube_dl/InfoExtractors.py index bd6fce3b6..59f65aca3 100755 --- a/youtube_dl/InfoExtractors.py +++ b/youtube_dl/InfoExtractors.py @@ -216,13 +216,22 @@ class InfoExtractor(object): elif default is not None: return default elif fatal: - raise ExtractorError(u'Unable to extract %s; ' - u'please report this issue on GitHub.' % _name) + raise ExtractorError(u'Unable to extract %s' % _name) else: self._downloader.report_warning(u'unable to extract %s; ' u'please report this issue on GitHub.' % _name) return None + def _html_search_regex(self, pattern, string, name, default=None, fatal=True, flags=0): + """ + Like _search_regex, but strips HTML tags and unescapes entities. + """ + res = self._search_regex(pattern, string, name, default, fatal, flags) + if res: + return clean_html(res).strip() + else: + return res + class SearchInfoExtractor(InfoExtractor): """ Base class for paged search queries extractors. @@ -1400,6 +1409,9 @@ class GenericIE(InfoExtractor): if mobj is None: # Broaden the search a little bit: JWPlayer JS loader mobj = re.search(r'[^A-Za-z0-9]?file:\s*["\'](http[^\'"&]*)', webpage) + if mobj is None: + # Try to find twitter cards info + mobj = re.search(r'(.*)', webpage) - if mobj is None: - raise ExtractorError(u'Unable to extract title') - video_title = mobj.group(1) + video_title = self._html_search_regex(r'(.*)', + webpage, u'video title') # video uploader is domain name - mobj = re.match(r'(?:https?://)?([^/]*)/.*', url) - if mobj is None: - raise ExtractorError(u'Unable to extract title') - video_uploader = mobj.group(1) + video_uploader = self._search_regex(r'(?:https?://)?([^/]*)/.*', + url, u'video uploader') return [{ 'id': video_id, @@ -1451,7 +1459,6 @@ class YoutubeSearchIE(SearchInfoExtractor): def report_download_page(self, query, pagenum): """Report attempt to download search page with given number.""" - query = query.decode(preferredencoding()) self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum)) def _get_n_results(self, query, n): @@ -1569,7 +1576,7 @@ class YoutubePlaylistIE(InfoExtractor): | ((?:PL|EC|UU)[0-9A-Za-z-_]{10,}) )""" - _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json' + _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json&safeSearch=none' _MAX_RESULTS = 50 IE_NAME = u'youtube:playlist' @@ -1924,9 +1931,8 @@ class FacebookIE(InfoExtractor): video_duration = int(video_data['video_duration']) thumbnail = video_data['thumbnail_src'] - video_title = self._search_regex('

([^<]+)

', + video_title = self._html_search_regex('

([^<]+)

', webpage, u'title') - video_title = unescapeHTML(video_title) info = { 'id': video_id, @@ -2088,7 +2094,7 @@ class MyVideoIE(InfoExtractor): self.report_extraction(video_id) video_url = mobj.group(1) + '.flv' - video_title = self._search_regex('([^<]+)', + video_title = self._html_search_regex('([^<]+)', webpage, u'title') video_ext = self._search_regex('[.](.+?)$', video_url, u'extension') @@ -2170,7 +2176,7 @@ class MyVideoIE(InfoExtractor): video_swfobj = self._search_regex('swfobject.embedSWF\(\'(.+?)\'', webpage, u'swfobj') video_swfobj = compat_urllib_parse.unquote(video_swfobj) - video_title = self._search_regex("(.*?)", + video_title = self._html_search_regex("(.*?)", webpage, u'title') return [{ @@ -2369,25 +2375,25 @@ class EscapistIE(InfoExtractor): showName = mobj.group('showname') videoId = mobj.group('episode') - self.report_extraction(showName) - webpage = self._download_webpage(url, showName) + self.report_extraction(videoId) + webpage = self._download_webpage(url, videoId) - videoDesc = self._search_regex('(.*?)\s+-\s+XVID', + video_title = self._html_search_regex(r'(.*?)\s+-\s+XVID', webpage, u'title') # Extract video thumbnail @@ -2666,7 +2672,7 @@ class InfoQIE(InfoExtractor): webpage, u'title') # Extract description - video_description = self._search_regex(r'<meta name="description" content="(.*)"(?:\s*/)?>', + video_description = self._html_search_regex(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage, u'description', fatal=False) video_filename = video_url.split('/')[-1] @@ -2838,12 +2844,10 @@ class StanfordOpenClassroomIE(InfoExtractor): note='Downloading course info page', errnote='Unable to download course info page') - info['title'] = self._search_regex('<h1>([^<]+)</h1>', coursepage, 'title', default=info['id']) - info['title'] = unescapeHTML(info['title']) + info['title'] = self._html_search_regex('<h1>([^<]+)</h1>', coursepage, 'title', default=info['id']) - info['description'] = self._search_regex('<description>([^<]+)</description>', + info['description'] = self._html_search_regex('<description>([^<]+)</description>', coursepage, u'description', fatal=False) - if info['description']: info['description'] = unescapeHTML(info['description']) links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage)) info['list'] = [ @@ -2904,15 +2908,13 @@ class MTVIE(InfoExtractor): webpage = self._download_webpage(url, video_id) - song_name = self._search_regex(r'<meta name="mtv_vt" content="([^"]+)"/>', + song_name = self._html_search_regex(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage, u'song name', fatal=False) - if song_name: song_name = unescapeHTML(song_name) - video_title = self._search_regex(r'<meta name="mtv_an" content="([^"]+)"/>', + video_title = self._html_search_regex(r'<meta name="mtv_an" content="([^"]+)"/>', webpage, u'title') - video_title = unescapeHTML(video_title) - mtvn_uri = self._search_regex(r'<meta name="mtvn_uri" content="([^"]+)"/>', + mtvn_uri = self._html_search_regex(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage, u'mtvn_uri', fatal=False) content_id = self._search_regex(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', @@ -3068,7 +3070,7 @@ class XNXXIE(InfoExtractor): webpage, u'video URL') video_url = compat_urllib_parse.unquote(video_url) - video_title = self._search_regex(self.VIDEO_TITLE_RE, + video_title = self._html_search_regex(self.VIDEO_TITLE_RE, webpage, u'title') video_thumbnail = self._search_regex(self.VIDEO_THUMB_RE, @@ -3109,7 +3111,7 @@ class GooglePlusIE(InfoExtractor): self.report_extraction(video_id) # Extract update date - upload_date = self._search_regex('title="Timestamp">(.*?)</a>', + upload_date = self._html_search_regex('title="Timestamp">(.*?)</a>', webpage, u'upload date', fatal=False) if upload_date: # Convert timestring to a format suitable for filename @@ -3117,12 +3119,12 @@ class GooglePlusIE(InfoExtractor): upload_date = upload_date.strftime('%Y%m%d') # Extract uploader - uploader = self._search_regex(r'rel\="author".*?>(.*?)</a>', + uploader = self._html_search_regex(r'rel\="author".*?>(.*?)</a>', webpage, u'uploader', fatal=False) # Extract title # Get the first line for title - video_title = self._search_regex(r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]', + video_title = self._html_search_regex(r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]', webpage, 'title', default=u'NA') # Step 2, Stimulate clicking the image box to launch video @@ -3176,13 +3178,13 @@ class NBAIE(InfoExtractor): video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4' shortened_video_id = video_id.rpartition('/')[2] - title = self._search_regex(r'<meta property="og:title" content="(.*?)"', + title = self._html_search_regex(r'<meta property="og:title" content="(.*?)"', webpage, 'title', default=shortened_video_id).replace('NBA.com: ', '') # It isn't there in the HTML it returns to us - # uploader_date = self._search_regex(r'<b>Date:</b> (.*?)</div>', webpage, 'upload_date', fatal=False) + # uploader_date = self._html_search_regex(r'<b>Date:</b> (.*?)</div>', webpage, 'upload_date', fatal=False) - description = self._search_regex(r'<meta name="description" (?:content|value)="(.*?)" />', webpage, 'description', fatal=False) + description = self._html_search_regex(r'<meta name="description" (?:content|value)="(.*?)" />', webpage, 'description', fatal=False) info = { 'id': shortened_video_id, @@ -3338,17 +3340,14 @@ class FunnyOrDieIE(InfoExtractor): video_id = mobj.group('id') webpage = self._download_webpage(url, video_id) - video_url = self._search_regex(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"', + video_url = self._html_search_regex(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"', webpage, u'video URL', flags=re.DOTALL) - video_url = unescapeHTML(video_url) - title = self._search_regex((r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>", + title = self._html_search_regex((r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>", r'<title>(?P<title>[^<]+?)'), webpage, 'title', flags=re.DOTALL) - title = clean_html(title) - video_description = self._search_regex(r'\d+)/? (?P\d*)(?P\??) #For urltype == video we sometimes get the videoID """ + _VIDEO_PAGE_TEMPLATE = 'http://store.steampowered.com/video/%s/' + _AGECHECK_TEMPLATE = 'http://store.steampowered.com/agecheck/video/%s/?snr=1_agecheck_agecheck__age-gate&ageDay=1&ageMonth=January&ageYear=1970' @classmethod def suitable(cls, url): @@ -3375,11 +3376,19 @@ class SteamIE(InfoExtractor): def _real_extract(self, url): m = re.match(self._VALID_URL, url, re.VERBOSE) gameID = m.group('gameID') - videourl = 'http://store.steampowered.com/agecheck/video/%s/?snr=1_agecheck_agecheck__age-gate&ageDay=1&ageMonth=January&ageYear=1970' % gameID - self.report_age_confirmation() + + videourl = self._VIDEO_PAGE_TEMPLATE % gameID webpage = self._download_webpage(videourl, gameID) - game_title = re.search(r'', webpage).group('game_title') - + + if re.search('

Please enter your birth date to continue:

', webpage) is not None: + videourl = self._AGECHECK_TEMPLATE % gameID + self.report_age_confirmation() + webpage = self._download_webpage(videourl, gameID) + + self.report_extraction(gameID) + game_title = self._html_search_regex(r'', + webpage, 'game title') + urlRE = r"'movie_(?P\d+)': \{\s*FILENAME: \"(?P[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P[\w:/\.\?=\+-]+)\")?\s*\}," mweb = re.finditer(urlRE, webpage) namesRE = r'(?P.+?)' @@ -3417,14 +3426,13 @@ class UstreamIE(InfoExtractor): self.report_extraction(video_id) - video_title = self._search_regex(r'data-title="(?P.+)"', + video_title = self._html_search_regex(r'data-title="(?P<title>.+)"', webpage, u'title') - uploader = self._search_regex(r'data-content-type="channel".*?>(?P<uploader>.*?)</a>', + uploader = self._html_search_regex(r'data-content-type="channel".*?>(?P<uploader>.*?)</a>', webpage, u'uploader', fatal=False, flags=re.DOTALL) - if uploader: uploader = unescapeHTML(uploader.strip()) - thumbnail = self._search_regex(r'<link rel="image_src" href="(?P<thumb>.*?)"', + thumbnail = self._html_search_regex(r'<link rel="image_src" href="(?P<thumb>.*?)"', webpage, u'thumbnail', fatal=False) info = { @@ -3455,11 +3463,11 @@ class WorldStarHipHopIE(InfoExtractor): else: ext = 'flv' - video_title = self._search_regex(r"<title>(.*)", + video_title = self._html_search_regex(r"(.*)", webpage_src, u'title') # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video. - thumbnail = self._search_regex(r'rel="image_src" href="(.*)" />', + thumbnail = self._html_search_regex(r'rel="image_src" href="(.*)" />', webpage_src, u'thumbnail', fatal=False) if not thumbnail: @@ -3584,14 +3592,14 @@ class YouPornIE(InfoExtractor): size = format[0] bitrate = format[1] format = "-".join( format ) - title = u'%s-%s-%s' % (video_title, size, bitrate) + # title = u'%s-%s-%s' % (video_title, size, bitrate) formats.append({ 'id': video_id, 'url': video_url, 'uploader': video_uploader, 'upload_date': upload_date, - 'title': title, + 'title': video_title, 'ext': extension, 'format': format, 'thumbnail': thumbnail, @@ -3641,7 +3649,7 @@ class PornotubeIE(InfoExtractor): #Get the uploaded date VIDEO_UPLOADED_RE = r'
Added (?P[0-9\/]+) by' - upload_date = self._search_regex(VIDEO_UPLOADED_RE, webpage, u'upload date', fatal=False) + upload_date = self._html_search_regex(VIDEO_UPLOADED_RE, webpage, u'upload date', fatal=False) if upload_date: upload_date = unified_strdate(upload_date) info = {'id': video_id, @@ -3669,7 +3677,7 @@ class YouJizzIE(InfoExtractor): webpage = self._download_webpage(url, video_id) # Get the video title - video_title = self._search_regex(r'(?P<title>.*)', + video_title = self._html_search_regex(r'(?P<title>.*)', webpage, u'title').strip() # Get the embed page @@ -3748,13 +3756,11 @@ class KeekIE(InfoExtractor): thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id webpage = self._download_webpage(url, video_id) - video_title = self._search_regex(r'[\S\s]+?

(?P.+?)

', + uploader = self._html_search_regex(r'
[\S\s]+?

(?P.+?)

', webpage, u'uploader', fatal=False) - if uploader: uploader = clean_html(uploader) info = { 'id': video_id, @@ -3908,9 +3914,8 @@ class SpiegelIE(InfoExtractor): webpage = self._download_webpage(url, video_id) - video_title = self._search_regex(r'
(.*?)
', + video_title = self._html_search_regex(r'
(.*?)
', webpage, u'title') - video_title = unescapeHTML(video_title) xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml' xml_code = self._download_webpage(xml_url, video_id, @@ -3949,15 +3954,13 @@ class LiveLeakIE(InfoExtractor): video_url = self._search_regex(r'file: "(.*?)",', webpage, u'video URL') - video_title = self._search_regex(r'', + video_uploader = self._html_search_regex(r'By:.*?(\w+)', webpage, u'uploader', fatal=False) info = { @@ -4010,6 +4013,64 @@ class ARDIE(InfoExtractor): info["url"] = stream["video_url"] return [info] +class ZDFIE(InfoExtractor): + _VALID_URL = r'^http://www\.zdf\.de\/ZDFmediathek\/(.*beitrag\/video\/)(?P[^/\?]+)(?:\?.*)?' + _TITLE = r'(?P.*)</h1>' + _MEDIA_STREAM = r'<a href="(?P<video_url>.+(?P<media_type>.streaming).+/zdf/(?P<quality>[^\/]+)/[^"]*)".+class="play".+>' + _MMS_STREAM = r'href="(?P<video_url>mms://[^"]*)"' + _RTSP_STREAM = r'(?P<video_url>rtsp://[^"]*.mp4)' + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + if mobj is None: + raise ExtractorError(u'Invalid URL: %s' % url) + video_id = mobj.group('video_id') + + html = self._download_webpage(url, video_id) + streams = [m.groupdict() for m in re.finditer(self._MEDIA_STREAM, html)] + if streams is None: + raise ExtractorError(u'No media url found.') + + # s['media_type'] == 'wstreaming' -> use 'Windows Media Player' and mms url + # s['media_type'] == 'hstreaming' -> use 'Quicktime' and rtsp url + # choose first/default media type and highest quality for now + for s in streams: #find 300 - dsl1000mbit + if s['quality'] == '300' and s['media_type'] == 'wstreaming': + stream_=s + break + for s in streams: #find veryhigh - dsl2000mbit + if s['quality'] == 'veryhigh' and s['media_type'] == 'wstreaming': # 'hstreaming' - rtsp is not working + stream_=s + break + if stream_ is None: + raise ExtractorError(u'No stream found.') + + media_link = self._download_webpage(stream_['video_url'], video_id,'Get stream URL') + + self.report_extraction(video_id) + mobj = re.search(self._TITLE, html) + if mobj is None: + raise ExtractorError(u'Cannot extract title') + title = unescapeHTML(mobj.group('title')) + + mobj = re.search(self._MMS_STREAM, media_link) + if mobj is None: + mobj = re.search(self._RTSP_STREAM, media_link) + if mobj is None: + raise ExtractorError(u'Cannot extract mms:// or rtsp:// URL') + mms_url = mobj.group('video_url') + + mobj = re.search('(.*)[.](?P<ext>[^.]+)', mms_url) + if mobj is None: + raise ExtractorError(u'Cannot extract extention') + ext = mobj.group('ext') + + return [{'id': video_id, + 'url': mms_url, + 'title': title, + 'ext': ext + }] + class TumblrIE(InfoExtractor): _VALID_URL = r'http://(?P<blog_name>.*?)\.tumblr\.com/((post)|(video))/(?P<id>\d*)/(.*?)' @@ -4034,9 +4095,8 @@ class TumblrIE(InfoExtractor): # The only place where you can get a title, it's not complete, # but searching in other places doesn't work for all videos - video_title = self._search_regex(r'<title>(?P<title>.*?)', + video_title = self._html_search_regex(r'(?P<title>.*?)', webpage, u'title', flags=re.DOTALL) - video_title = unescapeHTML(video_title) return [{'id': video_id, 'url': video_url, @@ -4106,10 +4166,10 @@ class RedTubeIE(InfoExtractor): self.report_extraction(video_id) - video_url = self._search_regex(r'', + video_url = self._html_search_regex(r'', webpage, u'video URL') - video_title = self._search_regex('

(.+?)

', + video_title = self._html_search_regex('

(.+?)

', webpage, u'title') return [{ @@ -4133,7 +4193,7 @@ class InaIE(InfoExtractor): self.report_extraction(video_id) - video_url = self._search_regex(r'.*?)]]>', @@ -4162,13 +4222,13 @@ class HowcastIE(InfoExtractor): video_url = self._search_regex(r'\'?file\'?: "(http://mobile-media\.howcast\.com/[0-9]+\.mp4)', webpage, u'video URL') - video_title = self._search_regex(r'.*?

(.+?)

', + uploader = self._html_search_regex(r'
.*?

(.+?)

', webpage, u'uploader', fatal=False, flags=re.DOTALL) return [{ @@ -4231,7 +4291,7 @@ class FlickrIE(InfoExtractor): first_url = 'https://secure.flickr.com/apps/video/video_mtl_xml.gne?v=x&photo_id=' + video_id + '&secret=' + secret + '&bitrate=700&target=_self' first_xml = self._download_webpage(first_url, video_id, 'Downloading first data webpage') - node_id = self._search_regex(r'(\d+-\d+)', + node_id = self._html_search_regex(r'(\d+-\d+)', first_xml, u'node_id') second_url = 'https://secure.flickr.com/video_playlist.gne?node_id=' + node_id + '&tech=flash&mode=playlist&bitrate=700&secret=' + secret + '&rd=video.yahoo.com&noad=1' @@ -4244,13 +4304,13 @@ class FlickrIE(InfoExtractor): raise ExtractorError(u'Unable to extract video url') video_url = mobj.group(1) + unescapeHTML(mobj.group(2)) - video_title = self._search_regex(r'(.*?)', + video_url = self._html_search_regex(r'(.*?)', data, u'video URL') return [{ @@ -4301,7 +4361,7 @@ class TeamcocoIE(InfoExtractor): 'thumbnail': thumbnail, 'description': video_description, }] - + class XHamsterIE(InfoExtractor): """Information Extractor for xHamster""" _VALID_URL = r'(?:http://)?(?:www.)?xhamster\.com/movies/(?P[0-9]+)/.*\.html' @@ -4310,8 +4370,9 @@ class XHamsterIE(InfoExtractor): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('id') - mrss_url='http://xhamster.com/movies/%s/.html' % video_id + mrss_url = 'http://xhamster.com/movies/%s/.html' % video_id webpage = self._download_webpage(mrss_url, video_id) + mobj = re.search(r'\'srv\': \'(?P[^\']*)\',\s*\'file\': \'(?P[^\']+)\',', webpage) if mobj is None: raise ExtractorError(u'Unable to extract media URL') @@ -4321,39 +4382,33 @@ class XHamsterIE(InfoExtractor): video_url = mobj.group('server')+'/key='+mobj.group('file') video_extension = video_url.split('.')[-1] - mobj = re.search(r'(?P<title>.+?) - xHamster\.com', webpage) - if mobj is None: - raise ExtractorError(u'Unable to extract title') - video_title = unescapeHTML(mobj.group('title')) + video_title = self._html_search_regex(r'(?P<title>.+?) - xHamster\.com', + webpage, u'title') - mobj = re.search(r'Description: (?P[^<]+)', webpage) - if mobj is None: - video_description = u'' - else: - video_description = unescapeHTML(mobj.group('description')) + # Can't see the description anywhere in the UI + # video_description = self._html_search_regex(r'Description: (?P[^<]+)', + # webpage, u'description', fatal=False) + # if video_description: video_description = unescapeHTML(video_description) mobj = re.search(r'hint=\'(?P[0-9]{4})-(?P[0-9]{2})-(?P[0-9]{2}) [0-9]{2}:[0-9]{2}:[0-9]{2} [A-Z]{3,4}\'', webpage) - if mobj is None: - raise ExtractorError(u'Unable to extract upload date') - video_upload_date = mobj.group('upload_date_Y')+mobj.group('upload_date_m')+mobj.group('upload_date_d') - - mobj = re.search(r']+>(?P[^>]+)', webpage) - if mobj is None: - video_uploader_id = u'anonymous' + if mobj: + video_upload_date = mobj.group('upload_date_Y')+mobj.group('upload_date_m')+mobj.group('upload_date_d') else: - video_uploader_id = mobj.group('uploader_id') + video_upload_date = None + self._downloader.report_warning(u'Unable to extract upload date') - mobj = re.search(r'\'image\':\'(?P[^\']+)\'', webpage) - if mobj is None: - raise ExtractorError(u'Unable to extract thumbnail URL') - video_thumbnail = mobj.group('thumbnail') + video_uploader_id = self._html_search_regex(r']+>(?P[^<]+)', + webpage, u'uploader id', default=u'anonymous') + + video_thumbnail = self._search_regex(r'\'image\':\'(?P[^\']+)\'', + webpage, u'thumbnail', fatal=False) return [{ 'id': video_id, 'url': video_url, 'ext': video_extension, 'title': video_title, - 'description': video_description, + # 'description': video_description, 'upload_date': video_upload_date, 'uploader_id': video_uploader_id, 'thumbnail': video_thumbnail @@ -4377,10 +4432,9 @@ class HypemIE(InfoExtractor): cookie = urlh.headers.get('Set-Cookie', '') self.report_extraction(track_id) - mobj = re.search(r'', response, flags=re.MULTILINE|re.DOTALL) - if mobj is None: - raise ExtractorError(u'Unable to extrack tracks') - html_tracks = mobj.group(1).strip() + + html_tracks = self._html_search_regex(r'', + response, u'tracks', flags=re.MULTILINE|re.DOTALL).strip() try: track_list = json.loads(html_tracks) track = track_list[u'tracks'][0] @@ -4410,6 +4464,92 @@ class HypemIE(InfoExtractor): 'artist': artist, }] +class Vbox7IE(InfoExtractor): + """Information Extractor for Vbox7""" + _VALID_URL = r'(?:http://)?(?:www\.)?vbox7\.com/play:([^/]+)' + + def _real_extract(self,url): + mobj = re.match(self._VALID_URL, url) + if mobj is None: + raise ExtractorError(u'Invalid URL: %s' % url) + video_id = mobj.group(1) + + redirect_page, urlh = self._download_webpage_handle(url, video_id) + new_location = self._search_regex(r'window\.location = \'(.*)\';', redirect_page, u'redirect location') + redirect_url = urlh.geturl() + new_location + webpage = self._download_webpage(redirect_url, video_id, u'Downloading redirect page') + + title = self._html_search_regex(r'(.*)', + webpage, u'title').split('/')[0].strip() + + ext = "flv" + info_url = "http://vbox7.com/play/magare.do" + data = compat_urllib_parse.urlencode({'as3':'1','vid':video_id}) + info_request = compat_urllib_request.Request(info_url, data) + info_request.add_header('Content-Type', 'application/x-www-form-urlencoded') + info_response = self._download_webpage(info_request, video_id, u'Downloading info webpage') + if info_response is None: + raise ExtractorError(u'Unable to extract the media url') + (final_url, thumbnail_url) = map(lambda x: x.split('=')[1], info_response.split('&')) + + return [{ + 'id': video_id, + 'url': final_url, + 'ext': ext, + 'title': title, + 'thumbnail': thumbnail_url, + }] + +class GametrailersIE(InfoExtractor): + _VALID_URL = r'http://www.gametrailers.com/(?Pvideos|reviews|full-episodes)/(?P.*?)/(?P.*)' + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + if mobj is None: + raise ExtractorError(u'Invalid URL: %s' % url) + video_id = mobj.group('id') + video_type = mobj.group('type') + webpage = self._download_webpage(url, video_id) + if video_type == 'full-episodes': + mgid_re = r'data-video="(?P<mgid>mgid:.*?)"' + else: + mgid_re = r'data-contentId=\'(?P<mgid>mgid:.*?)\'' + mgid = self._search_regex(mgid_re, webpage, u'mgid') + data = compat_urllib_parse.urlencode({'uri': mgid, 'acceptMethods': 'fms'}) + + info_page = self._download_webpage('http://www.gametrailers.com/feeds/mrss?' + data, + video_id, u'Downloading video info') + links_webpage = self._download_webpage('http://www.gametrailers.com/feeds/mediagen/?' + data, + video_id, u'Downloading video urls info') + + self.report_extraction(video_id) + info_re = r'''<title><!\[CDATA\[(?P<title>.*?)\]\]>.* + .*?)\]\]>.* + .* + (?P.*?).* + ''' + + m_info = re.search(info_re, info_page, re.VERBOSE|re.DOTALL) + if m_info is None: + raise ExtractorError(u'Unable to extract video info') + video_title = m_info.group('title') + video_description = m_info.group('description') + video_thumb = m_info.group('thumb') + + m_urls = list(re.finditer(r'(?P.*)', links_webpage)) + if m_urls is None or len(m_urls) == 0: + raise ExtractError(u'Unable to extrat video url') + # They are sorted from worst to best quality + video_url = m_urls[-1].group('url') + + return {'url': video_url, + 'id': video_id, + 'title': video_title, + # Videos are actually flv not mp4 + 'ext': 'flv', + 'thumbnail': video_thumb, + 'description': video_description, + } def gen_extractors(): """ Return a list of an instance of every supported extractor. @@ -4464,6 +4604,7 @@ def gen_extractors(): SpiegelIE(), LiveLeakIE(), ARDIE(), + ZDFIE(), TumblrIE(), BandcampIE(), RedTubeIE(), @@ -4474,6 +4615,8 @@ def gen_extractors(): TeamcocoIE(), XHamsterIE(), HypemIE(), + Vbox7IE(), + GametrailersIE(), GenericIE() ]