10 import xml.etree.ElementTree
19 from .extractor.common import InfoExtractor, SearchInfoExtractor
21 from .extractor.ard import ARDIE
22 from .extractor.arte import ArteTvIE
23 from .extractor.bandcamp import BandcampIE
24 from .extractor.bliptv import BlipTVIE, BlipTVUserIE
25 from .extractor.comedycentral import ComedyCentralIE
26 from .extractor.collegehumor import CollegeHumorIE
27 from .extractor.dailymotion import DailymotionIE
28 from .extractor.depositfiles import DepositFilesIE
29 from .extractor.eighttracks import EightTracksIE
30 from .extractor.escapist import EscapistIE
31 from .extractor.facebook import FacebookIE
32 from .extractor.funnyordie import FunnyOrDieIE
33 from .extractor.gametrailers import GametrailersIE
34 from .extractor.generic import GenericIE
35 from .extractor.googleplus import GooglePlusIE
36 from .extractor.googlesearch import GoogleSearchIE
37 from .extractor.infoq import InfoQIE
38 from .extractor.justintv import JustinTVIE
39 from .extractor.keek import KeekIE
40 from .extractor.liveleak import LiveLeakIE
41 from .extractor.metacafe import MetacafeIE
42 from .extractor.mixcloud import MixcloudIE
43 from .extractor.mtv import MTVIE
44 from .extractor.myspass import MySpassIE
45 from .extractor.myvideo import MyVideoIE
46 from .extractor.nba import NBAIE
47 from .extractor.statigram import StatigramIE
48 from .extractor.photobucket import PhotobucketIE
49 from .extractor.pornotube import PornotubeIE
50 from .extractor.rbmaradio import RBMARadioIE
51 from .extractor.soundcloud import SoundcloudIE, SoundcloudSetIE
52 from .extractor.spiegel import SpiegelIE
53 from .extractor.stanfordoc import StanfordOpenClassroomIE
54 from .extractor.steam import SteamIE
55 from .extractor.ted import TEDIE
56 from .extractor.tumblr import TumblrIE
57 from .extractor.ustream import UstreamIE
58 from .extractor.vimeo import VimeoIE
59 from .extractor.worldstarhiphop import WorldStarHipHopIE
60 from .extractor.xnxx import XNXXIE
61 from .extractor.xvideos import XVideosIE
62 from .extractor.yahoo import YahooIE, YahooSearchIE
63 from .extractor.youjizz import YouJizzIE
64 from .extractor.youku import YoukuIE
65 from .extractor.youporn import YouPornIE
66 from .extractor.youtube import YoutubeIE, YoutubePlaylistIE, YoutubeSearchIE, YoutubeUserIE, YoutubeChannelIE
67 from .extractor.zdf import ZDFIE
class RedTubeIE(InfoExtractor):
    """Information Extractor for redtube"""
    _VALID_URL = r'(?:http://)?(?:www\.)?redtube\.com/(?P<id>[0-9]+)'

    def _real_extract(self, url):
        """Extract the video URL and title from a redtube video page.

        Returns a one-element list containing the info dictionary.
        Raises ExtractorError when `url` does not match _VALID_URL.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')
        video_extension = 'mp4'
        webpage = self._download_webpage(url, video_id)

        self.report_extraction(video_id)

        # The page embeds the stream in an HTML5 <source> tag.
        video_url = self._html_search_regex(
            r'<source src="(.+?)" type="video/mp4">',
            webpage, u'video URL')

        video_title = self._html_search_regex(
            r'<h1 class="videoTitle slidePanelMovable">(.+?)</h1>',
            webpage, u'title')

        return [{
            'id': video_id,
            'url': video_url,
            'ext': video_extension,
            'title': video_title,
        }]
class InaIE(InfoExtractor):
    """Information Extractor for Ina.fr"""
    _VALID_URL = r'(?:http://)?(?:www\.)?ina\.fr/video/(?P<id>I[0-9]+)/.*'

    def _real_extract(self, url):
        """Extract the MP4 URL and title from the MRSS notice of an Ina video.

        The video page itself is not fetched; the data comes from the
        player.ina.fr MRSS document built from the video id.

        Returns a one-element list containing the info dictionary.
        Raises ExtractorError when `url` does not match _VALID_URL.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            # Without this guard a non-matching URL fails with AttributeError.
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')
        mrss_url = 'http://player.ina.fr/notices/%s.mrss' % video_id
        video_extension = 'mp4'
        webpage = self._download_webpage(mrss_url, video_id)

        self.report_extraction(video_id)

        # Dots in the host name are escaped so they only match literal dots.
        video_url = self._html_search_regex(
            r'<media:player url="(?P<mp4url>http://mp4\.ina\.fr/[^"]+\.mp4)',
            webpage, u'video URL')

        video_title = self._search_regex(
            r'<title><!\[CDATA\[(?P<titre>.*?)]]></title>',
            webpage, u'title')

        return [{
            'id': video_id,
            'url': video_url,
            'ext': video_extension,
            'title': video_title,
        }]
class HowcastIE(InfoExtractor):
    """Information Extractor for Howcast.com"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?howcast\.com/videos/(?P<id>\d+)'

    def _real_extract(self, url):
        """Extract video URL, title, description and thumbnail from Howcast.

        The page is always re-requested through the canonical
        http://www.howcast.com/videos/<id> address.

        Returns a one-element list containing the info dictionary.
        Raises ExtractorError when `url` does not match _VALID_URL.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            # Without this guard a non-matching URL fails with AttributeError.
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')
        webpage_url = 'http://www.howcast.com/videos/' + video_id
        webpage = self._download_webpage(webpage_url, video_id)

        self.report_extraction(video_id)

        video_url = self._search_regex(
            r'\'?file\'?: "(http://mobile-media\.howcast\.com/[0-9]+\.mp4)',
            webpage, u'video URL')

        # The meta tags may quote their content with either quote style.
        video_title = self._html_search_regex(
            r'<meta content=(?:"([^"]+)"|\'([^\']+)\') property=\'og:title\'',
            webpage, u'title')

        video_description = self._html_search_regex(
            r'<meta content=(?:"([^"]+)"|\'([^\']+)\') name=\'description\'',
            webpage, u'description', fatal=False)

        thumbnail = self._html_search_regex(
            r'<meta content=\'(.+?)\' property=\'og:image\'',
            webpage, u'thumbnail', fatal=False)

        return [{
            'id': video_id,
            'url': video_url,
            # NOTE(review): extension taken from the ".mp4" suffix required
            # by the video URL pattern above -- confirm.
            'ext': 'mp4',
            'title': video_title,
            'description': video_description,
            'thumbnail': thumbnail,
        }]
class VineIE(InfoExtractor):
    """Information Extractor for Vine.co"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?vine\.co/v/(?P<id>\w+)'

    def _real_extract(self, url):
        """Extract stream URL, title, thumbnail and uploader from a Vine page.

        The page is always re-requested through https://vine.co/v/<id>.

        Returns a one-element list containing the info dictionary.
        Raises ExtractorError when `url` does not match _VALID_URL.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            # Without this guard a non-matching URL fails with AttributeError.
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')
        webpage_url = 'https://vine.co/v/' + video_id
        webpage = self._download_webpage(webpage_url, video_id)

        self.report_extraction(video_id)

        video_url = self._html_search_regex(
            r'<meta property="twitter:player:stream" content="(.+?)"',
            webpage, u'video URL')

        video_title = self._html_search_regex(
            r'<meta property="og:title" content="(.+?)"',
            webpage, u'title')

        # The optional second group swallows a query string so that it does
        # not end up in the first (captured) group.
        thumbnail = self._html_search_regex(
            r'<meta property="og:image" content="(.+?)(\?.*?)?"',
            webpage, u'thumbnail', fatal=False)

        # DOTALL: the <h2> may be on a different line than the user <div>.
        uploader = self._html_search_regex(
            r'<div class="user">.*?<h2>(.+?)</h2>',
            webpage, u'uploader', fatal=False, flags=re.DOTALL)

        return [{
            'id': video_id,
            'url': video_url,
            # NOTE(review): extension value is not derived from the page;
            # assumed to be MP4 -- confirm.
            'ext': 'mp4',
            'title': video_title,
            'thumbnail': thumbnail,
            'uploader': uploader,
        }]
class FlickrIE(InfoExtractor):
    """Information Extractor for Flickr videos"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?flickr\.com/photos/(?P<uploader_id>[\w\-_@]+)/(?P<id>\d+).*'

    def _real_extract(self, url):
        """Extract a Flickr video through its two-step playlist lookup.

        Steps: read the photo secret from the photo page, request the first
        XML document to obtain a node id, then request the playlist XML that
        carries the stream location.

        Returns a one-element list containing the info dictionary.
        Raises ExtractorError when `url` does not match _VALID_URL or when
        the playlist contains no <STREAM> entry.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            # Without this guard a non-matching URL fails with AttributeError.
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')
        video_uploader_id = mobj.group('uploader_id')
        webpage_url = 'http://www.flickr.com/photos/' + video_uploader_id + '/' + video_id
        webpage = self._download_webpage(webpage_url, video_id)

        secret = self._search_regex(r"photo_secret: '(\w+)'", webpage, u'secret')

        first_url = 'https://secure.flickr.com/apps/video/video_mtl_xml.gne?v=x&photo_id=' + video_id + '&secret=' + secret + '&bitrate=700&target=_self'
        first_xml = self._download_webpage(first_url, video_id, 'Downloading first data webpage')

        node_id = self._html_search_regex(
            r'<Item id="id">(\d+-\d+)</Item>',
            first_xml, u'node_id')

        second_url = 'https://secure.flickr.com/video_playlist.gne?node_id=' + node_id + '&tech=flash&mode=playlist&bitrate=700&secret=' + secret + '&rd=video.yahoo.com&noad=1'
        second_xml = self._download_webpage(second_url, video_id, 'Downloading second data webpage')

        self.report_extraction(video_id)

        stream = re.search(r'<STREAM APP="(.+?)" FULLPATH="(.+?)"', second_xml)
        if stream is None:
            raise ExtractorError(u'Unable to extract video url')
        # The final URL is APP followed by the HTML-unescaped FULLPATH.
        video_url = stream.group(1) + unescapeHTML(stream.group(2))

        video_title = self._html_search_regex(
            r'<meta property="og:title" content=(?:"([^"]+)"|\'([^\']+)\')',
            webpage, u'video title')

        video_description = self._html_search_regex(
            r'<meta property="og:description" content=(?:"([^"]+)"|\'([^\']+)\')',
            webpage, u'description', fatal=False)

        thumbnail = self._html_search_regex(
            r'<meta property="og:image" content=(?:"([^"]+)"|\'([^\']+)\')',
            webpage, u'thumbnail', fatal=False)

        return [{
            'id': video_id,
            'url': video_url,
            # NOTE(review): extension value is not derived from the playlist;
            # assumed to be MP4 -- confirm.
            'ext': 'mp4',
            'title': video_title,
            'description': video_description,
            'thumbnail': thumbnail,
            'uploader_id': video_uploader_id,
        }]
class TeamcocoIE(InfoExtractor):
    """Information Extractor for teamcoco.com"""
    _VALID_URL = r'http://teamcoco\.com/video/(?P<url_title>.*)'

    def _real_extract(self, url):
        """Extract a Teamcoco video via its cvp/2.0 XML data document.

        The numeric video id is read from the page; until then the URL title
        is used as the identifier for download messages.

        Returns a one-element list containing the info dictionary.
        Raises ExtractorError when `url` does not match _VALID_URL.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        url_title = mobj.group('url_title')
        webpage = self._download_webpage(url, url_title)

        video_id = self._html_search_regex(
            r'<article class="video" data-id="(\d+?)"',
            webpage, u'video id')

        self.report_extraction(video_id)

        video_title = self._html_search_regex(
            r'<meta property="og:title" content="(.+?)"',
            webpage, u'title')

        thumbnail = self._html_search_regex(
            r'<meta property="og:image" content="(.+?)"',
            webpage, u'thumbnail', fatal=False)

        video_description = self._html_search_regex(
            r'<meta property="og:description" content="(.*?)"',
            webpage, u'description', fatal=False)

        data_url = 'http://teamcoco.com/cvp/2.0/%s.xml' % video_id
        data = self._download_webpage(data_url, video_id, 'Downloading data webpage')

        # Only the <file type="high"> entry of the data document is used.
        video_url = self._html_search_regex(
            r'<file type="high".*?>(.*?)</file>',
            data, u'video URL')

        return [{
            'id': video_id,
            'url': video_url,
            # NOTE(review): extension value is not derived from the data
            # document; assumed to be MP4 -- confirm.
            'ext': 'mp4',
            'title': video_title,
            'thumbnail': thumbnail,
            'description': video_description,
        }]
class XHamsterIE(InfoExtractor):
    """Information Extractor for xHamster"""
    # The dot after "www" is escaped so it only matches a literal dot.
    _VALID_URL = r'(?:http://)?(?:www\.)?xhamster\.com/movies/(?P<id>[0-9]+)/.*\.html'

    def _real_extract(self, url):
        """Extract media URL, title, upload date, uploader and thumbnail.

        The page is re-requested through the shortened
        http://xhamster.com/movies/<id>/.html address.

        Returns a one-element list containing the info dictionary.
        Raises ExtractorError when `url` does not match _VALID_URL or when
        the player configuration ('srv' / 'file') is not found.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            # Without this guard a non-matching URL fails with AttributeError.
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')
        mrss_url = 'http://xhamster.com/movies/%s/.html' % video_id
        webpage = self._download_webpage(mrss_url, video_id)

        media = re.search(
            r'\'srv\': \'(?P<server>[^\']*)\',\s*\'file\': \'(?P<file>[^\']+)\',',
            webpage)
        if media is None:
            raise ExtractorError(u'Unable to extract media URL')
        if not media.group('server'):
            # Empty server: 'file' already holds the (URL-quoted) full URL.
            video_url = compat_urllib_parse.unquote(media.group('file'))
        else:
            video_url = media.group('server') + '/key=' + media.group('file')
        video_extension = video_url.split('.')[-1]

        video_title = self._html_search_regex(
            r'<title>(?P<title>.+?) - xHamster\.com</title>',
            webpage, u'title')

        # No description is extracted: it is not visible anywhere in the UI.

        date_match = re.search(
            r'hint=\'(?P<upload_date_Y>[0-9]{4})-(?P<upload_date_m>[0-9]{2})-(?P<upload_date_d>[0-9]{2}) [0-9]{2}:[0-9]{2}:[0-9]{2} [A-Z]{3,4}\'',
            webpage)
        if date_match:
            # Concatenate to YYYYMMDD.
            video_upload_date = (date_match.group('upload_date_Y')
                                 + date_match.group('upload_date_m')
                                 + date_match.group('upload_date_d'))
        else:
            video_upload_date = None
            self._downloader.report_warning(u'Unable to extract upload date')

        video_uploader_id = self._html_search_regex(
            r'<a href=\'/user/[^>]+>(?P<uploader_id>[^<]+)',
            webpage, u'uploader id', default=u'anonymous')

        video_thumbnail = self._search_regex(
            r'\'image\':\'(?P<thumbnail>[^\']+)\'',
            webpage, u'thumbnail', fatal=False)

        return [{
            'id': video_id,
            'url': video_url,
            'ext': video_extension,
            'title': video_title,
            'upload_date': video_upload_date,
            'uploader_id': video_uploader_id,
            'thumbnail': video_thumbnail
        }]
class HypemIE(InfoExtractor):
    """Information Extractor for hypem"""
    _VALID_URL = r'(?:http://)?(?:www\.)?hypem\.com/track/([^/]+)/([^/]+)'

    def _real_extract(self, url):
        """Extract the audio URL, title and artist of a hypem track.

        The track page is requested with 'ax' and 'ts' query parameters; the
        cookie it sets is replayed on the follow-up request to the
        serve/source endpoint, which returns the final media URL as JSON.

        Returns a one-element list containing the info dictionary.
        Raises ExtractorError when `url` does not match _VALID_URL or when
        either JSON document cannot be used.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        track_id = mobj.group(1)

        data = {'ax': 1, 'ts': time.time()}
        data_encoded = compat_urllib_parse.urlencode(data)
        complete_url = url + "?" + data_encoded
        request = compat_urllib_request.Request(complete_url)
        response, urlh = self._download_webpage_handle(request, track_id, u'Downloading webpage with the url')
        cookie = urlh.headers.get('Set-Cookie', '')

        self.report_extraction(track_id)

        html_tracks = self._html_search_regex(
            r'<script type="application/json" id="displayList-data">(.*?)</script>',
            response, u'tracks', flags=re.MULTILINE|re.DOTALL).strip()
        try:
            track_list = json.loads(html_tracks)
            track = track_list[u'tracks'][0]
        except (ValueError, KeyError, IndexError):
            # Also covers a valid document with a missing or empty track list.
            raise ExtractorError(u'Hypemachine contained invalid JSON.')

        # NOTE(review): assumes the track entry exposes a "key" field that
        # feeds the serve URL below -- confirm.
        key = track[u"key"]
        track_id = track[u"id"]
        artist = track[u"artist"]
        title = track[u"song"]

        serve_url = "http://hypem.com/serve/source/%s/%s" % (compat_str(track_id), compat_str(key))
        request = compat_urllib_request.Request(serve_url, "" , {'Content-Type': 'application/json'})
        request.add_header('cookie', cookie)
        song_data_json = self._download_webpage(request, track_id, u'Downloading metadata')
        try:
            song_data = json.loads(song_data_json)
        except ValueError:
            raise ExtractorError(u'Hypemachine contained invalid JSON.')
        final_url = song_data[u"url"]

        return [{
            'id': track_id,
            'url': final_url,
            # NOTE(review): extension value is not derived from the response;
            # assumed to be MP3 -- confirm.
            'ext': "mp3",
            'title': title,
            'artist': artist,
        }]
class Vbox7IE(InfoExtractor):
    """Information Extractor for Vbox7"""
    _VALID_URL = r'(?:http://)?(?:www\.)?vbox7\.com/play:([^/]+)'

    def _real_extract(self, url):
        """Extract media URL, title and thumbnail of a Vbox7 video.

        The first page only contains a JavaScript redirect; its target is
        appended to the URL actually reached and fetched to read the title.
        The media and thumbnail URLs come from a POST to magare.do, whose
        response is parsed as two '&'-separated name=value fields.

        Returns a one-element list containing the info dictionary.
        Raises ExtractorError when `url` does not match _VALID_URL or when
        the info response cannot be parsed.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group(1)

        redirect_page, urlh = self._download_webpage_handle(url, video_id)
        new_location = self._search_regex(r'window\.location = \'(.*)\';', redirect_page, u'redirect location')
        redirect_url = urlh.geturl() + new_location
        webpage = self._download_webpage(redirect_url, video_id, u'Downloading redirect page')

        # Only the part of <title> before the first '/' is kept.
        title = self._html_search_regex(
            r'<title>(.*)</title>',
            webpage, u'title').split('/')[0].strip()

        # NOTE(review): extension value is not derived from the response;
        # assumed to be FLV -- confirm.
        ext = "flv"
        info_url = "http://vbox7.com/play/magare.do"
        data = compat_urllib_parse.urlencode({'as3': '1', 'vid': video_id})
        info_request = compat_urllib_request.Request(info_url, data)
        info_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
        info_response = self._download_webpage(info_request, video_id, u'Downloading info webpage')
        if info_response is None:
            raise ExtractorError(u'Unable to extract the media url')
        try:
            # Split on the first '=' only, so values that themselves contain
            # '=' (e.g. URLs with a query string) are kept whole.
            (final_url, thumbnail_url) = [
                field.split('=', 1)[1] for field in info_response.split('&')]
        except (IndexError, ValueError):
            # A field without '=' or an unexpected number of fields.
            raise ExtractorError(u'Unable to extract the media url')

        return [{
            'id': video_id,
            'url': final_url,
            'ext': ext,
            'title': title,
            'thumbnail': thumbnail_url,
        }]
453 def gen_extractors():
454 """ Return a list of an instance of every supported extractor.
455 The order does matter; the first extractor matched is the one handling the URL.
483 StanfordOpenClassroomIE(),
def get_info_extractor(ie_name):
    """Look up an info extractor class in this module by its short name.

    The suffix 'IE' is appended to `ie_name` and the result is used as a key
    into the module globals; an unknown name raises KeyError.
    """
    module_namespace = globals()
    class_name = ie_name + 'IE'
    return module_namespace[class_name]