10 import xml.etree.ElementTree
19 from .extractor.common import InfoExtractor, SearchInfoExtractor
21 from .extractor.ard import ARDIE
22 from .extractor.arte import ArteTvIE
23 from .extractor.bandcamp import BandcampIE
24 from .extractor.bliptv import BlipTVIE, BlipTVUserIE
25 from .extractor.comedycentral import ComedyCentralIE
26 from .extractor.collegehumor import CollegeHumorIE
27 from .extractor.dailymotion import DailymotionIE
28 from .extractor.depositfiles import DepositFilesIE
29 from .extractor.eighttracks import EightTracksIE
30 from .extractor.escapist import EscapistIE
31 from .extractor.facebook import FacebookIE
32 from .extractor.funnyordie import FunnyOrDieIE
33 from .extractor.gametrailers import GametrailersIE
34 from .extractor.generic import GenericIE
35 from .extractor.googleplus import GooglePlusIE
36 from .extractor.googlesearch import GoogleSearchIE
37 from .extractor.infoq import InfoQIE
38 from .extractor.justintv import JustinTVIE
39 from .extractor.keek import KeekIE
40 from .extractor.liveleak import LiveLeakIE
41 from .extractor.metacafe import MetacafeIE
42 from .extractor.mixcloud import MixcloudIE
43 from .extractor.mtv import MTVIE
44 from .extractor.myspass import MySpassIE
45 from .extractor.myvideo import MyVideoIE
46 from .extractor.nba import NBAIE
47 from .extractor.statigram import StatigramIE
48 from .extractor.photobucket import PhotobucketIE
49 from .extractor.pornotube import PornotubeIE
50 from .extractor.rbmaradio import RBMARadioIE
51 from .extractor.soundcloud import SoundcloudIE, SoundcloudSetIE
52 from .extractor.spiegel import SpiegelIE
53 from .extractor.stanfordoc import StanfordOpenClassroomIE
54 from .extractor.steam import SteamIE
55 from .extractor.ted import TEDIE
56 from .extractor.tumblr import TumblrIE
57 from .extractor.ustream import UstreamIE
58 from .extractor.vbox7 import Vbox7IE
59 from .extractor.vimeo import VimeoIE
60 from .extractor.vine import VineIE
61 from .extractor.worldstarhiphop import WorldStarHipHopIE
62 from .extractor.xnxx import XNXXIE
63 from .extractor.xvideos import XVideosIE
64 from .extractor.yahoo import YahooIE, YahooSearchIE
65 from .extractor.youjizz import YouJizzIE
66 from .extractor.youku import YoukuIE
67 from .extractor.youporn import YouPornIE
68 from .extractor.youtube import YoutubeIE, YoutubePlaylistIE, YoutubeSearchIE, YoutubeUserIE, YoutubeChannelIE
69 from .extractor.zdf import ZDFIE
# Extractor for redtube.com video pages.
# NOTE(review): the embedded original line numbers jump (110->112, 123->129),
# so this listing omits some lines; the comments describe only what is visible.
105 class RedTubeIE(InfoExtractor):
106 """Information Extractor for redtube"""
# Scheme and "www." are optional; the numeric path segment is captured as "id".
107 _VALID_URL = r'(?:http://)?(?:www\.)?redtube\.com/(?P<id>[0-9]+)'
# Matches the URL, downloads the page and scrapes the mp4 URL and title from its HTML.
109 def _real_extract(self,url):
110 mobj = re.match(self._VALID_URL, url)
# Presumably guarded by a "mobj is None" check that is not shown here -- confirm.
112 raise ExtractorError(u'Invalid URL: %s' % url)
114 video_id = mobj.group('id')
# Fixed extension: the regex below only accepts the type="video/mp4" <source> tag.
115 video_extension = 'mp4'
116 webpage = self._download_webpage(url, video_id)
118 self.report_extraction(video_id)
120 video_url = self._html_search_regex(r'<source src="(.+?)" type="video/mp4">',
121 webpage, u'video URL')
123 video_title = self._html_search_regex('<h1 class="videoTitle slidePanelMovable">(.+?)</h1>',
# Presumably entries of the result dict; its opening lines are not shown -- confirm.
129 'ext': video_extension,
130 'title': video_title,
# Extractor for ina.fr videos; it reads the player's MRSS notice instead of the HTML page.
# NOTE(review): the embedded original line numbers jump (150->156), so this
# listing omits some lines; the comments describe only what is visible.
133 class InaIE(InfoExtractor):
134 """Information Extractor for Ina.fr"""
# The id is the "I" + digits path segment that follows /video/.
135 _VALID_URL = r'(?:http://)?(?:www\.)?ina\.fr/video/(?P<id>I[0-9]+)/.*'
137 def _real_extract(self,url):
138 mobj = re.match(self._VALID_URL, url)
# NOTE(review): no None-check on mobj is visible before this .group() call -- confirm.
140 video_id = mobj.group('id')
# The MRSS notice URL is built from the id; the original page URL is not fetched.
141 mrss_url='http://player.ina.fr/notices/%s.mrss' % video_id
142 video_extension = 'mp4'
143 webpage = self._download_webpage(mrss_url, video_id)
145 self.report_extraction(video_id)
# NOTE(review): the dots in "mp4.ina.fr" are unescaped, so each matches any character.
147 video_url = self._html_search_regex(r'<media:player url="(?P<mp4url>http://mp4.ina.fr/[^"]+\.mp4)',
148 webpage, u'video URL')
# The title is taken from the CDATA section of the <title> element.
150 video_title = self._search_regex(r'<title><!\[CDATA\[(?P<titre>.*?)]]></title>',
# Presumably entries of the result dict; its opening lines are not shown -- confirm.
156 'ext': video_extension,
157 'title': video_title,
# Extractor for howcast.com video pages.
# NOTE(review): the embedded original line numbers jump (176->179, 183->189),
# so this listing omits some lines; the comments describe only what is visible.
160 class HowcastIE(InfoExtractor):
161 """Information Extractor for Howcast.com"""
# http/https scheme and "www." are optional; the digits after /videos/ are the id.
162 _VALID_URL = r'(?:https?://)?(?:www\.)?howcast\.com/videos/(?P<id>\d+)'
164 def _real_extract(self, url):
165 mobj = re.match(self._VALID_URL, url)
# NOTE(review): no None-check on mobj is visible before this .group() call -- confirm.
167 video_id = mobj.group('id')
# The page is fetched from a URL rebuilt from the id, not from the URL passed in.
168 webpage_url = 'http://www.howcast.com/videos/' + video_id
169 webpage = self._download_webpage(webpage_url, video_id)
171 self.report_extraction(video_id)
# Only mobile-media.howcast.com .mp4 URLs with a numeric file name are accepted.
173 video_url = self._search_regex(r'\'?file\'?: "(http://mobile-media\.howcast\.com/[0-9]+\.mp4)',
174 webpage, u'video URL')
# The content attribute may be double- or single-quoted, hence two alternative groups.
176 video_title = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') property=\'og:title\'',
179 video_description = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') name=\'description\'',
180 webpage, u'description', fatal=False)
# Unlike title/description, this pattern accepts only a single-quoted content value.
182 thumbnail = self._html_search_regex(r'<meta content=\'(.+?)\' property=\'og:image\'',
183 webpage, u'thumbnail', fatal=False)
# Presumably entries of the result dict; its opening lines are not shown -- confirm.
189 'title': video_title,
190 'description': video_description,
191 'thumbnail': thumbnail,
# Extractor for Flickr video pages. The visible code makes three requests:
# the photo page (for the secret and metadata), a first XML document (for the
# node id) and a second XML playlist (for the stream location).
# NOTE(review): the embedded original line numbers jump (220->222, 232->238),
# so this listing omits some lines; the comments describe only what is visible.
195 class FlickrIE(InfoExtractor):
196 """Information Extractor for Flickr videos"""
# Captures the uploader id and the numeric photo id from /photos/<uploader>/<id>.
197 _VALID_URL = r'(?:https?://)?(?:www\.)?flickr\.com/photos/(?P<uploader_id>[\w\-_@]+)/(?P<id>\d+).*'
199 def _real_extract(self, url):
200 mobj = re.match(self._VALID_URL, url)
# NOTE(review): no None-check on mobj is visible before these .group() calls -- confirm.
202 video_id = mobj.group('id')
203 video_uploader_id = mobj.group('uploader_id')
# The page is fetched from a URL rebuilt from the captured parts, not from the URL passed in.
204 webpage_url = 'http://www.flickr.com/photos/' + video_uploader_id + '/' + video_id
205 webpage = self._download_webpage(webpage_url, video_id)
# The secret scraped from the page is a query parameter of both follow-up requests.
207 secret = self._search_regex(r"photo_secret: '(\w+)'", webpage, u'secret')
209 first_url = 'https://secure.flickr.com/apps/video/video_mtl_xml.gne?v=x&photo_id=' + video_id + '&secret=' + secret + '&bitrate=700&target=_self'
210 first_xml = self._download_webpage(first_url, video_id, 'Downloading first data webpage')
# The node id ("digits-digits") from the first document selects the playlist below.
212 node_id = self._html_search_regex(r'<Item id="id">(\d+-\d+)</Item>',
213 first_xml, u'node_id')
215 second_url = 'https://secure.flickr.com/video_playlist.gne?node_id=' + node_id + '&tech=flash&mode=playlist&bitrate=700&secret=' + secret + '&rd=video.yahoo.com&noad=1'
216 second_xml = self._download_webpage(second_url, video_id, 'Downloading second data webpage')
218 self.report_extraction(video_id)
220 mobj = re.search(r'<STREAM APP="(.+?)" FULLPATH="(.+?)"', second_xml)
# Presumably guarded by a "mobj is None" check that is not shown here -- confirm.
222 raise ExtractorError(u'Unable to extract video url')
# The final URL is APP followed by the HTML-unescaped FULLPATH attribute.
223 video_url = mobj.group(1) + unescapeHTML(mobj.group(2))
# Title, description and thumbnail come from the Open Graph <meta> tags of the photo page.
225 video_title = self._html_search_regex(r'<meta property="og:title" content=(?:"([^"]+)"|\'([^\']+)\')',
226 webpage, u'video title')
228 video_description = self._html_search_regex(r'<meta property="og:description" content=(?:"([^"]+)"|\'([^\']+)\')',
229 webpage, u'description', fatal=False)
231 thumbnail = self._html_search_regex(r'<meta property="og:image" content=(?:"([^"]+)"|\'([^\']+)\')',
232 webpage, u'thumbnail', fatal=False)
# Presumably entries of the result dict; its opening lines are not shown -- confirm.
238 'title': video_title,
239 'description': video_description,
240 'thumbnail': thumbnail,
241 'uploader_id': video_uploader_id,
# Extractor for teamcoco.com video pages.
# NOTE(review): the embedded original line numbers jump (248->250, 259->262,
# 271->278), so this listing omits some lines; the comments describe only what
# is visible.
244 class TeamcocoIE(InfoExtractor):
# Unlike the other extractors here, the scheme is mandatory and no "www." prefix is accepted.
245 _VALID_URL = r'http://teamcoco\.com/video/(?P<url_title>.*)'
247 def _real_extract(self, url):
248 mobj = re.match(self._VALID_URL, url)
# Presumably guarded by a "mobj is None" check that is not shown here -- confirm.
250 raise ExtractorError(u'Invalid URL: %s' % url)
# The URL slug is passed as the identifier of the first download; the numeric id is not known yet.
251 url_title = mobj.group('url_title')
252 webpage = self._download_webpage(url, url_title)
# The numeric id comes from the data-id attribute of the <article class="video"> tag.
254 video_id = self._html_search_regex(r'<article class="video" data-id="(\d+?)"',
255 webpage, u'video id')
257 self.report_extraction(video_id)
259 video_title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',
262 thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)"',
263 webpage, u'thumbnail', fatal=False)
265 video_description = self._html_search_regex(r'<meta property="og:description" content="(.*?)"',
266 webpage, u'description', fatal=False)
# A second XML document, addressed by the numeric id, is downloaded into `data`.
268 data_url = 'http://teamcoco.com/cvp/2.0/%s.xml' % video_id
269 data = self._download_webpage(data_url, video_id, 'Downloading data webpage')
# Selects the <file type="high"> entry; the remaining arguments of this call are not shown.
271 video_url = self._html_search_regex(r'<file type="high".*?>(.*?)</file>',
# Presumably entries of the result dict; its opening lines are not shown -- confirm.
278 'title': video_title,
279 'thumbnail': thumbnail,
280 'description': video_description,
# Extractor for xhamster.com movie pages.
# NOTE(review): the embedded original line numbers jump (294->296, 298->300,
# 311->313->315, 322->327), so this listing omits some lines (apparently the
# if/else lines around the branches below); the comments describe only what is
# visible.
283 class XHamsterIE(InfoExtractor):
284 """Information Extractor for xHamster"""
# NOTE(review): the "." in "(?:www.)?" is unescaped and matches any character;
# the other extractors in this file write "www\.".
285 _VALID_URL = r'(?:http://)?(?:www.)?xhamster\.com/movies/(?P<id>[0-9]+)/.*\.html'
287 def _real_extract(self,url):
288 mobj = re.match(self._VALID_URL, url)
# NOTE(review): no None-check on mobj is visible before this .group() call -- confirm.
290 video_id = mobj.group('id')
# Despite its name, mrss_url is the movie page URL rebuilt from the id with an empty title slug.
291 mrss_url = 'http://xhamster.com/movies/%s/.html' % video_id
292 webpage = self._download_webpage(mrss_url, video_id)
# The page carries a 'srv' (may be empty) and a 'file' value in single-quoted key/value pairs.
294 mobj = re.search(r'\'srv\': \'(?P<server>[^\']*)\',\s*\'file\': \'(?P<file>[^\']+)\',', webpage)
# Presumably guarded by a "mobj is None" check that is not shown here -- confirm.
296 raise ExtractorError(u'Unable to extract media URL')
# With an empty server, 'file' is itself a URL-quoted URL; the next assignment
# (server + '/key=' + file) is presumably the else branch -- confirm.
297 if len(mobj.group('server')) == 0:
298 video_url = compat_urllib_parse.unquote(mobj.group('file'))
300 video_url = mobj.group('server')+'/key='+mobj.group('file')
# The extension is whatever follows the last "." of the media URL.
301 video_extension = video_url.split('.')[-1]
303 video_title = self._html_search_regex(r'<title>(?P<title>.+?) - xHamster\.com</title>',
306 # Can't see the description anywhere in the UI
307 # video_description = self._html_search_regex(r'<span>Description: </span>(?P<description>[^<]+)',
308 # webpage, u'description', fatal=False)
309 # if video_description: video_description = unescapeHTML(video_description)
# The upload date is read from a hint='YYYY-MM-DD HH:MM:SS TZ' attribute and joined as YYYYMMDD.
311 mobj = re.search(r'hint=\'(?P<upload_date_Y>[0-9]{4})-(?P<upload_date_m>[0-9]{2})-(?P<upload_date_d>[0-9]{2}) [0-9]{2}:[0-9]{2}:[0-9]{2} [A-Z]{3,4}\'', webpage)
313 video_upload_date = mobj.group('upload_date_Y')+mobj.group('upload_date_m')+mobj.group('upload_date_d')
# Presumably the fallback branch when the hint is absent (condition not shown): no date, warn only.
315 video_upload_date = None
316 self._downloader.report_warning(u'Unable to extract upload date')
# Falls back to u'anonymous' when no /user/ link is found.
318 video_uploader_id = self._html_search_regex(r'<a href=\'/user/[^>]+>(?P<uploader_id>[^<]+)',
319 webpage, u'uploader id', default=u'anonymous')
321 video_thumbnail = self._search_regex(r'\'image\':\'(?P<thumbnail>[^\']+)\'',
322 webpage, u'thumbnail', fatal=False)
# Presumably entries of the result dict; its opening lines are not shown -- confirm.
327 'ext': video_extension,
328 'title': video_title,
329 # 'description': video_description,
330 'upload_date': video_upload_date,
331 'uploader_id': video_uploader_id,
332 'thumbnail': video_thumbnail
# Extractor for hypem.com track pages. The visible code fetches the track page,
# keeps the cookie it sets, reads the embedded JSON track list, then asks the
# /serve/source/ endpoint for the media URL.
# NOTE(review): the embedded original line numbers jump (340->342, 355->357,
# 358->360, 360->363, 370->372->374, and the listing stops at 375), so some
# lines -- apparently including the try/except lines and the return -- are not
# shown; the comments describe only what is visible.
335 class HypemIE(InfoExtractor):
336 """Information Extractor for hypem"""
# Two unnamed groups: the path segment after /track/ (used as the id) and the one after it.
337 _VALID_URL = r'(?:http://)?(?:www\.)?hypem\.com/track/([^/]+)/([^/]+)'
339 def _real_extract(self, url):
340 mobj = re.match(self._VALID_URL, url)
# Presumably guarded by a "mobj is None" check that is not shown here -- confirm.
342 raise ExtractorError(u'Invalid URL: %s' % url)
343 track_id = mobj.group(1)
# The page is requested with ax=1 and the current timestamp appended as a query string.
345 data = { 'ax': 1, 'ts': time.time() }
346 data_encoded = compat_urllib_parse.urlencode(data)
347 complete_url = url + "?" + data_encoded
348 request = compat_urllib_request.Request(complete_url)
349 response, urlh = self._download_webpage_handle(request, track_id, u'Downloading webpage with the url')
# The Set-Cookie header of this response is replayed on the metadata request below.
350 cookie = urlh.headers.get('Set-Cookie', '')
352 self.report_extraction(track_id)
# The track list is JSON embedded in the <script id="displayList-data"> element.
354 html_tracks = self._html_search_regex(r'<script type="application/json" id="displayList-data">(.*?)</script>',
355 response, u'tracks', flags=re.MULTILINE|re.DOTALL).strip()
# Only the first entry of the "tracks" array is used.
357 track_list = json.loads(html_tracks)
358 track = track_list[u'tracks'][0]
# Presumably the handler for a JSON decoding failure (try/except lines not shown) -- confirm.
360 raise ExtractorError(u'Hypemachine contained invalid JSON.')
# track_id is overwritten with the id reported by the JSON.
363 track_id = track[u"id"]
364 artist = track[u"artist"]
365 title = track[u"song"]
# NOTE(review): `key` is used here but its assignment is not visible in this listing -- confirm.
367 serve_url = "http://hypem.com/serve/source/%s/%s" % (compat_str(track_id), compat_str(key))
# An empty string is passed as the request data, with a JSON Content-Type header.
368 request = compat_urllib_request.Request(serve_url, "" , {'Content-Type': 'application/json'})
369 request.add_header('cookie', cookie)
370 song_data_json = self._download_webpage(request, track_id, u'Downloading metadata')
372 song_data = json.loads(song_data_json)
# Presumably the handler for a JSON decoding failure (try/except lines not shown) -- confirm.
374 raise ExtractorError(u'Hypemachine contained invalid JSON.')
# The media URL is the "url" field of the metadata response.
375 final_url = song_data[u"url"]
# NOTE(review): only the signature, the opening of the docstring and a single
# list entry (original line 417) are visible in this listing; the rest of the
# body is not shown, so nothing beyond the docstring's own statement is
# documented here.
387 def gen_extractors():
388 """ Return a list of an instance of every supported extractor.
389 The order does matter; the first extractor matched is the one handling the URL.
417 StanfordOpenClassroomIE(),
def get_info_extractor(ie_name):
    """Return the info extractor class registered under ``ie_name``.

    The class is looked up in this module's global namespace under the key
    ``ie_name + 'IE'``; a ``KeyError`` propagates if no such name exists.
    """
    module_namespace = globals()
    class_name = ie_name + 'IE'
    return module_namespace[class_name]