10 import xml.etree.ElementTree
19 from .extractor.common import InfoExtractor, SearchInfoExtractor
21 from .extractor.ard import ARDIE
22 from .extractor.arte import ArteTvIE
23 from .extractor.bliptv import BlipTVIE, BlipTVUserIE
24 from .extractor.comedycentral import ComedyCentralIE
25 from .extractor.collegehumor import CollegeHumorIE
26 from .extractor.dailymotion import DailymotionIE
27 from .extractor.depositfiles import DepositFilesIE
28 from .extractor.escapist import EscapistIE
29 from .extractor.facebook import FacebookIE
30 from .extractor.funnyordie import FunnyOrDieIE
31 from .extractor.gametrailers import GametrailersIE
32 from .extractor.generic import GenericIE
33 from .extractor.googleplus import GooglePlusIE
34 from .extractor.googlesearch import GoogleSearchIE
35 from .extractor.infoq import InfoQIE
36 from .extractor.justintv import JustinTVIE
37 from .extractor.metacafe import MetacafeIE
38 from .extractor.mixcloud import MixcloudIE
39 from .extractor.mtv import MTVIE
40 from .extractor.myvideo import MyVideoIE
41 from .extractor.nba import NBAIE
42 from .extractor.statigram import StatigramIE
43 from .extractor.photobucket import PhotobucketIE
44 from .extractor.soundcloud import SoundcloudIE, SoundcloudSetIE
45 from .extractor.stanfordoc import StanfordOpenClassroomIE
46 from .extractor.steam import SteamIE
47 from .extractor.ted import TEDIE
48 from .extractor.ustream import UstreamIE
49 from .extractor.vimeo import VimeoIE
50 from .extractor.worldstarhiphop import WorldStarHipHopIE
51 from .extractor.xnxx import XNXXIE
52 from .extractor.xvideos import XVideosIE
53 from .extractor.yahoo import YahooIE, YahooSearchIE
54 from .extractor.youku import YoukuIE
55 from .extractor.youtube import YoutubeIE, YoutubePlaylistIE, YoutubeSearchIE, YoutubeUserIE, YoutubeChannelIE
56 from .extractor.zdf import ZDFIE
# Extractor for rbmaradio.com show pages. _VALID_URL captures the final path
# segment of /shows/<slug> as the named group `videoID`.
# NOTE(review): the embedded line numbers in this listing skip values, so some
# original lines (e.g. the `try:` that pairs with the `except` below and the
# opening of the result dict) are not visible here.
75 class RBMARadioIE(InfoExtractor):
76 _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'
# Download the show page, parse the JSON assigned to `gon.show`, and build the
# info fields from it.
78 def _real_extract(self, url):
# NOTE(review): `m` is used without a None check; re.match() returns None when
# the URL does not match, which would surface as an AttributeError.
79 m = re.match(self._VALID_URL, url)
80 video_id = m.group('videoID')
82 webpage = self._download_webpage(url, video_id)
# The regex captures the text between `gon.show=` and the `;` that ends that
# line (re.MULTILINE makes `$` match at line ends).
84 json_data = self._search_regex(r'window\.gon.*?gon\.show=(.+?);$',
85 webpage, u'json data', flags=re.MULTILINE)
88 data = json.loads(json_data)
# A JSON parse failure (ValueError) is re-raised as ExtractorError.
89 except ValueError as e:
90 raise ExtractorError(u'Invalid JSON: ' + str(e))
# '&cbr=256' is appended to the Akamai URL as an extra query parameter
# (presumably a bitrate selector -- TODO confirm).
92 video_url = data['akamai_url'] + '&cbr=256'
93 url_parts = compat_urllib_parse_urlparse(video_url)
# The extension is whatever follows the last '.' in the URL path.
94 video_ext = url_parts.path.rpartition('.')[2]
# 'title' is read with [] (KeyError if absent); the remaining fields use
# .get() and are therefore None when the key is missing.
99 'title': data['title'],
100 'description': data.get('teaser_text'),
101 'location': data.get('country_of_origin'),
102 'uploader': data.get('host', {}).get('name'),
103 'uploader_id': data.get('host', {}).get('slug'),
104 'thumbnail': data.get('image', {}).get('large_url_2x'),
105 'duration': data.get('duration'),
# NOTE(review): the embedded line numbers in this listing skip values, so
# several original lines (guards, `try:`/`except` lines, returns, the loop
# header over `links`) are not visible here.
110 class YouPornIE(InfoExtractor):
111 """Information extractor for youporn.com."""
# The pattern captures the numeric id as `videoid` and the following path
# segment as `title`.
112 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'
# Writes an ext/format table for the given list of format dicts to stdout.
114 def _print_formats(self, formats):
115 """Print all available formats"""
116 print(u'Available formats:')
117 print(u'ext\t\tformat')
118 print(u'---------------------------------')
119 for format in formats:
120 print(u'%s\t\t%s' % (format['ext'], format['format']))
# Compares each entry's "format" value with req_format (the loop header and
# the return statements are not shown in this listing).
122 def _specific(self, req_format, formats):
124 if(x["format"]==req_format):
128 def _real_extract(self, url):
129 mobj = re.match(self._VALID_URL, url)
131 raise ExtractorError(u'Invalid URL: %s' % url)
132 video_id = mobj.group('videoid')
# The page is requested with an `age_verified=1` cookie set on the request.
134 req = compat_urllib_request.Request(url)
135 req.add_header('Cookie', 'age_verified=1')
136 webpage = self._download_webpage(req, video_id)
138 # Get JSON parameters
139 json_params = self._search_regex(r'var currentVideo = new Video\((.*)\);', webpage, u'JSON parameters')
141 params = json.loads(json_params)
143 raise ExtractorError(u'Invalid JSON')
145 self.report_extraction(video_id)
# title, description, submitted_by and the first thumbnail image are all read
# with [] indexing, so a missing key raises.
147 video_title = params['title']
148 upload_date = unified_strdate(params['release_date_f'])
149 video_description = params['description']
150 video_uploader = params['submitted_by']
151 thumbnail = params['thumbnails'][0]['image']
# NOTE(review): sys.exc_info()[1] is an exception instance, not a string, so
# the `+` below raises TypeError instead of producing the intended message;
# it needs an explicit string conversion.
153 raise ExtractorError('Missing JSON parameter: ' + sys.exc_info()[1])
155 # Get all of the formats available
156 DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
157 download_list_html = self._search_regex(DOWNLOAD_LIST_RE,
158 webpage, u'download list').strip()
160 # Get all of the links from the page
161 LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
162 links = re.findall(LINK_RE, download_list_html)
164 raise ExtractorError(u'ERROR: no known formats available for video')
166 self.to_screen(u'Links found: %d' % len(links))
171 # A link looks like this:
172 # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
173 # A path looks like this:
174 # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
175 video_url = unescapeHTML( link )
176 path = compat_urllib_parse_urlparse( video_url ).path
# Extension is the path suffix without its leading dot.
177 extension = os.path.splitext( path )[1][1:]
# The format name comes from the first two '_'-separated parts of path
# component index 4; for the sample path above, '480p_370k_8004515' becomes
# ['480p', '370k'] and is then joined into '480p-370k'.
# NOTE(review): the local name `format` shadows the builtin.
178 format = path.split('/')[4].split('_')[:2]
181 format = "-".join( format )
182 # title = u'%s-%s-%s' % (video_title, size, bitrate)
187 'uploader': video_uploader,
188 'upload_date': upload_date,
189 'title': video_title,
192 'thumbnail': thumbnail,
193 'description': video_description
# When the downloader's 'listformats' option is set, the collected formats are
# printed.
196 if self._downloader.params.get('listformats', None):
197 self._print_formats(formats)
# The downloader's 'format' option selects the result: None/'best', 'worst',
# '-1'/'all', or a specific format name resolved through _specific().
200 req_format = self._downloader.params.get('format', None)
201 self.to_screen(u'Format: %s' % req_format)
203 if req_format is None or req_format == 'best':
205 elif req_format == 'worst':
207 elif req_format in ('-1', 'all'):
210 format = self._specific( req_format, formats )
212 raise ExtractorError(u'Requested format not available')
# NOTE(review): the embedded line numbers in this listing skip values; some
# original lines (e.g. the guard before the `raise` and parts of the `info`
# dict) are not visible here.
217 class PornotubeIE(InfoExtractor):
218 """Information extractor for pornotube.com."""
# Named groups: optional `channel` (/c/<n>), required `videoid` (/m/<n>) and
# `title` (everything after the following '/').
219 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'
221 def _real_extract(self, url):
222 mobj = re.match(self._VALID_URL, url)
224 raise ExtractorError(u'Invalid URL: %s' % url)
# Both the id and the title are taken from the URL itself, not from the page.
226 video_id = mobj.group('videoid')
227 video_title = mobj.group('title')
229 # Get webpage content
230 webpage = self._download_webpage(url, video_id)
# NOTE(review): the dots in `video[0-9].pornotube.com` are unescaped, so they
# match any character, not only a literal '.'.
233 VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
234 video_url = self._search_regex(VIDEO_URL_RE, webpage, u'video url')
# The extracted URL is percent-decoded before use.
235 video_url = compat_urllib_parse.unquote(video_url)
237 #Get the uploaded date
238 VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
# The lookup passes fatal=False, and the date is only normalised when a value
# was actually found.
239 upload_date = self._html_search_regex(VIDEO_UPLOADED_RE, webpage, u'upload date', fatal=False)
240 if upload_date: upload_date = unified_strdate(upload_date)
242 info = {'id': video_id,
245 'upload_date': upload_date,
246 'title': video_title,
# NOTE(review): the embedded line numbers in this listing skip values; the
# guards before the two `raise` statements and parts of the `info` dict are
# not visible here.
252 class YouJizzIE(InfoExtractor):
253 """Information extractor for youjizz.com."""
# NOTE(review): the '.' before `html` is unescaped and matches any character.
254 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'
# Two-step extraction: the watch page yields the title and the embed page
# URL; the embed page yields the media URL.
256 def _real_extract(self, url):
257 mobj = re.match(self._VALID_URL, url)
259 raise ExtractorError(u'Invalid URL: %s' % url)
261 video_id = mobj.group('videoid')
263 # Get webpage content
264 webpage = self._download_webpage(url, video_id)
266 # Get the video title
267 video_title = self._html_search_regex(r'<title>(?P<title>.*)</title>',
268 webpage, u'title').strip()
# Locate the embed page URL inside the watch page.
# NOTE(review): the dots in `www.youjizz.com` are unescaped in this pattern.
271 result = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
273 raise ExtractorError(u'ERROR: unable to extract embed page')
# From here on video_id is the numeric id from the embed URL, replacing the
# id taken from the original URL.
275 embed_page_url = result.group(0).strip()
276 video_id = result.group('videoid')
278 webpage = self._download_webpage(embed_page_url, video_id)
# The media URL is the string passed to encodeURIComponent() in the player's
# so.addVariable("file", ...) call.
281 video_url = self._search_regex(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);',
282 webpage, u'video URL')
284 info = {'id': video_id,
286 'title': video_title,
289 'player_url': embed_page_url}
# Extractor for 8tracks.com mixes: walks the mix track by track through the
# site's play/next JSON endpoints.
# NOTE(review): the embedded line numbers in this listing skip values; among
# the lines not visible are the assignment of `mix_id`, the initial value of
# `next_url`, the result list handling and the loop exit.
293 class EightTracksIE(InfoExtractor):
# NOTE(review): the '.' in `8tracks.com` is unescaped and matches any character.
295 _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'
297 def _real_extract(self, url):
298 mobj = re.match(self._VALID_URL, url)
300 raise ExtractorError(u'Invalid URL: %s' % url)
301 playlist_id = mobj.group('id')
303 webpage = self._download_webpage(url, playlist_id)
# The mix metadata is the JavaScript value assigned to `PAGE.mix`, captured up
# to the first ";\n" (re.DOTALL lets the capture span lines) and parsed as JSON.
305 json_like = self._search_regex(r"PAGE.mix = (.*?);\n", webpage, u'trax information', flags=re.DOTALL)
306 data = json.loads(json_like)
# A random integer, as a string, is used as the session token in the API URLs.
308 session = str(random.randint(0, 1000000000))
310 track_count = data['tracks_count']
# `mix_id` is not assigned in the lines shown here (presumably taken from
# `data` on an omitted line -- TODO confirm).
311 first_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
# itertools.count() is unbounded; `i` is only used for the progress note.
314 for i in itertools.count():
315 api_json = self._download_webpage(next_url, playlist_id,
316 note=u'Downloading song information %s/%s' % (str(i+1), track_count),
317 errnote=u'Failed to download song information')
318 api_data = json.loads(api_json)
319 track_data = api_data[u'set']['track']
321 'id': track_data['id'],
322 'url': track_data['track_file_stream_url'],
323 'title': track_data['performer'] + u' - ' + track_data['name'],
324 'raw_title': track_data['name'],
325 'uploader_id': data['user']['login'],
# The API flags the final track with `at_last_track`; otherwise the next
# request asks for the track following the current track id.
329 if api_data['set']['at_last_track']:
331 next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
# Extractor for keek.com: matches both the short '!<id>' form and the
# '<user>/keeks/<id>' form, capturing the id as `videoID`.
# NOTE(review): the embedded line numbers in this listing skip values; parts
# of the title lookup and of the result dict are not visible here.
334 class KeekIE(InfoExtractor):
335 _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
338 def _real_extract(self, url):
# NOTE(review): `m` is used without a None check; re.match() returns None when
# the URL does not match, which would surface as an AttributeError.
339 m = re.match(self._VALID_URL, url)
340 video_id = m.group('videoID')
# The media and thumbnail URLs are built directly from the id on the
# cdn.keek.com host; the page is downloaded for the title and uploader.
342 video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
343 thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id
344 webpage = self._download_webpage(url, video_id)
346 video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
# The uploader is the first <h2> following the "user-name-and-bio" div; the
# lookup passes fatal=False.
349 uploader = self._html_search_regex(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>',
350 webpage, u'uploader', fatal=False)
356 'title': video_title,
357 'thumbnail': thumbnail,
# Extractor for myspass.de: derives the video id from the URL path and reads
# everything else from the site's XML metadata endpoint.
# NOTE(review): the embedded line numbers in this listing skip values; several
# conditions, `else` branches and parts of the result dict are not visible.
363 class MySpassIE(InfoExtractor):
# NOTE(review): this pattern accepts any URL under www.myspass.de, and its
# dots are unescaped.
364 _VALID_URL = r'http://www.myspass.de/.*'
366 def _real_extract(self, url):
367 META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'
369 # video id is the last path element of the URL
370 # usually there is a trailing slash, so also try the second but last
371 url_path = compat_urllib_parse_urlparse(url).path
372 url_parent_path, video_id = os.path.split(url_path)
374 _, video_id = os.path.split(url_parent_path)
377 metadata_url = META_DATA_URL_TEMPLATE % video_id
378 metadata_text = self._download_webpage(metadata_url, video_id)
# The downloaded text is encoded to UTF-8 bytes before being parsed as XML.
379 metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))
381 # extract values from metadata
# <url_flv> is mandatory: its absence raises ExtractorError.
382 url_flv_el = metadata.find('url_flv')
383 if url_flv_el is None:
384 raise ExtractorError(u'Unable to extract download url')
385 video_url = url_flv_el.text
# The extension is the URL suffix without its leading dot.
386 extension = os.path.splitext(video_url)[1][1:]
387 title_el = metadata.find('title')
389 raise ExtractorError(u'Unable to extract title')
390 title = title_el.text
# <format_id>, <description> and <imagePreview> are each checked for None
# before their text is used (the fallback branches are not shown here).
391 format_id_el = metadata.find('format_id')
392 if format_id_el is None:
395 format = format_id_el.text
396 description_el = metadata.find('description')
397 if description_el is not None:
398 description = description_el.text
401 imagePreview_el = metadata.find('imagePreview')
402 if imagePreview_el is not None:
403 thumbnail = imagePreview_el.text
412 'thumbnail': thumbnail,
413 'description': description
# Extractor for spiegel.de videos: the numeric id at the end of the URL slug
# is captured as `videoID`.
# NOTE(review): the embedded line numbers in this listing skip values; the
# assignment of `last_type`, part of the title lookup and parts of the result
# dict are not visible here.
417 class SpiegelIE(InfoExtractor):
418 _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'
420 def _real_extract(self, url):
# NOTE(review): `m` is used without a None check; re.match() returns None when
# the URL does not match, which would surface as an AttributeError.
421 m = re.match(self._VALID_URL, url)
422 video_id = m.group('videoID')
424 webpage = self._download_webpage(url, video_id)
426 video_title = self._html_search_regex(r'<div class="module-title">(.*?)</div>',
# Stream details come from a per-video XML document on video2.spiegel.de.
429 xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
430 xml_code = self._download_webpage(xml_url, video_id,
431 note=u'Downloading XML', errnote=u'Failed to download XML')
433 idoc = xml.etree.ElementTree.fromstring(xml_code)
# `last_type` is not assigned in the lines shown (presumably an element of
# `idoc` chosen on an omitted line -- TODO confirm). Its first <filename> and
# <duration> children are read; the duration is converted to float.
435 filename = last_type.findall('./filename')[0].text
436 duration = float(last_type.findall('./duration')[0].text)
# The media URL is the same base path plus the filename; the extension is the
# text after the filename's last '.'.
438 video_url = 'http://video2.spiegel.de/flash/' + filename
439 video_ext = filename.rpartition('.')[2]
444 'title': video_title,
445 'duration': duration,
# Extractor for liveleak.com view pages; the `i=` query value is captured as
# `video_id`.
# NOTE(review): the embedded line numbers in this listing skip values; the
# guard before the `raise` and parts of the result dict are not visible here.
449 class LiveLeakIE(InfoExtractor):
# NOTE(review): `http?://` makes the final 'p' optional (matching 'htt://' or
# 'http://'), so 'https://' URLs are rejected; `https?://` looks intended.
451 _VALID_URL = r'^(?:http?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
452 IE_NAME = u'liveleak'
454 def _real_extract(self, url):
455 mobj = re.match(self._VALID_URL, url)
457 raise ExtractorError(u'Invalid URL: %s' % url)
459 video_id = mobj.group('video_id')
461 webpage = self._download_webpage(url, video_id)
# The media URL is the quoted value of `file:` in the page source.
463 video_url = self._search_regex(r'file: "(.*?)",',
464 webpage, u'video URL')
# The site prefix 'LiveLeak.com -' is removed from the og:title value.
466 video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
467 webpage, u'title').replace('LiveLeak.com -', '').strip()
# Description and uploader lookups pass fatal=False.
469 video_description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
470 webpage, u'description', fatal=False)
472 video_uploader = self._html_search_regex(r'By:.*?(\w+)</a>',
473 webpage, u'uploader', fatal=False)
479 'title': video_title,
480 'description': video_description,
481 'uploader': video_uploader
# Extractor for tumblr.com post/video pages; captures the blog subdomain as
# `blog_name` and the numeric post id as `id`.
# NOTE(review): the embedded line numbers in this listing skip values; the
# guard before the `raise` and parts of the returned dict are not visible.
488 class TumblrIE(InfoExtractor):
489 _VALID_URL = r'http://(?P<blog_name>.*?)\.tumblr\.com/((post)|(video))/(?P<id>\d*)/(.*?)'
491 def _real_extract(self, url):
# NOTE(review): `m_url` is used without a None check; re.match() returns None
# when the URL does not match, which would surface as an AttributeError.
492 m_url = re.match(self._VALID_URL, url)
493 video_id = m_url.group('id')
494 blog = m_url.group('blog_name')
# The URL is rebuilt in the /post/<id>/ form before downloading, whichever
# form the caller supplied.
496 url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id)
497 webpage = self._download_webpage(url, video_id)
# The pattern looks for a src/type pair whose quotes appear in the page as the
# literal text `\x22`.
# NOTE(review): `blog` is interpolated into the pattern without re.escape().
499 re_video = r'src=\\x22(?P<video_url>http://%s\.tumblr\.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P<ext>.*?)\\x22' % (blog, video_id)
500 video = re.search(re_video, webpage)
502 raise ExtractorError(u'Unable to extract video')
503 video_url = video.group('video_url')
504 ext = video.group('ext')
506 video_thumbnail = self._search_regex(r'posters(.*?)\[\\x22(?P<thumb>.*?)\\x22',
507 webpage, u'thumbnail', fatal=False) # We pick the first poster
# Backslashes are stripped from the thumbnail URL when one was found.
508 if video_thumbnail: video_thumbnail = video_thumbnail.replace('\\', '')
510 # The only place where you can get a title, it's not complete,
511 # but searching in other places doesn't work for all videos
512 video_title = self._html_search_regex(r'<title>(?P<title>.*?)</title>',
513 webpage, u'title', flags=re.DOTALL)
515 return [{'id': video_id,
517 'title': video_title,
518 'thumbnail': video_thumbnail,
# Extractor for bandcamp.com track pages that offer a free download; the URL
# slug after /track/ is captured as `title`.
# NOTE(review): the embedded line numbers in this listing skip values; parts
# of the `track_info` dict and the return are not visible here.
522 class BandcampIE(InfoExtractor):
523 _VALID_URL = r'http://.*?\.bandcamp\.com/track/(?P<title>.*)'
525 def _real_extract(self, url):
# NOTE(review): `mobj` is used without a None check; re.match() returns None
# when the URL does not match, which would surface as an AttributeError.
526 mobj = re.match(self._VALID_URL, url)
527 title = mobj.group('title')
528 webpage = self._download_webpage(url, title)
529 # We get the link to the free download page
530 m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage)
531 if m_download is None:
532 raise ExtractorError(u'No free songs found')
534 download_link = m_download.group(1)
# NOTE(review): `id` shadows the builtin, and .group() is called directly on
# the re.search() result here and on several matches below, so a pattern that
# fails to match raises AttributeError rather than ExtractorError.
535 id = re.search(r'var TralbumData = {(.*?)id: (?P<id>\d*?)$',
536 webpage, re.MULTILINE|re.DOTALL).group('id')
538 download_webpage = self._download_webpage(download_link, id,
539 'Downloading free downloads page')
540 # We get the dictionary of the track from some javascrip code
541 info = re.search(r'items: (.*?),$',
542 download_webpage, re.MULTILINE).group(1)
# `items` is parsed as JSON and only its first element is used.
543 info = json.loads(info)[0]
544 # We pick mp3-320 for now, until format selection can be easily implemented.
545 mp3_info = info[u'downloads'][u'mp3-320']
546 # If we try to use this url it says the link has expired
547 initial_url = mp3_info[u'url']
# The initial URL is split into server, fsig, id and ts so that a
# /statdownload/ request can be assembled from the same values.
548 re_url = r'(?P<server>http://(.*?)\.bandcamp\.com)/download/track\?enc=mp3-320&fsig=(?P<fsig>.*?)&id=(?P<id>.*?)&ts=(?P<ts>.*)$'
549 m_url = re.match(re_url, initial_url)
550 #We build the url we will use to get the final track url
551 # This url is build in Bandcamp in the script download_bunde_*.js
# The `.rand` value in the request URL is a fixed constant.
552 request_url = '%s/statdownload/track?enc=mp3-320&fsig=%s&id=%s&ts=%s&.rand=665028774616&.vrs=1' % (m_url.group('server'), m_url.group('fsig'), id, m_url.group('ts'))
553 final_url_webpage = self._download_webpage(request_url, id, 'Requesting download url')
554 # If we could correctly generate the .rand field the url would be
555 #in the "download_url" key
556 final_url = re.search(r'"retry_url":"(.*?)"', final_url_webpage).group(1)
558 track_info = {'id':id,
559 'title' : info[u'title'],
562 'thumbnail' : info[u'thumb_url'],
563 'uploader' : info[u'artist']
# NOTE(review): the embedded line numbers in this listing skip values; the
# guard before the `raise`, the rest of the title lookup and parts of the
# result dict are not visible here.
568 class RedTubeIE(InfoExtractor):
569 """Information Extractor for redtube"""
# The numeric path segment is captured as `id`.
570 _VALID_URL = r'(?:http://)?(?:www\.)?redtube\.com/(?P<id>[0-9]+)'
572 def _real_extract(self,url):
573 mobj = re.match(self._VALID_URL, url)
575 raise ExtractorError(u'Invalid URL: %s' % url)
577 video_id = mobj.group('id')
# The extension is fixed to 'mp4', matching the <source type="video/mp4">
# element the URL is read from below.
578 video_extension = 'mp4'
579 webpage = self._download_webpage(url, video_id)
581 self.report_extraction(video_id)
583 video_url = self._html_search_regex(r'<source src="(.+?)" type="video/mp4">',
584 webpage, u'video URL')
# The title is the content of the <h1 class="videoTitle slidePanelMovable">
# heading.
586 video_title = self._html_search_regex('<h1 class="videoTitle slidePanelMovable">(.+?)</h1>',
592 'ext': video_extension,
593 'title': video_title,
# NOTE(review): the embedded line numbers in this listing skip values; the
# rest of the title lookup and parts of the result dict are not visible here.
596 class InaIE(InfoExtractor):
597 """Information Extractor for Ina.fr"""
# The id is an 'I' followed by digits, captured as `id`.
598 _VALID_URL = r'(?:http://)?(?:www\.)?ina\.fr/video/(?P<id>I[0-9]+)/.*'
600 def _real_extract(self,url):
601 mobj = re.match(self._VALID_URL, url)
603 video_id = mobj.group('id')
# The input page itself is not downloaded; the per-notice MRSS document on
# player.ina.fr is fetched instead.
604 mrss_url='http://player.ina.fr/notices/%s.mrss' % video_id
605 video_extension = 'mp4'
606 webpage = self._download_webpage(mrss_url, video_id)
608 self.report_extraction(video_id)
# NOTE(review): the dots in `mp4.ina.fr` are unescaped in this pattern.
610 video_url = self._html_search_regex(r'<media:player url="(?P<mp4url>http://mp4.ina.fr/[^"]+\.mp4)',
611 webpage, u'video URL')
# The title is the CDATA content of the <title> element.
613 video_title = self._search_regex(r'<title><!\[CDATA\[(?P<titre>.*?)]]></title>',
619 'ext': video_extension,
620 'title': video_title,
# NOTE(review): the embedded line numbers in this listing skip values; the
# rest of the title lookup and parts of the result dict are not visible here.
623 class HowcastIE(InfoExtractor):
624 """Information Extractor for Howcast.com"""
625 _VALID_URL = r'(?:https?://)?(?:www\.)?howcast\.com/videos/(?P<id>\d+)'
627 def _real_extract(self, url):
628 mobj = re.match(self._VALID_URL, url)
# The page is fetched from a URL rebuilt from the numeric id, not from the
# URL the caller passed in.
630 video_id = mobj.group('id')
631 webpage_url = 'http://www.howcast.com/videos/' + video_id
632 webpage = self._download_webpage(webpage_url, video_id)
634 self.report_extraction(video_id)
# The media URL is the `file:` value (key optionally single-quoted) pointing
# at mobile-media.howcast.com.
636 video_url = self._search_regex(r'\'?file\'?: "(http://mobile-media\.howcast\.com/[0-9]+\.mp4)',
637 webpage, u'video URL')
# The title and description patterns each have two alternative capture
# groups, one for a double-quoted and one for a single-quoted content value.
639 video_title = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') property=\'og:title\'',
642 video_description = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') name=\'description\'',
643 webpage, u'description', fatal=False)
645 thumbnail = self._html_search_regex(r'<meta content=\'(.+?)\' property=\'og:image\'',
646 webpage, u'thumbnail', fatal=False)
652 'title': video_title,
653 'description': video_description,
654 'thumbnail': thumbnail,
# NOTE(review): the embedded line numbers in this listing skip values; the
# rest of the title lookup and parts of the result dict are not visible here.
657 class VineIE(InfoExtractor):
658 """Information Extractor for Vine.co"""
659 _VALID_URL = r'(?:https?://)?(?:www\.)?vine\.co/v/(?P<id>\w+)'
661 def _real_extract(self, url):
662 mobj = re.match(self._VALID_URL, url)
# The page is always fetched over https from a URL rebuilt from the id.
664 video_id = mobj.group('id')
665 webpage_url = 'https://vine.co/v/' + video_id
666 webpage = self._download_webpage(webpage_url, video_id)
668 self.report_extraction(video_id)
# The media URL comes from the twitter:player:stream meta tag.
670 video_url = self._html_search_regex(r'<meta property="twitter:player:stream" content="(.+?)"',
671 webpage, u'video URL')
673 video_title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',
# The first group holds the og:image URL; an optional second group absorbs
# any '?...' query string that follows it.
676 thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)(\?.*?)?"',
677 webpage, u'thumbnail', fatal=False)
# re.DOTALL lets the match run across line breaks between the user div and
# its <h2>.
679 uploader = self._html_search_regex(r'<div class="user">.*?<h2>(.+?)</h2>',
680 webpage, u'uploader', fatal=False, flags=re.DOTALL)
686 'title': video_title,
687 'thumbnail': thumbnail,
688 'uploader': uploader,
# NOTE(review): the embedded line numbers in this listing skip values; the
# guard before the `raise` and parts of the result dict are not visible here.
691 class FlickrIE(InfoExtractor):
692 """Information Extractor for Flickr videos"""
# Captures the owner path segment as `uploader_id` and the numeric photo id
# as `id`.
693 _VALID_URL = r'(?:https?://)?(?:www\.)?flickr\.com/photos/(?P<uploader_id>[\w\-_@]+)/(?P<id>\d+).*'
# Extraction chain: photo page -> `photo_secret` -> first XML (node id) ->
# second XML (stream APP + FULLPATH) -> media URL.
695 def _real_extract(self, url):
696 mobj = re.match(self._VALID_URL, url)
698 video_id = mobj.group('id')
699 video_uploader_id = mobj.group('uploader_id')
700 webpage_url = 'http://www.flickr.com/photos/' + video_uploader_id + '/' + video_id
701 webpage = self._download_webpage(webpage_url, video_id)
703 secret = self._search_regex(r"photo_secret: '(\w+)'", webpage, u'secret')
705 first_url = 'https://secure.flickr.com/apps/video/video_mtl_xml.gne?v=x&photo_id=' + video_id + '&secret=' + secret + '&bitrate=700&target=_self'
706 first_xml = self._download_webpage(first_url, video_id, 'Downloading first data webpage')
# The node id has the form <digits>-<digits> and is needed for the playlist
# request.
708 node_id = self._html_search_regex(r'<Item id="id">(\d+-\d+)</Item>',
709 first_xml, u'node_id')
711 second_url = 'https://secure.flickr.com/video_playlist.gne?node_id=' + node_id + '&tech=flash&mode=playlist&bitrate=700&secret=' + secret + '&rd=video.yahoo.com&noad=1'
712 second_xml = self._download_webpage(second_url, video_id, 'Downloading second data webpage')
714 self.report_extraction(video_id)
716 mobj = re.search(r'<STREAM APP="(.+?)" FULLPATH="(.+?)"', second_xml)
718 raise ExtractorError(u'Unable to extract video url')
# The media URL is APP followed by the HTML-unescaped FULLPATH.
719 video_url = mobj.group(1) + unescapeHTML(mobj.group(2))
# Each og: meta pattern has two alternative capture groups, for double- and
# single-quoted content values.
721 video_title = self._html_search_regex(r'<meta property="og:title" content=(?:"([^"]+)"|\'([^\']+)\')',
722 webpage, u'video title')
724 video_description = self._html_search_regex(r'<meta property="og:description" content=(?:"([^"]+)"|\'([^\']+)\')',
725 webpage, u'description', fatal=False)
727 thumbnail = self._html_search_regex(r'<meta property="og:image" content=(?:"([^"]+)"|\'([^\']+)\')',
728 webpage, u'thumbnail', fatal=False)
734 'title': video_title,
735 'description': video_description,
736 'thumbnail': thumbnail,
737 'uploader_id': video_uploader_id,
# Extractor for teamcoco.com video pages; everything after /video/ is
# captured as `url_title`.
# NOTE(review): the embedded line numbers in this listing skip values; the
# guard before the `raise`, the ends of two lookups and parts of the result
# dict are not visible here.
740 class TeamcocoIE(InfoExtractor):
741 _VALID_URL = r'http://teamcoco\.com/video/(?P<url_title>.*)'
743 def _real_extract(self, url):
744 mobj = re.match(self._VALID_URL, url)
746 raise ExtractorError(u'Invalid URL: %s' % url)
# The URL slug only labels the first download; the real numeric id is read
# from the page's <article class="video" data-id="..."> element.
747 url_title = mobj.group('url_title')
748 webpage = self._download_webpage(url, url_title)
750 video_id = self._html_search_regex(r'<article class="video" data-id="(\d+?)"',
751 webpage, u'video id')
753 self.report_extraction(video_id)
755 video_title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',
758 thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)"',
759 webpage, u'thumbnail', fatal=False)
761 video_description = self._html_search_regex(r'<meta property="og:description" content="(.*?)"',
762 webpage, u'description', fatal=False)
# Stream information comes from a per-id XML document; the URL is the text of
# its <file type="high"> element.
764 data_url = 'http://teamcoco.com/cvp/2.0/%s.xml' % video_id
765 data = self._download_webpage(data_url, video_id, 'Downloading data webpage')
767 video_url = self._html_search_regex(r'<file type="high".*?>(.*?)</file>',
774 'title': video_title,
775 'thumbnail': thumbnail,
776 'description': video_description,
# NOTE(review): the embedded line numbers in this listing skip values; several
# guards, `else:` lines, the end of the title lookup and parts of the result
# dict are not visible here.
779 class XHamsterIE(InfoExtractor):
780 """Information Extractor for xHamster"""
# NOTE(review): the '.' in `(?:www.)?` is unescaped and matches any character.
781 _VALID_URL = r'(?:http://)?(?:www.)?xhamster\.com/movies/(?P<id>[0-9]+)/.*\.html'
783 def _real_extract(self,url):
784 mobj = re.match(self._VALID_URL, url)
# The page is fetched from a URL rebuilt from the id with an empty slug
# (despite its name, `mrss_url` is this rebuilt movie page URL).
786 video_id = mobj.group('id')
787 mrss_url = 'http://xhamster.com/movies/%s/.html' % video_id
788 webpage = self._download_webpage(mrss_url, video_id)
# The player configuration exposes a 'srv' and a 'file' value.
790 mobj = re.search(r'\'srv\': \'(?P<server>[^\']*)\',\s*\'file\': \'(?P<file>[^\']+)\',', webpage)
792 raise ExtractorError(u'Unable to extract media URL')
# With an empty server value the percent-decoded 'file' is used as the URL;
# the other assignment below joins server and file with '/key='.
793 if len(mobj.group('server')) == 0:
794 video_url = compat_urllib_parse.unquote(mobj.group('file'))
796 video_url = mobj.group('server')+'/key='+mobj.group('file')
# The extension is the text after the last '.' in the final URL.
797 video_extension = video_url.split('.')[-1]
799 video_title = self._html_search_regex(r'<title>(?P<title>.+?) - xHamster\.com</title>',
802 # Can't see the description anywhere in the UI
803 # video_description = self._html_search_regex(r'<span>Description: </span>(?P<description>[^<]+)',
804 # webpage, u'description', fatal=False)
805 # if video_description: video_description = unescapeHTML(video_description)
# The upload date is read from a hint='YYYY-MM-DD HH:MM:SS TZ' attribute and
# concatenated into YYYYMMDD; the fallback below sets None and reports a
# warning.
807 mobj = re.search(r'hint=\'(?P<upload_date_Y>[0-9]{4})-(?P<upload_date_m>[0-9]{2})-(?P<upload_date_d>[0-9]{2}) [0-9]{2}:[0-9]{2}:[0-9]{2} [A-Z]{3,4}\'', webpage)
809 video_upload_date = mobj.group('upload_date_Y')+mobj.group('upload_date_m')+mobj.group('upload_date_d')
811 video_upload_date = None
812 self._downloader.report_warning(u'Unable to extract upload date')
# The uploader lookup passes default=u'anonymous'.
814 video_uploader_id = self._html_search_regex(r'<a href=\'/user/[^>]+>(?P<uploader_id>[^<]+)',
815 webpage, u'uploader id', default=u'anonymous')
817 video_thumbnail = self._search_regex(r'\'image\':\'(?P<thumbnail>[^\']+)\'',
818 webpage, u'thumbnail', fatal=False)
823 'ext': video_extension,
824 'title': video_title,
825 # 'description': video_description,
826 'upload_date': video_upload_date,
827 'uploader_id': video_uploader_id,
828 'thumbnail': video_thumbnail
# NOTE(review): the embedded line numbers in this listing skip values; the
# `try:`/`except` lines around both JSON parses, the assignment of `key` and
# the result dict are not visible here.
831 class HypemIE(InfoExtractor):
832 """Information Extractor for hypem"""
# Two unnamed groups follow /track/; only the first is used below.
833 _VALID_URL = r'(?:http://)?(?:www\.)?hypem\.com/track/([^/]+)/([^/]+)'
835 def _real_extract(self, url):
836 mobj = re.match(self._VALID_URL, url)
838 raise ExtractorError(u'Invalid URL: %s' % url)
839 track_id = mobj.group(1)
# The page is requested with `ax=1` and the current timestamp as query
# parameters.
# NOTE(review): the query is appended with '?' unconditionally, which yields a
# malformed URL if `url` already carries a query string.
841 data = { 'ax': 1, 'ts': time.time() }
842 data_encoded = compat_urllib_parse.urlencode(data)
843 complete_url = url + "?" + data_encoded
844 request = compat_urllib_request.Request(complete_url)
845 response, urlh = self._download_webpage_handle(request, track_id, u'Downloading webpage with the url')
# The Set-Cookie header of this response is replayed on the later
# /serve/source request.
846 cookie = urlh.headers.get('Set-Cookie', '')
848 self.report_extraction(track_id)
# The track list is JSON embedded in a <script id="displayList-data"> element;
# only the first track is used.
850 html_tracks = self._html_search_regex(r'<script type="application/json" id="displayList-data">(.*?)</script>',
851 response, u'tracks', flags=re.MULTILINE|re.DOTALL).strip()
853 track_list = json.loads(html_tracks)
854 track = track_list[u'tracks'][0]
856 raise ExtractorError(u'Hypemachine contained invalid JSON.')
# From here on track_id is the id reported in the JSON, replacing the value
# taken from the URL.
859 track_id = track[u"id"]
860 artist = track[u"artist"]
861 title = track[u"song"]
# `key` is not assigned in the lines shown here (presumably read from `track`
# on an omitted line -- TODO confirm).
863 serve_url = "http://hypem.com/serve/source/%s/%s" % (compat_str(track_id), compat_str(key))
# NOTE(review): the request body is an empty str; Python 3's urllib expects
# bytes for request data -- confirm behaviour there.
864 request = compat_urllib_request.Request(serve_url, "" , {'Content-Type': 'application/json'})
865 request.add_header('cookie', cookie)
866 song_data_json = self._download_webpage(request, track_id, u'Downloading metadata')
868 song_data = json.loads(song_data_json)
870 raise ExtractorError(u'Hypemachine contained invalid JSON.')
871 final_url = song_data[u"url"]
# NOTE(review): the embedded line numbers in this listing skip values; the
# guard before the first `raise` and most of the result dict are not visible.
881 class Vbox7IE(InfoExtractor):
882 """Information Extractor for Vbox7"""
# The text after 'play:' is captured as the id (unnamed group 1).
883 _VALID_URL = r'(?:http://)?(?:www\.)?vbox7\.com/play:([^/]+)'
885 def _real_extract(self,url):
886 mobj = re.match(self._VALID_URL, url)
888 raise ExtractorError(u'Invalid URL: %s' % url)
889 video_id = mobj.group(1)
# The first response assigns `window.location` in JavaScript; that value is
# appended to the response's final URL to form the real page URL.
891 redirect_page, urlh = self._download_webpage_handle(url, video_id)
892 new_location = self._search_regex(r'window\.location = \'(.*)\';', redirect_page, u'redirect location')
893 redirect_url = urlh.geturl() + new_location
894 webpage = self._download_webpage(redirect_url, video_id, u'Downloading redirect page')
# Only the part of <title> before the first '/' is kept as the title.
896 title = self._html_search_regex(r'<title>(.*)</title>',
897 webpage, u'title').split('/')[0].strip()
# Media information is requested from magare.do with form-encoded `as3` and
# `vid` fields as the request body.
# NOTE(review): urlencode() returns str; Python 3's urllib expects bytes for
# request data -- confirm behaviour there.
900 info_url = "http://vbox7.com/play/magare.do"
901 data = compat_urllib_parse.urlencode({'as3':'1','vid':video_id})
902 info_request = compat_urllib_request.Request(info_url, data)
903 info_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
904 info_response = self._download_webpage(info_request, video_id, u'Downloading info webpage')
905 if info_response is None:
906 raise ExtractorError(u'Unable to extract the media url')
# The response is split on '&' and each piece on '='; the tuple unpacking
# requires exactly two pieces, and only the text between the first and second
# '=' of each piece is kept.
907 (final_url, thumbnail_url) = map(lambda x: x.split('=')[1], info_response.split('&'))
914 'thumbnail': thumbnail_url,
# NOTE(review): only a fragment of this function is visible in this listing:
# the start of its docstring (the closing quotes are not shown) and a single
# list entry, StanfordOpenClassroomIE(). The embedded line numbers jump from
# 920 to 948 and then to 988, so the rest of the body is omitted here.
918 def gen_extractors():
919 """ Return a list of an instance of every supported extractor.
920 The order does matter; the first extractor matched is the one handling the URL.
948 StanfordOpenClassroomIE(),
def get_info_extractor(ie_name):
    """Return the extractor class registered in this module for ``ie_name``.

    The class is looked up among the module globals under the key
    ``ie_name + 'IE'``; an unknown name raises ``KeyError``.
    """
    class_name = ie_name + 'IE'
    module_namespace = globals()
    return module_namespace[class_name]