10 import xml.etree.ElementTree
19 from .extractor.common import InfoExtractor, SearchInfoExtractor
21 from .extractor.ard import ARDIE
22 from .extractor.arte import ArteTvIE
23 from .extractor.bliptv import BlipTVIE, BlipTVUserIE
24 from .extractor.comedycentral import ComedyCentralIE
25 from .extractor.collegehumor import CollegeHumorIE
26 from .extractor.dailymotion import DailymotionIE
27 from .extractor.depositfiles import DepositFilesIE
28 from .extractor.escapist import EscapistIE
29 from .extractor.facebook import FacebookIE
30 from .extractor.funnyordie import FunnyOrDieIE
31 from .extractor.gametrailers import GametrailersIE
32 from .extractor.generic import GenericIE
33 from .extractor.googleplus import GooglePlusIE
34 from .extractor.googlesearch import GoogleSearchIE
35 from .extractor.infoq import InfoQIE
36 from .extractor.justintv import JustinTVIE
37 from .extractor.metacafe import MetacafeIE
38 from .extractor.mixcloud import MixcloudIE
39 from .extractor.mtv import MTVIE
40 from .extractor.myvideo import MyVideoIE
41 from .extractor.nba import NBAIE
42 from .extractor.statigram import StatigramIE
43 from .extractor.photobucket import PhotobucketIE
44 from .extractor.rbmaradio import RBMARadioIE
45 from .extractor.soundcloud import SoundcloudIE, SoundcloudSetIE
46 from .extractor.stanfordoc import StanfordOpenClassroomIE
47 from .extractor.steam import SteamIE
48 from .extractor.ted import TEDIE
49 from .extractor.ustream import UstreamIE
50 from .extractor.vimeo import VimeoIE
51 from .extractor.worldstarhiphop import WorldStarHipHopIE
52 from .extractor.xnxx import XNXXIE
53 from .extractor.xvideos import XVideosIE
54 from .extractor.yahoo import YahooIE, YahooSearchIE
55 from .extractor.youku import YoukuIE
56 from .extractor.youtube import YoutubeIE, YoutubePlaylistIE, YoutubeSearchIE, YoutubeUserIE, YoutubeChannelIE
57 from .extractor.zdf import ZDFIE
class YouPornIE(InfoExtractor):
    """Information extractor for youporn.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'

    def _print_formats(self, formats):
        """Print the extension and format name of every entry in formats."""
        print(u'Available formats:')
        print(u'ext\t\tformat')
        print(u'---------------------------------')
        for fmt in formats:
            print(u'%s\t\t%s' % (fmt['ext'], fmt['format']))

    def _specific(self, req_format, formats):
        """Return the first entry whose 'format' equals req_format, or None."""
        for candidate in formats:
            if candidate['format'] == req_format:
                return candidate
        return None

    def _real_extract(self, url):
        """Extract the downloadable formats of a video page.

        Returns a list of info dicts chosen according to the downloader's
        'format' parameter, or None after printing the table when
        'listformats' is set.  Raises ExtractorError when the URL, the
        embedded JSON or the download list cannot be used.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('videoid')

        # Send the age verification cookie with the page request.
        req = compat_urllib_request.Request(url)
        req.add_header('Cookie', 'age_verified=1')
        webpage = self._download_webpage(req, video_id)

        # Get JSON parameters
        json_params = self._search_regex(r'var currentVideo = new Video\((.*)\);', webpage, u'JSON parameters')
        try:
            params = json.loads(json_params)
        except ValueError:
            raise ExtractorError(u'Invalid JSON')

        self.report_extraction(video_id)
        try:
            video_title = params['title']
            upload_date = unified_strdate(params['release_date_f'])
            video_description = params['description']
            video_uploader = params['submitted_by']
            thumbnail = params['thumbnails'][0]['image']
        except (KeyError, IndexError) as err:
            # Format the exception instead of concatenating it to a string,
            # which would itself fail with a TypeError.
            raise ExtractorError(u'Missing JSON parameter: %s' % err)

        # Get all of the formats available
        DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
        download_list_html = self._search_regex(DOWNLOAD_LIST_RE,
            webpage, u'download list').strip()

        # Get all of the links from the page
        LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
        links = re.findall(LINK_RE, download_list_html)
        if not links:
            raise ExtractorError(u'ERROR: no known formats available for video')

        self.to_screen(u'Links found: %d' % len(links))

        formats = []
        for link in links:
            # A link looks like this:
            # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
            # A path looks like this:
            # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
            video_url = unescapeHTML(link)
            path = compat_urllib_parse_urlparse(video_url).path
            extension = os.path.splitext(path)[1][1:]
            # "480p_370k_8004515" -> "480p-370k"
            format_name = "-".join(path.split('/')[4].split('_')[:2])

            formats.append({
                'id': video_id,
                'url': video_url,
                'uploader': video_uploader,
                'upload_date': upload_date,
                'title': video_title,
                'ext': extension,
                'format': format_name,
                'thumbnail': thumbnail,
                'description': video_description
            })

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        req_format = self._downloader.params.get('format', None)
        self.to_screen(u'Format: %s' % req_format)

        # NOTE(review): assumes the page lists formats best-first - confirm.
        if req_format is None or req_format == 'best':
            return [formats[0]]
        elif req_format == 'worst':
            return [formats[-1]]
        elif req_format in ('-1', 'all'):
            return formats
        else:
            chosen = self._specific(req_format, formats)
            if chosen is None:
                raise ExtractorError(u'Requested format not available')
            return [chosen]
class PornotubeIE(InfoExtractor):
    """Information extractor for pornotube.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'

    def _real_extract(self, url):
        """Extract the FLV url and upload date of a video page.

        The title is taken from the URL itself.  Returns a single-element
        list holding the info dict; raises ExtractorError for a URL that
        does not match _VALID_URL.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('videoid')
        video_title = mobj.group('title')

        # Get webpage content
        webpage = self._download_webpage(url, video_id)

        # Get the video URL (dots in the host name are matched literally)
        VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9]\.pornotube\.com/.+\.flv)",'
        video_url = self._search_regex(VIDEO_URL_RE, webpage, u'video url')
        video_url = compat_urllib_parse.unquote(video_url)

        # Get the uploaded date; it is optional, so a miss is not fatal
        VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
        upload_date = self._html_search_regex(VIDEO_UPLOADED_RE, webpage, u'upload date', fatal=False)
        if upload_date:
            upload_date = unified_strdate(upload_date)

        info = {'id': video_id,
                'url': video_url,
                'uploader': None,
                'upload_date': upload_date,
                'title': video_title,
                'ext': 'flv',
                'format': 'flv'}

        return [info]
class YouJizzIE(InfoExtractor):
    """Information extractor for youjizz.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+)\.html$'

    def _real_extract(self, url):
        """Extract the video url through the page's embed player.

        The title comes from the watch page; the id and the media url come
        from the embed page it links to.  Returns a single-element list
        holding the info dict; raises ExtractorError when the URL is
        invalid or no embed page is referenced.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('videoid')

        # Get webpage content
        webpage = self._download_webpage(url, video_id)

        # Get the video title
        video_title = self._html_search_regex(r'<title>(?P<title>.*)</title>',
            webpage, u'title').strip()

        # Get the embed page
        result = re.search(r'https?://www\.youjizz\.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
        if result is None:
            raise ExtractorError(u'ERROR: unable to extract embed page')

        embed_page_url = result.group(0).strip()
        # From here on the numeric id of the embed page replaces the URL slug.
        video_id = result.group('videoid')

        webpage = self._download_webpage(embed_page_url, video_id)

        # Get the video URL
        video_url = self._search_regex(r'so\.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);',
            webpage, u'video URL')

        info = {'id': video_id,
                'url': video_url,
                'title': video_title,
                'ext': 'flv',
                'format': 'flv',
                'player_url': embed_page_url}

        return [info]
class EightTracksIE(InfoExtractor):
    """Information extractor for 8tracks.com mixes."""
    _VALID_URL = r'https?://8tracks\.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'

    def _real_extract(self, url):
        """Walk through a mix track by track and collect every song.

        Returns a list with one info dict per track; raises ExtractorError
        for a URL that does not match _VALID_URL.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        playlist_id = mobj.group('id')

        webpage = self._download_webpage(url, playlist_id)

        json_like = self._search_regex(r"PAGE\.mix = (.*?);\n", webpage, u'trax information', flags=re.DOTALL)
        data = json.loads(json_like)

        # A random number serves as the play session token in the API urls.
        session = str(random.randint(0, 1000000000))
        mix_id = data['id']
        track_count = data['tracks_count']
        first_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)

        next_url = first_url
        tracks = []
        for i in itertools.count():
            api_json = self._download_webpage(next_url, playlist_id,
                note=u'Downloading song information %s/%s' % (str(i+1), track_count),
                errnote=u'Failed to download song information')
            api_data = json.loads(api_json)
            track_data = api_data[u'set']['track']
            tracks.append({
                'id': track_data['id'],
                'url': track_data['track_file_stream_url'],
                'title': track_data['performer'] + u' - ' + track_data['name'],
                'raw_title': track_data['name'],
                'uploader_id': data['user']['login'],
                # NOTE(review): container assumed to be m4a - confirm.
                'ext': 'm4a',
            })
            # The API flags the final track; stop instead of requesting more.
            if api_data['set']['at_last_track']:
                break
            next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
        return tracks
class KeekIE(InfoExtractor):
    """Information extractor for keek.com."""
    _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'

    def _real_extract(self, url):
        """Build the CDN urls from the video id and read title/uploader.

        Returns a single-element list holding the info dict; raises
        ExtractorError for a URL that does not match _VALID_URL.
        """
        m = re.match(self._VALID_URL, url)
        if m is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = m.group('videoID')

        # Media and thumbnail urls are derived from the id alone.
        video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
        thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id
        webpage = self._download_webpage(url, video_id)

        video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
            webpage, u'title')

        uploader = self._html_search_regex(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>',
            webpage, u'uploader', fatal=False)

        info = {
            'id': video_id,
            'url': video_url,
            # NOTE(review): container assumed to be mp4 - confirm.
            'ext': 'mp4',
            'title': video_title,
            'thumbnail': thumbnail,
            'uploader': uploader
        }
        return [info]
class MySpassIE(InfoExtractor):
    """Information extractor for myspass.de."""
    _VALID_URL = r'http://www\.myspass\.de/.*'

    def _real_extract(self, url):
        """Extract a video through the site's metadata XML.

        Returns a single-element list holding the info dict; raises
        ExtractorError when the metadata has no download url or no title.
        """
        META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'

        # video id is the last path element of the URL
        # usually there is a trailing slash, so also try the second but last
        url_path = compat_urllib_parse_urlparse(url).path
        url_parent_path, video_id = os.path.split(url_path)
        if not video_id:
            _, video_id = os.path.split(url_parent_path)

        # get metadata
        metadata_url = META_DATA_URL_TEMPLATE % video_id
        metadata_text = self._download_webpage(metadata_url, video_id)
        metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))

        # extract values from metadata
        url_flv_el = metadata.find('url_flv')
        if url_flv_el is None:
            raise ExtractorError(u'Unable to extract download url')
        video_url = url_flv_el.text
        extension = os.path.splitext(video_url)[1][1:]

        title_el = metadata.find('title')
        if title_el is None:
            raise ExtractorError(u'Unable to extract title')
        title = title_el.text

        # The format falls back to the file extension; description and
        # thumbnail are optional and default to None.
        format_id_el = metadata.find('format_id')
        if format_id_el is None:
            format_name = extension
        else:
            format_name = format_id_el.text

        description_el = metadata.find('description')
        description = description_el.text if description_el is not None else None

        imagePreview_el = metadata.find('imagePreview')
        thumbnail = imagePreview_el.text if imagePreview_el is not None else None

        info = {
            'id': video_id,
            'url': video_url,
            'title': title,
            'ext': extension,
            'format': format_name,
            'thumbnail': thumbnail,
            'description': description
        }
        return [info]
class SpiegelIE(InfoExtractor):
    """Information extractor for spiegel.de videos."""
    _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'

    def _real_extract(self, url):
        """Extract title from the page and media data from the flash XML.

        Returns a single-element list holding the info dict; raises
        ExtractorError for a URL that does not match _VALID_URL.
        """
        m = re.match(self._VALID_URL, url)
        if m is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = m.group('videoID')

        webpage = self._download_webpage(url, video_id)

        video_title = self._html_search_regex(r'<div class="module-title">(.*?)</div>',
            webpage, u'title')

        xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
        xml_code = self._download_webpage(xml_url, video_id,
                    note=u'Downloading XML', errnote=u'Failed to download XML')

        idoc = xml.etree.ElementTree.fromstring(xml_code)
        # NOTE(review): the last child of the root is used - presumably the
        # preferred variant; confirm against the XML layout.
        last_type = idoc[-1]
        filename = last_type.findall('./filename')[0].text
        duration = float(last_type.findall('./duration')[0].text)

        video_url = 'http://video2.spiegel.de/flash/' + filename
        video_ext = filename.rpartition('.')[2]
        info = {
            'id': video_id,
            'url': video_url,
            'ext': video_ext,
            'title': video_title,
            'duration': duration,
        }
        return [info]
class LiveLeakIE(InfoExtractor):
    """Information extractor for liveleak.com."""

    # The scheme is optional; both http and https are accepted.
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
    IE_NAME = u'liveleak'

    def _real_extract(self, url):
        """Extract url, title, description and uploader from a view page.

        Returns a single-element list holding the info dict; raises
        ExtractorError for a URL that does not match _VALID_URL.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('video_id')

        webpage = self._download_webpage(url, video_id)

        video_url = self._search_regex(r'file: "(.*?)",',
            webpage, u'video URL')

        # Strip the site prefix from the Open Graph title.
        video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
            webpage, u'title').replace('LiveLeak.com -', '').strip()

        video_description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
            webpage, u'description', fatal=False)

        video_uploader = self._html_search_regex(r'By:.*?(\w+)</a>',
            webpage, u'uploader', fatal=False)

        info = {
            'id': video_id,
            'url': video_url,
            # NOTE(review): container assumed to be mp4 - confirm.
            'ext': 'mp4',
            'title': video_title,
            'description': video_description,
            'uploader': video_uploader
        }

        return [info]
class TumblrIE(InfoExtractor):
    """Information extractor for tumblr.com video posts."""
    _VALID_URL = r'http://(?P<blog_name>.*?)\.tumblr\.com/((post)|(video))/(?P<id>\d*)/(.*?)'

    def _real_extract(self, url):
        """Extract the video file, thumbnail and title of a post.

        Returns a single-element list holding the info dict; raises
        ExtractorError when the URL is invalid or no video is found.
        """
        m_url = re.match(self._VALID_URL, url)
        if m_url is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = m_url.group('id')
        blog = m_url.group('blog_name')

        # Always fetch the canonical post page, whatever form the URL had.
        url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id)
        webpage = self._download_webpage(url, video_id)

        # The blog name is escaped so characters such as '.' match literally.
        re_video = r'src=\\x22(?P<video_url>http://%s\.tumblr\.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P<ext>.*?)\\x22' % (re.escape(blog), video_id)
        video = re.search(re_video, webpage)
        if video is None:
            raise ExtractorError(u'Unable to extract video')
        video_url = video.group('video_url')
        ext = video.group('ext')

        # We pick the first poster. 'thumb' is the only capturing group so
        # that it is the value handed back by the search helper.
        video_thumbnail = self._search_regex(r'posters(?:.*?)\[\\x22(?P<thumb>.*?)\\x22',
            webpage, u'thumbnail', fatal=False)
        if video_thumbnail:
            video_thumbnail = video_thumbnail.replace('\\', '')

        # The only place where you can get a title, it's not complete,
        # but searching in other places doesn't work for all videos
        video_title = self._html_search_regex(r'<title>(?P<title>.*?)</title>',
            webpage, u'title', flags=re.DOTALL)

        return [{'id': video_id,
                 'url': video_url,
                 'title': video_title,
                 'thumbnail': video_thumbnail,
                 'ext': ext
                 }]
class BandcampIE(InfoExtractor):
    """Information extractor for freely downloadable bandcamp.com tracks."""
    _VALID_URL = r'http://.*?\.bandcamp\.com/track/(?P<title>.*)'

    def _real_extract(self, url):
        """Resolve the final mp3-320 url of a free track.

        Returns a single-element list holding the info dict; raises
        ExtractorError when the URL is invalid, the track has no free
        download page, or one of the expected page fragments is missing.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        title = mobj.group('title')
        webpage = self._download_webpage(url, title)

        # We get the link to the free download page
        m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage)
        if m_download is None:
            raise ExtractorError(u'No free songs found')
        download_link = m_download.group(1)

        m_id = re.search(r'var TralbumData = {(.*?)id: (?P<id>\d*?)$',
                         webpage, re.MULTILINE|re.DOTALL)
        if m_id is None:
            raise ExtractorError(u'Unable to extract track id')
        track_id = m_id.group('id')

        download_webpage = self._download_webpage(download_link, track_id,
                                                  'Downloading free downloads page')
        # We get the dictionary of the track from some javascript code
        m_items = re.search(r'items: (.*?),$', download_webpage, re.MULTILINE)
        if m_items is None:
            raise ExtractorError(u'Unable to extract track information')
        info = json.loads(m_items.group(1))[0]

        # We pick mp3-320 for now, until format selection can be easily implemented.
        mp3_info = info[u'downloads'][u'mp3-320']
        # If we try to use this url it says the link has expired
        initial_url = mp3_info[u'url']
        re_url = r'(?P<server>http://(.*?)\.bandcamp\.com)/download/track\?enc=mp3-320&fsig=(?P<fsig>.*?)&id=(?P<id>.*?)&ts=(?P<ts>.*)$'
        m_url = re.match(re_url, initial_url)
        if m_url is None:
            raise ExtractorError(u'Unable to parse download url')

        # We build the url we will use to get the final track url
        # This url is built in Bandcamp in the script download_bundle_*.js
        request_url = '%s/statdownload/track?enc=mp3-320&fsig=%s&id=%s&ts=%s&.rand=665028774616&.vrs=1' % (m_url.group('server'), m_url.group('fsig'), track_id, m_url.group('ts'))
        final_url_webpage = self._download_webpage(request_url, track_id, 'Requesting download url')
        # If we could correctly generate the .rand field the url would be
        # in the "download_url" key
        m_final = re.search(r'"retry_url":"(.*?)"', final_url_webpage)
        if m_final is None:
            raise ExtractorError(u'Unable to extract final download url')
        final_url = m_final.group(1)

        track_info = {'id': track_id,
                      'title': info[u'title'],
                      'ext': 'mp3',
                      'url': final_url,
                      'thumbnail': info[u'thumb_url'],
                      'uploader': info[u'artist']
                      }

        return [track_info]
class RedTubeIE(InfoExtractor):
    """Information Extractor for redtube"""
    _VALID_URL = r'(?:http://)?(?:www\.)?redtube\.com/(?P<id>[0-9]+)'

    def _real_extract(self, url):
        """Extract the mp4 source and title of a video page.

        Returns a single-element list holding the info dict; raises
        ExtractorError for a URL that does not match _VALID_URL.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')
        video_extension = 'mp4'
        webpage = self._download_webpage(url, video_id)

        self.report_extraction(video_id)

        video_url = self._html_search_regex(r'<source src="(.+?)" type="video/mp4">',
            webpage, u'video URL')

        video_title = self._html_search_regex('<h1 class="videoTitle slidePanelMovable">(.+?)</h1>',
            webpage, u'title')

        return [{
            'id': video_id,
            'url': video_url,
            'ext': video_extension,
            'title': video_title,
        }]
class InaIE(InfoExtractor):
    """Information Extractor for Ina.fr"""
    _VALID_URL = r'(?:http://)?(?:www\.)?ina\.fr/video/(?P<id>I[0-9]+)/.*'

    def _real_extract(self, url):
        """Extract url and title from the player's MRSS notice.

        Returns a single-element list holding the info dict; raises
        ExtractorError for a URL that does not match _VALID_URL.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')
        mrss_url = 'http://player.ina.fr/notices/%s.mrss' % video_id
        video_extension = 'mp4'
        # The MRSS feed, not the HTML page, carries the media information.
        webpage = self._download_webpage(mrss_url, video_id)

        self.report_extraction(video_id)

        video_url = self._html_search_regex(r'<media:player url="(?P<mp4url>http://mp4\.ina\.fr/[^"]+\.mp4)',
            webpage, u'video URL')

        video_title = self._search_regex(r'<title><!\[CDATA\[(?P<titre>.*?)]]></title>',
            webpage, u'title')

        return [{
            'id': video_id,
            'url': video_url,
            'ext': video_extension,
            'title': video_title,
        }]
class HowcastIE(InfoExtractor):
    """Information Extractor for Howcast.com"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?howcast\.com/videos/(?P<id>\d+)'

    def _real_extract(self, url):
        """Extract the mobile mp4 url and metadata of a video page.

        Returns a single-element list holding the info dict; raises
        ExtractorError for a URL that does not match _VALID_URL.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')
        # The page is requested from a url rebuilt from the id alone.
        webpage_url = 'http://www.howcast.com/videos/' + video_id
        webpage = self._download_webpage(webpage_url, video_id)

        self.report_extraction(video_id)

        video_url = self._search_regex(r'\'?file\'?: "(http://mobile-media\.howcast\.com/[0-9]+\.mp4)',
            webpage, u'video URL')

        # The meta content may be quoted with either double or single quotes.
        video_title = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') property=\'og:title\'',
            webpage, u'title')

        video_description = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') name=\'description\'',
            webpage, u'description', fatal=False)

        thumbnail = self._html_search_regex(r'<meta content=\'(.+?)\' property=\'og:image\'',
            webpage, u'thumbnail', fatal=False)

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': video_title,
            'description': video_description,
            'thumbnail': thumbnail,
        }]
class VineIE(InfoExtractor):
    """Information Extractor for Vine.co"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?vine\.co/v/(?P<id>\w+)'

    def _real_extract(self, url):
        """Extract stream url, title, thumbnail and uploader of a vine.

        Returns a single-element list holding the info dict; raises
        ExtractorError for a URL that does not match _VALID_URL.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')
        webpage_url = 'https://vine.co/v/' + video_id
        webpage = self._download_webpage(webpage_url, video_id)

        self.report_extraction(video_id)

        video_url = self._html_search_regex(r'<meta property="twitter:player:stream" content="(.+?)"',
            webpage, u'video URL')

        video_title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',
            webpage, u'title')

        # The optional second group keeps a query string out of the url.
        thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)(\?.*?)?"',
            webpage, u'thumbnail', fatal=False)

        uploader = self._html_search_regex(r'<div class="user">.*?<h2>(.+?)</h2>',
            webpage, u'uploader', fatal=False, flags=re.DOTALL)

        return [{
            'id': video_id,
            'url': video_url,
            # NOTE(review): container assumed to be mp4 - confirm.
            'ext': 'mp4',
            'title': video_title,
            'thumbnail': thumbnail,
            'uploader': uploader,
        }]
class FlickrIE(InfoExtractor):
    """Information Extractor for Flickr videos"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?flickr\.com/photos/(?P<uploader_id>[\w\-_@]+)/(?P<id>\d+).*'

    def _real_extract(self, url):
        """Resolve the stream url of a Flickr video in three requests.

        The photo page yields a secret, the first XML a node id and the
        second XML the stream location.  Returns a single-element list
        holding the info dict; raises ExtractorError when the URL is
        invalid or the stream entry is missing.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')
        video_uploader_id = mobj.group('uploader_id')
        webpage_url = 'http://www.flickr.com/photos/' + video_uploader_id + '/' + video_id
        webpage = self._download_webpage(webpage_url, video_id)

        secret = self._search_regex(r"photo_secret: '(\w+)'", webpage, u'secret')

        first_url = 'https://secure.flickr.com/apps/video/video_mtl_xml.gne?v=x&photo_id=' + video_id + '&secret=' + secret + '&bitrate=700&target=_self'
        first_xml = self._download_webpage(first_url, video_id, 'Downloading first data webpage')

        node_id = self._html_search_regex(r'<Item id="id">(\d+-\d+)</Item>',
            first_xml, u'node_id')

        second_url = 'https://secure.flickr.com/video_playlist.gne?node_id=' + node_id + '&tech=flash&mode=playlist&bitrate=700&secret=' + secret + '&rd=video.yahoo.com&noad=1'
        second_xml = self._download_webpage(second_url, video_id, 'Downloading second data webpage')

        self.report_extraction(video_id)

        stream = re.search(r'<STREAM APP="(.+?)" FULLPATH="(.+?)"', second_xml)
        if stream is None:
            raise ExtractorError(u'Unable to extract video url')
        # APP is the base url; FULLPATH is HTML-escaped in the XML.
        video_url = stream.group(1) + unescapeHTML(stream.group(2))

        video_title = self._html_search_regex(r'<meta property="og:title" content=(?:"([^"]+)"|\'([^\']+)\')',
            webpage, u'video title')

        video_description = self._html_search_regex(r'<meta property="og:description" content=(?:"([^"]+)"|\'([^\']+)\')',
            webpage, u'description', fatal=False)

        thumbnail = self._html_search_regex(r'<meta property="og:image" content=(?:"([^"]+)"|\'([^\']+)\')',
            webpage, u'thumbnail', fatal=False)

        return [{
            'id': video_id,
            'url': video_url,
            # NOTE(review): container assumed to be mp4 - confirm.
            'ext': 'mp4',
            'title': video_title,
            'description': video_description,
            'thumbnail': thumbnail,
            'uploader_id': video_uploader_id,
        }]
class TeamcocoIE(InfoExtractor):
    """Information extractor for teamcoco.com."""
    _VALID_URL = r'http://teamcoco\.com/video/(?P<url_title>.*)'

    def _real_extract(self, url):
        """Extract metadata from the page and the file url from the CVP XML.

        Returns a single-element list holding the info dict; raises
        ExtractorError for a URL that does not match _VALID_URL.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        url_title = mobj.group('url_title')
        # The numeric id is unknown until the page is read, so the title
        # slug identifies this first download.
        webpage = self._download_webpage(url, url_title)

        video_id = self._html_search_regex(r'<article class="video" data-id="(\d+?)"',
            webpage, u'video id')

        self.report_extraction(video_id)

        video_title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',
            webpage, u'title')

        thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)"',
            webpage, u'thumbnail', fatal=False)

        video_description = self._html_search_regex(r'<meta property="og:description" content="(.*?)"',
            webpage, u'description', fatal=False)

        data_url = 'http://teamcoco.com/cvp/2.0/%s.xml' % video_id
        data = self._download_webpage(data_url, video_id, 'Downloading data webpage')

        video_url = self._html_search_regex(r'<file type="high".*?>(.*?)</file>',
            data, u'video URL')

        return [{
            'id': video_id,
            'url': video_url,
            # NOTE(review): container assumed to be mp4 - confirm.
            'ext': 'mp4',
            'title': video_title,
            'thumbnail': thumbnail,
            'description': video_description,
        }]
class XHamsterIE(InfoExtractor):
    """Information Extractor for xHamster"""
    _VALID_URL = r'(?:http://)?(?:www\.)?xhamster\.com/movies/(?P<id>[0-9]+)/.*\.html'

    def _real_extract(self, url):
        """Extract media url, title, upload date, uploader and thumbnail.

        Returns a single-element list holding the info dict; raises
        ExtractorError when the URL is invalid or the player parameters
        are missing from the page.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')
        mrss_url = 'http://xhamster.com/movies/%s/.html' % video_id
        webpage = self._download_webpage(mrss_url, video_id)

        media = re.search(r'\'srv\': \'(?P<server>[^\']*)\',\s*\'file\': \'(?P<file>[^\']+)\',', webpage)
        if media is None:
            raise ExtractorError(u'Unable to extract media URL')
        if len(media.group('server')) == 0:
            # Without a server the file entry is itself a quoted full url.
            video_url = compat_urllib_parse.unquote(media.group('file'))
        else:
            video_url = media.group('server')+'/key='+media.group('file')
        video_extension = video_url.split('.')[-1]

        video_title = self._html_search_regex(r'<title>(?P<title>.+?) - xHamster\.com</title>',
            webpage, u'title')

        # Can't see the description anywhere in the UI, so none is extracted.

        date = re.search(r'hint=\'(?P<upload_date_Y>[0-9]{4})-(?P<upload_date_m>[0-9]{2})-(?P<upload_date_d>[0-9]{2}) [0-9]{2}:[0-9]{2}:[0-9]{2} [A-Z]{3,4}\'', webpage)
        if date:
            video_upload_date = date.group('upload_date_Y')+date.group('upload_date_m')+date.group('upload_date_d')
        else:
            video_upload_date = None
            self._downloader.report_warning(u'Unable to extract upload date')

        video_uploader_id = self._html_search_regex(r'<a href=\'/user/[^>]+>(?P<uploader_id>[^<]+)',
            webpage, u'uploader id', default=u'anonymous')

        video_thumbnail = self._search_regex(r'\'image\':\'(?P<thumbnail>[^\']+)\'',
            webpage, u'thumbnail', fatal=False)

        return [{
            'id': video_id,
            'url': video_url,
            'ext': video_extension,
            'title': video_title,
            'upload_date': video_upload_date,
            'uploader_id': video_uploader_id,
            'thumbnail': video_thumbnail
        }]
class HypemIE(InfoExtractor):
    """Information Extractor for hypem"""
    _VALID_URL = r'(?:http://)?(?:www\.)?hypem\.com/track/([^/]+)/([^/]+)'

    def _real_extract(self, url):
        """Resolve the stream url of the first track listed on a page.

        Returns a single-element list holding the info dict; raises
        ExtractorError when the URL is invalid or either JSON payload
        cannot be decoded.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        track_id = mobj.group(1)

        data = {'ax': 1, 'ts': time.time()}
        data_encoded = compat_urllib_parse.urlencode(data)
        complete_url = url + "?" + data_encoded
        request = compat_urllib_request.Request(complete_url)
        response, urlh = self._download_webpage_handle(request, track_id, u'Downloading webpage with the url')
        # The cookie set here is sent back with the metadata request below.
        cookie = urlh.headers.get('Set-Cookie', '')

        self.report_extraction(track_id)

        html_tracks = self._html_search_regex(r'<script type="application/json" id="displayList-data">(.*?)</script>',
            response, u'tracks', flags=re.MULTILINE|re.DOTALL).strip()
        try:
            track_list = json.loads(html_tracks)
            track = track_list[u'tracks'][0]
        except ValueError:
            raise ExtractorError(u'Hypemachine contained invalid JSON.')

        key = track[u"key"]
        track_id = track[u"id"]
        artist = track[u"artist"]
        title = track[u"song"]

        serve_url = "http://hypem.com/serve/source/%s/%s" % (compat_str(track_id), compat_str(key))
        # An empty bytes body makes this a POST; Python 3 rejects str bodies.
        request = compat_urllib_request.Request(serve_url, b"", {'Content-Type': 'application/json'})
        request.add_header('cookie', cookie)
        song_data_json = self._download_webpage(request, track_id, u'Downloading metadata')
        try:
            song_data = json.loads(song_data_json)
        except ValueError:
            raise ExtractorError(u'Hypemachine contained invalid JSON.')
        final_url = song_data[u"url"]

        return [{
            'id': track_id,
            'url': final_url,
            # NOTE(review): container assumed to be mp3 - confirm.
            'ext': "mp3",
            'title': title,
            'artist': artist,
        }]
class Vbox7IE(InfoExtractor):
    """Information Extractor for Vbox7"""
    _VALID_URL = r'(?:http://)?(?:www\.)?vbox7\.com/play:([^/]+)'

    def _real_extract(self, url):
        """Follow the script redirect and query the media info endpoint.

        Returns a single-element list holding the info dict; raises
        ExtractorError when the URL is invalid or the info response does
        not consist of exactly two key=value fields.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group(1)

        # The first page only redirects through a window.location assignment.
        redirect_page, urlh = self._download_webpage_handle(url, video_id)
        new_location = self._search_regex(r'window\.location = \'(.*)\';', redirect_page, u'redirect location')
        redirect_url = urlh.geturl() + new_location
        webpage = self._download_webpage(redirect_url, video_id, u'Downloading redirect page')

        title = self._html_search_regex(r'<title>(.*)</title>',
            webpage, u'title').split('/')[0].strip()

        # NOTE(review): container assumed to be flv - confirm.
        ext = "flv"
        info_url = "http://vbox7.com/play/magare.do"
        # urlencode output is ASCII; bytes are required for POST on Python 3.
        data = compat_urllib_parse.urlencode({'as3': '1', 'vid': video_id}).encode('ascii')
        info_request = compat_urllib_request.Request(info_url, data)
        info_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
        info_response = self._download_webpage(info_request, video_id, u'Downloading info webpage')
        if info_response is None:
            raise ExtractorError(u'Unable to extract the media url')

        # Split each field on the first '=' only, so values that contain
        # '=' themselves are kept whole.
        fields = [part.split('=', 1) for part in info_response.split('&')]
        if len(fields) != 2 or any(len(field) != 2 for field in fields):
            raise ExtractorError(u'Unable to extract the media url')
        final_url, thumbnail_url = fields[0][1], fields[1][1]

        return [{
            'id': video_id,
            'url': final_url,
            'ext': ext,
            'title': title,
            'thumbnail': thumbnail_url,
        }]
886 def gen_extractors():
887 """ Return a list of an instance of every supported extractor.
888 The order does matter; the first extractor matched is the one handling the URL.
916 StanfordOpenClassroomIE(),
def get_info_extractor(ie_name):
    """Return the info extractor class with the given ie_name.

    The class is looked up among this module's globals under the name
    ie_name + 'IE'; a KeyError is raised when no such global exists.
    """
    return globals()[ie_name + 'IE']