10 import xml.etree.ElementTree
19 from .extractor.common import InfoExtractor, SearchInfoExtractor
21 from .extractor.ard import ARDIE
22 from .extractor.arte import ArteTvIE
23 from .extractor.bliptv import BlipTVIE, BlipTVUserIE
24 from .extractor.comedycentral import ComedyCentralIE
25 from .extractor.collegehumor import CollegeHumorIE
26 from .extractor.dailymotion import DailymotionIE
27 from .extractor.depositfiles import DepositFilesIE
28 from .extractor.escapist import EscapistIE
29 from .extractor.facebook import FacebookIE
30 from .extractor.funnyordie import FunnyOrDieIE
31 from .extractor.gametrailers import GametrailersIE
32 from .extractor.generic import GenericIE
33 from .extractor.googleplus import GooglePlusIE
34 from .extractor.googlesearch import GoogleSearchIE
35 from .extractor.infoq import InfoQIE
36 from .extractor.justintv import JustinTVIE
37 from .extractor.metacafe import MetacafeIE
38 from .extractor.mixcloud import MixcloudIE
39 from .extractor.mtv import MTVIE
40 from .extractor.myvideo import MyVideoIE
41 from .extractor.nba import NBAIE
42 from .extractor.statigram import StatigramIE
43 from .extractor.photobucket import PhotobucketIE
44 from .extractor.soundcloud import SoundcloudIE, SoundcloudSetIE
45 from .extractor.stanfordoc import StanfordOpenClassroomIE
46 from .extractor.steam import SteamIE
47 from .extractor.ted import TEDIE
48 from .extractor.vimeo import VimeoIE
49 from .extractor.worldstarhiphop import WorldStarHipHopIE
50 from .extractor.xnxx import XNXXIE
51 from .extractor.xvideos import XVideosIE
52 from .extractor.yahoo import YahooIE, YahooSearchIE
53 from .extractor.youku import YoukuIE
54 from .extractor.youtube import YoutubeIE, YoutubePlaylistIE, YoutubeSearchIE, YoutubeUserIE, YoutubeChannelIE
55 from .extractor.zdf import ZDFIE
# Extractor for recorded ustream.tv videos; _VALID_URL captures the numeric
# id as the 'videoID' group.
# NOTE(review): the embedded line numbers skip (e.g. 73 -> 76, 92 -> 100), so
# some statements of this class are not visible in this listing.
72 class UstreamIE(InfoExtractor):
73 _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
76 def _real_extract(self, url):
# NOTE(review): no None check on the match is visible here, so a URL that
# does not match would fail on m.group() - confirm callers pre-filter URLs.
77 m = re.match(self._VALID_URL, url)
78 video_id = m.group('videoID')
# The media URL is derived from the id alone; the page is downloaded for the
# metadata lookups that follow.
80 video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
81 webpage = self._download_webpage(url, video_id)
83 self.report_extraction(video_id)
85 video_title = self._html_search_regex(r'data-title="(?P<title>.+)"',
# Uploader and thumbnail are searched with fatal=False - presumably optional
# fields that may come back empty; verify against _html_search_regex.
88 uploader = self._html_search_regex(r'data-content-type="channel".*?>(?P<uploader>.*?)</a>',
89 webpage, u'uploader', fatal=False, flags=re.DOTALL)
91 thumbnail = self._html_search_regex(r'<link rel="image_src" href="(?P<thumb>.*?)"',
92 webpage, u'thumbnail', fatal=False)
100 'thumbnail': thumbnail,
# Extractor for rbmaradio.com show pages; the last URL path segment is
# captured as the 'videoID' group.
# NOTE(review): the embedded line numbers skip (e.g. 115 -> 118, 124 -> 129),
# so some statements of this class are not visible in this listing.
105 class RBMARadioIE(InfoExtractor):
106 _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'
108 def _real_extract(self, url):
109 m = re.match(self._VALID_URL, url)
110 video_id = m.group('videoID')
112 webpage = self._download_webpage(url, video_id)
# The show metadata sits in a JavaScript assignment ("gon.show=...;") in the
# page; the captured text is parsed as JSON below.
114 json_data = self._search_regex(r'window\.gon.*?gon\.show=(.+?);$',
115 webpage, u'json data', flags=re.MULTILINE)
118 data = json.loads(json_data)
# A JSON parse failure is re-raised as ExtractorError carrying the parser
# message.
119 except ValueError as e:
120 raise ExtractorError(u'Invalid JSON: ' + str(e))
# '&cbr=256' is appended to the Akamai URL - presumably a bitrate selector,
# TODO confirm. The extension is whatever follows the last '.' in the path.
122 video_url = data['akamai_url'] + '&cbr=256'
123 url_parts = compat_urllib_parse_urlparse(video_url)
124 video_ext = url_parts.path.rpartition('.')[2]
# Only 'title' is read with []; the remaining fields use .get(), so a missing
# key yields None instead of raising.
129 'title': data['title'],
130 'description': data.get('teaser_text'),
131 'location': data.get('country_of_origin'),
132 'uploader': data.get('host', {}).get('name'),
133 'uploader_id': data.get('host', {}).get('slug'),
134 'thumbnail': data.get('image', {}).get('large_url_2x'),
135 'duration': data.get('duration'),
# Extractor for youporn.com watch pages ('videoid' and 'title' groups in
# _VALID_URL), with helpers for listing and selecting download formats.
# NOTE(review): the embedded line numbers skip throughout (e.g. 152 -> 154,
# 208 -> 211, 212 -> 217), so several statements are not visible here.
140 class YouPornIE(InfoExtractor):
141 """Information extractor for youporn.com."""
142 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'
# Prints a two-column table (ext, format) for every entry in formats.
144 def _print_formats(self, formats):
145 """Print all available formats"""
146 print(u'Available formats:')
147 print(u'ext\t\tformat')
148 print(u'---------------------------------')
149 for format in formats:
150 print(u'%s\t\t%s' % (format['ext'], format['format']))
# Compares an entry's "format" value against req_format; the loop header and
# the return statements are not visible in this listing.
152 def _specific(self, req_format, formats):
154 if(x["format"]==req_format):
158 def _real_extract(self, url):
159 mobj = re.match(self._VALID_URL, url)
161 raise ExtractorError(u'Invalid URL: %s' % url)
162 video_id = mobj.group('videoid')
# The page is requested with an 'age_verified=1' cookie - presumably to get
# past an age gate; TODO confirm.
164 req = compat_urllib_request.Request(url)
165 req.add_header('Cookie', 'age_verified=1')
166 webpage = self._download_webpage(req, video_id)
168 # Get JSON parameters
169 json_params = self._search_regex(r'var currentVideo = new Video\((.*)\);', webpage, u'JSON parameters')
171 params = json.loads(json_params)
173 raise ExtractorError(u'Invalid JSON')
175 self.report_extraction(video_id)
177 video_title = params['title']
178 upload_date = unified_strdate(params['release_date_f'])
179 video_description = params['description']
180 video_uploader = params['submitted_by']
181 thumbnail = params['thumbnails'][0]['image']
# NOTE(review): sys.exc_info()[1] is an exception instance, so concatenating
# it to a str would itself raise TypeError - consider wrapping it in str().
183 raise ExtractorError('Missing JSON parameter: ' + sys.exc_info()[1])
185 # Get all of the formats available
186 DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
187 download_list_html = self._search_regex(DOWNLOAD_LIST_RE,
188 webpage, u'download list').strip()
190 # Get all of the links from the page
191 LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
192 links = re.findall(LINK_RE, download_list_html)
194 raise ExtractorError(u'ERROR: no known formats available for video')
196 self.to_screen(u'Links found: %d' % len(links))
201 # A link looks like this:
202 # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
203 # A path looks like this:
204 # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
# The format label comes from element [4] of the '/'-split path
# ('480p_370k_8004515' in the sample above): its first two '_' fields are
# joined with '-', giving '480p-370k'. The extension is the path suffix
# without its leading dot.
205 video_url = unescapeHTML( link )
206 path = compat_urllib_parse_urlparse( video_url ).path
207 extension = os.path.splitext( path )[1][1:]
208 format = path.split('/')[4].split('_')[:2]
211 format = "-".join( format )
212 # title = u'%s-%s-%s' % (video_title, size, bitrate)
217 'uploader': video_uploader,
218 'upload_date': upload_date,
219 'title': video_title,
222 'thumbnail': thumbnail,
223 'description': video_description
# 'listformats' and 'format' are read from the downloader params. 'best' (or
# no request), 'worst' and '-1'/'all' each get their own branch; the visible
# fallback goes through _specific() and raises when nothing is available.
226 if self._downloader.params.get('listformats', None):
227 self._print_formats(formats)
230 req_format = self._downloader.params.get('format', None)
231 self.to_screen(u'Format: %s' % req_format)
233 if req_format is None or req_format == 'best':
235 elif req_format == 'worst':
237 elif req_format in ('-1', 'all'):
240 format = self._specific( req_format, formats )
242 raise ExtractorError(u'Requested format not available')
# Extractor for pornotube.com media pages; _VALID_URL captures an optional
# 'channel' id plus the 'videoid' and 'title' groups.
# NOTE(review): the embedded line numbers skip (e.g. 252 -> 254, 272 -> 275),
# so some statements of this class are not visible in this listing.
247 class PornotubeIE(InfoExtractor):
248 """Information extractor for pornotube.com."""
249 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'
251 def _real_extract(self, url):
252 mobj = re.match(self._VALID_URL, url)
254 raise ExtractorError(u'Invalid URL: %s' % url)
# Both the id and the title are taken from the URL itself, not from the page.
256 video_id = mobj.group('videoid')
257 video_title = mobj.group('title')
259 # Get webpage content
260 webpage = self._download_webpage(url, video_id)
# The .flv URL found in the page is percent-decoded before use.
# NOTE(review): the dots in 'video[0-9].pornotube.com' are unescaped, so they
# match any character.
263 VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
264 video_url = self._search_regex(VIDEO_URL_RE, webpage, u'video url')
265 video_url = compat_urllib_parse.unquote(video_url)
267 #Get the uploaded date
# The date is optional (fatal=False) and only normalised through
# unified_strdate when something was found.
268 VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
269 upload_date = self._html_search_regex(VIDEO_UPLOADED_RE, webpage, u'upload date', fatal=False)
270 if upload_date: upload_date = unified_strdate(upload_date)
272 info = {'id': video_id,
275 'upload_date': upload_date,
276 'title': video_title,
# Extractor for youjizz.com video pages. The watch page supplies the title
# and a link to an embed page; the embed page supplies the media URL.
# NOTE(review): the embedded line numbers skip (e.g. 287 -> 289, 301 -> 303),
# so some statements of this class are not visible in this listing.
# NOTE(review): the '.' before 'html$' in _VALID_URL and the dots in
# 'www.youjizz.com' below are unescaped, so they match any character.
282 class YouJizzIE(InfoExtractor):
283 """Information extractor for youjizz.com."""
284 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'
286 def _real_extract(self, url):
287 mobj = re.match(self._VALID_URL, url)
289 raise ExtractorError(u'Invalid URL: %s' % url)
291 video_id = mobj.group('videoid')
293 # Get webpage content
294 webpage = self._download_webpage(url, video_id)
296 # Get the video title
297 video_title = self._html_search_regex(r'<title>(?P<title>.*)</title>',
298 webpage, u'title').strip()
# Locate the embed page URL inside the watch page.
301 result = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
303 raise ExtractorError(u'ERROR: unable to extract embed page')
# From here on video_id is the numeric id from the embed URL (replacing the
# URL-derived one) and webpage holds the embed page.
305 embed_page_url = result.group(0).strip()
306 video_id = result.group('videoid')
308 webpage = self._download_webpage(embed_page_url, video_id)
311 video_url = self._search_regex(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);',
312 webpage, u'video URL')
314 info = {'id': video_id,
316 'title': video_title,
319 'player_url': embed_page_url}
# Extractor for 8tracks.com mixes: reads the mix description embedded in the
# page, then queries the play/next API one track at a time.
# NOTE(review): the embedded line numbers skip (e.g. 336 -> 338, 341 -> 344),
# so some statements are not visible; in particular mix_id and the initial
# next_url are used below but their assignments are not shown.
323 class EightTracksIE(InfoExtractor):
325 _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'
327 def _real_extract(self, url):
328 mobj = re.match(self._VALID_URL, url)
330 raise ExtractorError(u'Invalid URL: %s' % url)
331 playlist_id = mobj.group('id')
333 webpage = self._download_webpage(url, playlist_id)
# The mix data is a JavaScript assignment ("PAGE.mix = ...;") parsed as JSON.
335 json_like = self._search_regex(r"PAGE.mix = (.*?);\n", webpage, u'trax information', flags=re.DOTALL)
336 data = json.loads(json_like)
# A random session number is generated once and sent with every API URL.
338 session = str(random.randint(0, 1000000000))
340 track_count = data['tracks_count']
341 first_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
# itertools.count() has no upper bound; i is only used for the progress note.
344 for i in itertools.count():
345 api_json = self._download_webpage(next_url, playlist_id,
346 note=u'Downloading song information %s/%s' % (str(i+1), track_count),
347 errnote=u'Failed to download song information')
348 api_data = json.loads(api_json)
349 track_data = api_data[u'set']['track']
351 'id': track_data['id'],
352 'url': track_data['track_file_stream_url'],
353 'title': track_data['performer'] + u' - ' + track_data['name'],
354 'raw_title': track_data['name'],
355 'uploader_id': data['user']['login'],
# The API flags the last track; the statement this condition guards is not
# visible here (presumably it ends the loop - TODO confirm). Otherwise the
# next request names the track that was just read.
359 if api_data['set']['at_last_track']:
361 next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
# Extractor for keek.com videos; _VALID_URL accepts both 'keek.com/!<id>' and
# 'keek.com/<user>/keeks/<id>' and captures the id as 'videoID'.
# NOTE(review): the embedded line numbers skip (e.g. 376 -> 379, 380 -> 386),
# so some statements of this class are not visible in this listing.
364 class KeekIE(InfoExtractor):
365 _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
368 def _real_extract(self, url):
369 m = re.match(self._VALID_URL, url)
370 video_id = m.group('videoID')
# Media and thumbnail URLs are built from the id alone; the page is only
# downloaded for the title and uploader lookups.
372 video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
373 thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id
374 webpage = self._download_webpage(url, video_id)
376 video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
# The uploader lookup is non-fatal (fatal=False).
379 uploader = self._html_search_regex(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>',
380 webpage, u'uploader', fatal=False)
386 'title': video_title,
387 'thumbnail': thumbnail,
# Extractor for myspass.de: derives the video id from the URL path and reads
# all metadata from the site's XML metadata endpoint.
# NOTE(review): the embedded line numbers skip (e.g. 402 -> 404, 417 -> 419,
# 422 -> 425), so the conditions guarding some statements are not visible.
393 class MySpassIE(InfoExtractor):
394 _VALID_URL = r'http://www.myspass.de/.*'
396 def _real_extract(self, url):
397 META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'
399 # video id is the last path element of the URL
400 # usually there is a trailing slash, so also try the second-to-last
401 url_path = compat_urllib_parse_urlparse(url).path
402 url_parent_path, video_id = os.path.split(url_path)
404 _, video_id = os.path.split(url_parent_path)
# The downloaded text is re-encoded to UTF-8 bytes before being handed to the
# XML parser.
407 metadata_url = META_DATA_URL_TEMPLATE % video_id
408 metadata_text = self._download_webpage(metadata_url, video_id)
409 metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))
411 # extract values from metadata
# <url_flv> is mandatory (its absence raises); the extension is the suffix
# of that URL without the leading dot.
412 url_flv_el = metadata.find('url_flv')
413 if url_flv_el is None:
414 raise ExtractorError(u'Unable to extract download url')
415 video_url = url_flv_el.text
416 extension = os.path.splitext(video_url)[1][1:]
417 title_el = metadata.find('title')
419 raise ExtractorError(u'Unable to extract title')
420 title = title_el.text
# <format_id>, <description> and <imagePreview> are each tested against None
# before their text is used; the fallback assignments are not visible here.
421 format_id_el = metadata.find('format_id')
422 if format_id_el is None:
425 format = format_id_el.text
426 description_el = metadata.find('description')
427 if description_el is not None:
428 description = description_el.text
431 imagePreview_el = metadata.find('imagePreview')
432 if imagePreview_el is not None:
433 thumbnail = imagePreview_el.text
442 'thumbnail': thumbnail,
443 'description': description
# Extractor for spiegel.de video pages; the trailing number of the page name
# is captured as 'videoID' and also names the XML file with the stream data.
# NOTE(review): the embedded line numbers skip (e.g. 456 -> 459, 463 -> 465),
# so some statements are not visible; last_type is used below but its
# assignment is not shown.
447 class SpiegelIE(InfoExtractor):
448 _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'
450 def _real_extract(self, url):
451 m = re.match(self._VALID_URL, url)
452 video_id = m.group('videoID')
454 webpage = self._download_webpage(url, video_id)
456 video_title = self._html_search_regex(r'<div class="module-title">(.*?)</div>',
# A second download fetches '<video_id>.xml' from video2.spiegel.de.
459 xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
460 xml_code = self._download_webpage(xml_url, video_id,
461 note=u'Downloading XML', errnote=u'Failed to download XML')
463 idoc = xml.etree.ElementTree.fromstring(xml_code)
# The first <filename> and <duration> children of last_type are used; the
# duration text is converted to float.
465 filename = last_type.findall('./filename')[0].text
466 duration = float(last_type.findall('./duration')[0].text)
# The media lives next to the XML; the extension is whatever follows the
# last '.' in the file name.
468 video_url = 'http://video2.spiegel.de/flash/' + filename
469 video_ext = filename.rpartition('.')[2]
474 'title': video_title,
475 'duration': duration,
# Extractor for liveleak.com 'view?i=<id>' pages; the id is captured as the
# 'video_id' group.
# NOTE(review): the embedded line numbers skip (e.g. 485 -> 487, 503 -> 509),
# so some statements of this class are not visible in this listing.
# NOTE(review): in _VALID_URL 'http?' makes only the 'p' optional, so the
# scheme part matches 'http://' but not 'https://' - 'https?' was probably
# intended; confirm before changing.
479 class LiveLeakIE(InfoExtractor):
481 _VALID_URL = r'^(?:http?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
482 IE_NAME = u'liveleak'
484 def _real_extract(self, url):
485 mobj = re.match(self._VALID_URL, url)
487 raise ExtractorError(u'Invalid URL: %s' % url)
489 video_id = mobj.group('video_id')
491 webpage = self._download_webpage(url, video_id)
493 video_url = self._search_regex(r'file: "(.*?)",',
494 webpage, u'video URL')
# The site prefix 'LiveLeak.com -' is removed from the og:title value.
496 video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
497 webpage, u'title').replace('LiveLeak.com -', '').strip()
# Description and uploader lookups are non-fatal (fatal=False).
499 video_description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
500 webpage, u'description', fatal=False)
502 video_uploader = self._html_search_regex(r'By:.*?(\w+)</a>',
503 webpage, u'uploader', fatal=False)
509 'title': video_title,
510 'description': video_description,
511 'uploader': video_uploader
# Extractor for tumblr.com post/video pages; _VALID_URL captures the blog
# name ('blog_name') and the numeric post id ('id').
# NOTE(review): the embedded line numbers skip (e.g. 530 -> 532, 545 -> 547),
# so some statements of this class are not visible in this listing.
518 class TumblrIE(InfoExtractor):
519 _VALID_URL = r'http://(?P<blog_name>.*?)\.tumblr\.com/((post)|(video))/(?P<id>\d*)/(.*?)'
521 def _real_extract(self, url):
522 m_url = re.match(self._VALID_URL, url)
523 video_id = m_url.group('id')
524 blog = m_url.group('blog_name')
# The '/post/' form of the URL is rebuilt from blog and id and downloaded,
# whichever form was passed in.
526 url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id)
527 webpage = self._download_webpage(url, video_id)
# The patterns below match quotes written as the literal text \x22 in the
# page source; the media type after 'video/' becomes the extension.
529 re_video = r'src=\\x22(?P<video_url>http://%s\.tumblr\.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P<ext>.*?)\\x22' % (blog, video_id)
530 video = re.search(re_video, webpage)
532 raise ExtractorError(u'Unable to extract video')
533 video_url = video.group('video_url')
534 ext = video.group('ext')
# Backslashes are stripped from the poster URL when one was found.
536 video_thumbnail = self._search_regex(r'posters(.*?)\[\\x22(?P<thumb>.*?)\\x22',
537 webpage, u'thumbnail', fatal=False) # We pick the first poster
538 if video_thumbnail: video_thumbnail = video_thumbnail.replace('\\', '')
540 # The only place where you can get a title, it's not complete,
541 # but searching in other places doesn't work for all videos
542 video_title = self._html_search_regex(r'<title>(?P<title>.*?)</title>',
543 webpage, u'title', flags=re.DOTALL)
545 return [{'id': video_id,
547 'title': video_title,
548 'thumbnail': video_thumbnail,
# Extractor for bandcamp.com track pages that offer a free download: follows
# the free-download page, then asks the 'statdownload' endpoint for the final
# mp3-320 URL.
# NOTE(review): the embedded line numbers skip (e.g. 562 -> 564, 589 -> 592),
# so some statements of this class are not visible in this listing.
# NOTE(review): the local name 'id' shadows the builtin, and the
# re.search(...).group(...) calls on lines 565-566, 571-572 and 586 have no
# None check, so a page without those markers fails with AttributeError.
552 class BandcampIE(InfoExtractor):
553 _VALID_URL = r'http://.*?\.bandcamp\.com/track/(?P<title>.*)'
555 def _real_extract(self, url):
556 mobj = re.match(self._VALID_URL, url)
557 title = mobj.group('title')
558 webpage = self._download_webpage(url, title)
559 # We get the link to the free download page
560 m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage)
561 if m_download is None:
562 raise ExtractorError(u'No free songs found')
564 download_link = m_download.group(1)
565 id = re.search(r'var TralbumData = {(.*?)id: (?P<id>\d*?)$',
566 webpage, re.MULTILINE|re.DOTALL).group('id')
568 download_webpage = self._download_webpage(download_link, id,
569 'Downloading free downloads page')
570 # We get the dictionary of the track from some JavaScript code
571 info = re.search(r'items: (.*?),$',
572 download_webpage, re.MULTILINE).group(1)
573 info = json.loads(info)[0]
574 # We pick mp3-320 for now, until format selection can be easily implemented.
575 mp3_info = info[u'downloads'][u'mp3-320']
576 # If we try to use this url it says the link has expired
577 initial_url = mp3_info[u'url']
# The server, fsig and ts parts of the expiring URL are reused below; the id
# in the rebuilt URL is the one scraped from TralbumData, not the 'id' group.
578 re_url = r'(?P<server>http://(.*?)\.bandcamp\.com)/download/track\?enc=mp3-320&fsig=(?P<fsig>.*?)&id=(?P<id>.*?)&ts=(?P<ts>.*)$'
579 m_url = re.match(re_url, initial_url)
580 # We build the url we will use to get the final track url
581 # This url is built in Bandcamp in the script download_bunde_*.js
582 request_url = '%s/statdownload/track?enc=mp3-320&fsig=%s&id=%s&ts=%s&.rand=665028774616&.vrs=1' % (m_url.group('server'), m_url.group('fsig'), id, m_url.group('ts'))
583 final_url_webpage = self._download_webpage(request_url, id, 'Requesting download url')
584 # If we could correctly generate the .rand field the url would be
585 # in the "download_url" key
586 final_url = re.search(r'"retry_url":"(.*?)"', final_url_webpage).group(1)
588 track_info = {'id':id,
589 'title' : info[u'title'],
592 'thumbnail' : info[u'thumb_url'],
593 'uploader' : info[u'artist']
# Extractor for redtube.com; the numeric path component is captured as 'id'.
# NOTE(review): the embedded line numbers skip (e.g. 603 -> 605, 616 -> 622),
# so some statements of this class are not visible in this listing.
598 class RedTubeIE(InfoExtractor):
599 """Information Extractor for redtube"""
600 _VALID_URL = r'(?:http://)?(?:www\.)?redtube\.com/(?P<id>[0-9]+)'
602 def _real_extract(self,url):
603 mobj = re.match(self._VALID_URL, url)
605 raise ExtractorError(u'Invalid URL: %s' % url)
607 video_id = mobj.group('id')
# The extension is fixed to 'mp4', matching the type="video/mp4" <source>
# tag the media URL is read from.
608 video_extension = 'mp4'
609 webpage = self._download_webpage(url, video_id)
611 self.report_extraction(video_id)
613 video_url = self._html_search_regex(r'<source src="(.+?)" type="video/mp4">',
614 webpage, u'video URL')
616 video_title = self._html_search_regex('<h1 class="videoTitle slidePanelMovable">(.+?)</h1>',
622 'ext': video_extension,
623 'title': video_title,
# Extractor for ina.fr videos; ids have the form 'I<digits>' and are captured
# as 'id'.
# NOTE(review): the embedded line numbers skip (e.g. 631 -> 633, 643 -> 649),
# so some statements of this class are not visible in this listing.
626 class InaIE(InfoExtractor):
627 """Information Extractor for Ina.fr"""
628 _VALID_URL = r'(?:http://)?(?:www\.)?ina\.fr/video/(?P<id>I[0-9]+)/.*'
630 def _real_extract(self,url):
631 mobj = re.match(self._VALID_URL, url)
# The original page is never downloaded: 'webpage' holds the '.mrss' notice
# fetched from player.ina.fr for this id.
633 video_id = mobj.group('id')
634 mrss_url='http://player.ina.fr/notices/%s.mrss' % video_id
635 video_extension = 'mp4'
636 webpage = self._download_webpage(mrss_url, video_id)
638 self.report_extraction(video_id)
640 video_url = self._html_search_regex(r'<media:player url="(?P<mp4url>http://mp4.ina.fr/[^"]+\.mp4)',
641 webpage, u'video URL')
# The title is read from inside the CDATA section of <title>.
643 video_title = self._search_regex(r'<title><!\[CDATA\[(?P<titre>.*?)]]></title>',
649 'ext': video_extension,
650 'title': video_title,
# Extractor for howcast.com videos; the numeric id is captured as 'id'.
# NOTE(review): the embedded line numbers skip (e.g. 658 -> 660, 669 -> 672),
# so some statements of this class are not visible in this listing.
653 class HowcastIE(InfoExtractor):
654 """Information Extractor for Howcast.com"""
655 _VALID_URL = r'(?:https?://)?(?:www\.)?howcast\.com/videos/(?P<id>\d+)'
657 def _real_extract(self, url):
658 mobj = re.match(self._VALID_URL, url)
# The page is always fetched from the URL rebuilt from the id, not from the
# URL that was passed in.
660 video_id = mobj.group('id')
661 webpage_url = 'http://www.howcast.com/videos/' + video_id
662 webpage = self._download_webpage(webpage_url, video_id)
664 self.report_extraction(video_id)
666 video_url = self._search_regex(r'\'?file\'?: "(http://mobile-media\.howcast\.com/[0-9]+\.mp4)',
667 webpage, u'video URL')
# The title and description patterns accept the content attribute in either
# double or single quotes (two alternative capture groups).
669 video_title = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') property=\'og:title\'',
672 video_description = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') name=\'description\'',
673 webpage, u'description', fatal=False)
675 thumbnail = self._html_search_regex(r'<meta content=\'(.+?)\' property=\'og:image\'',
676 webpage, u'thumbnail', fatal=False)
682 'title': video_title,
683 'description': video_description,
684 'thumbnail': thumbnail,
# Extractor for vine.co videos; the id after '/v/' is captured as 'id'.
# NOTE(review): the embedded line numbers skip (e.g. 692 -> 694, 703 -> 706),
# so some statements of this class are not visible in this listing.
687 class VineIE(InfoExtractor):
688 """Information Extractor for Vine.co"""
689 _VALID_URL = r'(?:https?://)?(?:www\.)?vine\.co/v/(?P<id>\w+)'
691 def _real_extract(self, url):
692 mobj = re.match(self._VALID_URL, url)
# The page is always fetched over https from the URL rebuilt from the id.
694 video_id = mobj.group('id')
695 webpage_url = 'https://vine.co/v/' + video_id
696 webpage = self._download_webpage(webpage_url, video_id)
698 self.report_extraction(video_id)
700 video_url = self._html_search_regex(r'<meta property="twitter:player:stream" content="(.+?)"',
701 webpage, u'video URL')
703 video_title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',
# The second group swallows an optional '?...' query string - presumably so
# the returned thumbnail URL excludes it; verify which group the helper
# returns.
706 thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)(\?.*?)?"',
707 webpage, u'thumbnail', fatal=False)
709 uploader = self._html_search_regex(r'<div class="user">.*?<h2>(.+?)</h2>',
710 webpage, u'uploader', fatal=False, flags=re.DOTALL)
716 'title': video_title,
717 'thumbnail': thumbnail,
718 'uploader': uploader,
# Extractor for Flickr videos; _VALID_URL captures the owner ('uploader_id')
# and the numeric photo id ('id'). The media URL is resolved through two XML
# requests after reading a secret from the photo page.
# NOTE(review): the embedded line numbers skip (e.g. 726 -> 728, 746 -> 748),
# so some statements of this class are not visible in this listing.
721 class FlickrIE(InfoExtractor):
722 """Information Extractor for Flickr videos"""
723 _VALID_URL = r'(?:https?://)?(?:www\.)?flickr\.com/photos/(?P<uploader_id>[\w\-_@]+)/(?P<id>\d+).*'
725 def _real_extract(self, url):
726 mobj = re.match(self._VALID_URL, url)
728 video_id = mobj.group('id')
729 video_uploader_id = mobj.group('uploader_id')
730 webpage_url = 'http://www.flickr.com/photos/' + video_uploader_id + '/' + video_id
731 webpage = self._download_webpage(webpage_url, video_id)
# The photo_secret found in the page is required by both requests below.
733 secret = self._search_regex(r"photo_secret: '(\w+)'", webpage, u'secret')
# Step 1: exchange photo id + secret for a node id.
735 first_url = 'https://secure.flickr.com/apps/video/video_mtl_xml.gne?v=x&photo_id=' + video_id + '&secret=' + secret + '&bitrate=700&target=_self'
736 first_xml = self._download_webpage(first_url, video_id, 'Downloading first data webpage')
738 node_id = self._html_search_regex(r'<Item id="id">(\d+-\d+)</Item>',
739 first_xml, u'node_id')
# Step 2: fetch the playlist for that node id.
741 second_url = 'https://secure.flickr.com/video_playlist.gne?node_id=' + node_id + '&tech=flash&mode=playlist&bitrate=700&secret=' + secret + '&rd=video.yahoo.com&noad=1'
742 second_xml = self._download_webpage(second_url, video_id, 'Downloading second data webpage')
744 self.report_extraction(video_id)
# The media URL is the STREAM element's APP value followed by its
# HTML-unescaped FULLPATH value.
746 mobj = re.search(r'<STREAM APP="(.+?)" FULLPATH="(.+?)"', second_xml)
748 raise ExtractorError(u'Unable to extract video url')
749 video_url = mobj.group(1) + unescapeHTML(mobj.group(2))
# Title, description and thumbnail come from the og: meta tags of the photo
# page; only the title lookup is fatal.
751 video_title = self._html_search_regex(r'<meta property="og:title" content=(?:"([^"]+)"|\'([^\']+)\')',
752 webpage, u'video title')
754 video_description = self._html_search_regex(r'<meta property="og:description" content=(?:"([^"]+)"|\'([^\']+)\')',
755 webpage, u'description', fatal=False)
757 thumbnail = self._html_search_regex(r'<meta property="og:image" content=(?:"([^"]+)"|\'([^\']+)\')',
758 webpage, u'thumbnail', fatal=False)
764 'title': video_title,
765 'description': video_description,
766 'thumbnail': thumbnail,
767 'uploader_id': video_uploader_id,
# Extractor for teamcoco.com video pages; everything after '/video/' is
# captured as 'url_title' and used as the display id for the first download.
# NOTE(review): the embedded line numbers skip (e.g. 774 -> 776, 797 -> 804),
# so some statements of this class are not visible in this listing.
770 class TeamcocoIE(InfoExtractor):
771 _VALID_URL = r'http://teamcoco\.com/video/(?P<url_title>.*)'
773 def _real_extract(self, url):
774 mobj = re.match(self._VALID_URL, url)
776 raise ExtractorError(u'Invalid URL: %s' % url)
777 url_title = mobj.group('url_title')
778 webpage = self._download_webpage(url, url_title)
# The numeric video id is not in the URL; it is read from the data-id
# attribute of the <article class="video"> tag.
780 video_id = self._html_search_regex(r'<article class="video" data-id="(\d+?)"',
781 webpage, u'video id')
783 self.report_extraction(video_id)
785 video_title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',
788 thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)"',
789 webpage, u'thumbnail', fatal=False)
791 video_description = self._html_search_regex(r'<meta property="og:description" content="(.*?)"',
792 webpage, u'description', fatal=False)
# The media URL comes from a per-video XML document; the <file type="high">
# entry is the one selected.
794 data_url = 'http://teamcoco.com/cvp/2.0/%s.xml' % video_id
795 data = self._download_webpage(data_url, video_id, 'Downloading data webpage')
797 video_url = self._html_search_regex(r'<file type="high".*?>(.*?)</file>',
804 'title': video_title,
805 'thumbnail': thumbnail,
806 'description': video_description,
# Extractor for xhamster.com movie pages; the numeric id is captured as 'id'.
# NOTE(review): the embedded line numbers skip (e.g. 814 -> 816, 820 -> 822,
# 837 -> 839), so the conditions guarding some statements are not visible.
# NOTE(review): the dot in '(?:www.)?' of _VALID_URL is unescaped, so it
# matches any character.
809 class XHamsterIE(InfoExtractor):
810 """Information Extractor for xHamster"""
811 _VALID_URL = r'(?:http://)?(?:www.)?xhamster\.com/movies/(?P<id>[0-9]+)/.*\.html'
813 def _real_extract(self,url):
814 mobj = re.match(self._VALID_URL, url)
# Despite its name, mrss_url is the movie page URL rebuilt from the id (with
# an empty title part).
816 video_id = mobj.group('id')
817 mrss_url = 'http://xhamster.com/movies/%s/.html' % video_id
818 webpage = self._download_webpage(mrss_url, video_id)
# The page carries 'srv' and 'file' values. With an empty 'srv' the
# percent-decoded 'file' is used as the whole URL; the other visible
# assignment joins server and file with '/key='.
820 mobj = re.search(r'\'srv\': \'(?P<server>[^\']*)\',\s*\'file\': \'(?P<file>[^\']+)\',', webpage)
822 raise ExtractorError(u'Unable to extract media URL')
823 if len(mobj.group('server')) == 0:
824 video_url = compat_urllib_parse.unquote(mobj.group('file'))
826 video_url = mobj.group('server')+'/key='+mobj.group('file')
# The extension is whatever follows the last '.' anywhere in the URL.
827 video_extension = video_url.split('.')[-1]
829 video_title = self._html_search_regex(r'<title>(?P<title>.+?) - xHamster\.com</title>',
832 # Can't see the description anywhere in the UI
833 # video_description = self._html_search_regex(r'<span>Description: </span>(?P<description>[^<]+)',
834 # webpage, u'description', fatal=False)
835 # if video_description: video_description = unescapeHTML(video_description)
# The upload date is assembled as YYYYMMDD from the timestamp in a hint='...'
# attribute; the other visible branch stores None and reports a warning.
837 mobj = re.search(r'hint=\'(?P<upload_date_Y>[0-9]{4})-(?P<upload_date_m>[0-9]{2})-(?P<upload_date_d>[0-9]{2}) [0-9]{2}:[0-9]{2}:[0-9]{2} [A-Z]{3,4}\'', webpage)
839 video_upload_date = mobj.group('upload_date_Y')+mobj.group('upload_date_m')+mobj.group('upload_date_d')
841 video_upload_date = None
842 self._downloader.report_warning(u'Unable to extract upload date')
# The uploader lookup is given default=u'anonymous' for pages without a user
# link.
844 video_uploader_id = self._html_search_regex(r'<a href=\'/user/[^>]+>(?P<uploader_id>[^<]+)',
845 webpage, u'uploader id', default=u'anonymous')
847 video_thumbnail = self._search_regex(r'\'image\':\'(?P<thumbnail>[^\']+)\'',
848 webpage, u'thumbnail', fatal=False)
853 'ext': video_extension,
854 'title': video_title,
855 # 'description': video_description,
856 'upload_date': video_upload_date,
857 'uploader_id': video_uploader_id,
858 'thumbnail': video_thumbnail
# Extractor for hypem.com track pages: loads the page with extra query
# parameters, reads the first track of the embedded JSON list, then asks the
# '/serve/source/' endpoint for the final URL.
# NOTE(review): the embedded line numbers skip (e.g. 866 -> 868, 884 -> 886,
# 891 -> 893), so some statements are not visible; 'key' is used below but
# its assignment is not shown.
861 class HypemIE(InfoExtractor):
862 """Information Extractor for hypem"""
863 _VALID_URL = r'(?:http://)?(?:www\.)?hypem\.com/track/([^/]+)/([^/]+)'
865 def _real_extract(self, url):
866 mobj = re.match(self._VALID_URL, url)
868 raise ExtractorError(u'Invalid URL: %s' % url)
869 track_id = mobj.group(1)
# 'ax=1' and the current timestamp are appended as a query string.
# NOTE(review): '?' is always used, even if url already carries a query.
871 data = { 'ax': 1, 'ts': time.time() }
872 data_encoded = compat_urllib_parse.urlencode(data)
873 complete_url = url + "?" + data_encoded
874 request = compat_urllib_request.Request(complete_url)
875 response, urlh = self._download_webpage_handle(request, track_id, u'Downloading webpage with the url')
# The Set-Cookie header of this response is replayed on the serve request
# further down.
876 cookie = urlh.headers.get('Set-Cookie', '')
878 self.report_extraction(track_id)
880 html_tracks = self._html_search_regex(r'<script type="application/json" id="displayList-data">(.*?)</script>',
881 response, u'tracks', flags=re.MULTILINE|re.DOTALL).strip()
# Only the first entry of the 'tracks' list is used.
883 track_list = json.loads(html_tracks)
884 track = track_list[u'tracks'][0]
886 raise ExtractorError(u'Hypemachine contained invalid JSON.')
# From here on track_id is the id reported by the JSON, replacing the one
# taken from the URL.
889 track_id = track[u"id"]
890 artist = track[u"artist"]
891 title = track[u"song"]
# An empty request body is passed together with a JSON content type -
# presumably to force a POST; TODO confirm against the Request alias.
893 serve_url = "http://hypem.com/serve/source/%s/%s" % (compat_str(track_id), compat_str(key))
894 request = compat_urllib_request.Request(serve_url, "" , {'Content-Type': 'application/json'})
895 request.add_header('cookie', cookie)
896 song_data_json = self._download_webpage(request, track_id, u'Downloading metadata')
898 song_data = json.loads(song_data_json)
900 raise ExtractorError(u'Hypemachine contained invalid JSON.')
901 final_url = song_data[u"url"]
# Extractor for vbox7.com 'play:<id>' pages: follows a JavaScript redirect
# for the title, then posts the id to 'magare.do' for the media and
# thumbnail URLs.
# NOTE(review): the embedded line numbers skip (e.g. 916 -> 918, 937 -> 944),
# so some statements of this class are not visible in this listing.
911 class Vbox7IE(InfoExtractor):
912 """Information Extractor for Vbox7"""
913 _VALID_URL = r'(?:http://)?(?:www\.)?vbox7\.com/play:([^/]+)'
915 def _real_extract(self,url):
916 mobj = re.match(self._VALID_URL, url)
918 raise ExtractorError(u'Invalid URL: %s' % url)
919 video_id = mobj.group(1)
# The first page only sets window.location; that value is appended to the
# URL the first request ended up at, and the resulting page is downloaded.
921 redirect_page, urlh = self._download_webpage_handle(url, video_id)
922 new_location = self._search_regex(r'window\.location = \'(.*)\';', redirect_page, u'redirect location')
923 redirect_url = urlh.geturl() + new_location
924 webpage = self._download_webpage(redirect_url, video_id, u'Downloading redirect page')
# Only the part of <title> before the first '/' is kept.
926 title = self._html_search_regex(r'<title>(.*)</title>',
927 webpage, u'title').split('/')[0].strip()
# The id is sent form-encoded as 'vid' (with 'as3=1') to the info endpoint.
930 info_url = "http://vbox7.com/play/magare.do"
931 data = compat_urllib_parse.urlencode({'as3':'1','vid':video_id})
932 info_request = compat_urllib_request.Request(info_url, data)
933 info_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
934 info_response = self._download_webpage(info_request, video_id, u'Downloading info webpage')
935 if info_response is None:
936 raise ExtractorError(u'Unable to extract the media url')
# The response is expected to be exactly two '&'-separated 'name=value'
# pairs, in the order media URL, thumbnail URL.
# NOTE(review): split('=')[1] keeps only the text up to a second '=', so a
# value that itself contains '=' would be truncated.
937 (final_url, thumbnail_url) = map(lambda x: x.split('=')[1], info_response.split('&'))
944 'thumbnail': thumbnail_url,
# Factory for the extractor instances; per its docstring the order of the
# list matters because the first matching extractor handles the URL.
# NOTE(review): the embedded line numbers jump from 950 to 978, so the end of
# the docstring and all list entries but one are not visible in this listing.
948 def gen_extractors():
949 """ Return a list of an instance of every supported extractor.
950 The order does matter; the first extractor matched is the one handling the URL.
978 StanfordOpenClassroomIE(),
# Looks the class up by name in this module's globals. ie_name is given
# without the 'IE' suffix (e.g. 'Ustream' resolves to UstreamIE); a name with
# no matching global raises KeyError.
1018 def get_info_extractor(ie_name):
1019 """Returns the info extractor class with the given ie_name"""
1020 return globals()[ie_name+'IE']