10 import xml.etree.ElementTree
19 from .extractor.common import InfoExtractor, SearchInfoExtractor
21 from .extractor.ard import ARDIE
22 from .extractor.arte import ArteTvIE
23 from .extractor.bliptv import BlipTVIE, BlipTVUserIE
24 from .extractor.comedycentral import ComedyCentralIE
25 from .extractor.collegehumor import CollegeHumorIE
26 from .extractor.dailymotion import DailymotionIE
27 from .extractor.depositfiles import DepositFilesIE
28 from .extractor.escapist import EscapistIE
29 from .extractor.facebook import FacebookIE
30 from .extractor.gametrailers import GametrailersIE
31 from .extractor.generic import GenericIE
32 from .extractor.googleplus import GooglePlusIE
33 from .extractor.googlesearch import GoogleSearchIE
34 from .extractor.infoq import InfoQIE
35 from .extractor.metacafe import MetacafeIE
36 from .extractor.mixcloud import MixcloudIE
37 from .extractor.mtv import MTVIE
38 from .extractor.myvideo import MyVideoIE
39 from .extractor.nba import NBAIE
40 from .extractor.statigram import StatigramIE
41 from .extractor.photobucket import PhotobucketIE
42 from .extractor.soundcloud import SoundcloudIE, SoundcloudSetIE
43 from .extractor.stanfordoc import StanfordOpenClassroomIE
44 from .extractor.ted import TEDIE
45 from .extractor.vimeo import VimeoIE
46 from .extractor.xvideos import XVideosIE
47 from .extractor.yahoo import YahooIE, YahooSearchIE
48 from .extractor.youku import YoukuIE
49 from .extractor.youtube import YoutubeIE, YoutubePlaylistIE, YoutubeSearchIE, YoutubeUserIE, YoutubeChannelIE
50 from .extractor.zdf import ZDFIE
# Extractor for video.xnxx.com pages. It downloads the page and pulls the
# stream URL, title and thumbnail out of it with the class-level regexes below.
# NOTE(review): some statements of this class (e.g. the condition guarding the
# raise and the opening of the result dict) are not visible in this listing.
60 class XNXXIE(InfoExtractor):
61 """Information extractor for xnxx.com"""
# Group 1 captures the numeric video id; group 2 is the rest of the path.
63 _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
# Patterns applied to the downloaded page; each has a single capture group.
65 VIDEO_URL_RE = r'flv_url=(.*?)&'
# NOTE(review): the '.' in 'XNXX.COM' is unescaped, so it matches any character.
66 VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
67 VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&'
# Match the URL, download the page and extract URL, title and thumbnail.
69 def _real_extract(self, url):
70 mobj = re.match(self._VALID_URL, url)
# presumably only reached when the match failed; the guard is not visible here
72 raise ExtractorError(u'Invalid URL: %s' % url)
73 video_id = mobj.group(1)
76 webpage = self._download_webpage(url, video_id)
78 video_url = self._search_regex(self.VIDEO_URL_RE,
79 webpage, u'video URL')
# The captured value goes through unquote() (presumably percent-decoding).
80 video_url = compat_urllib_parse.unquote(video_url)
82 video_title = self._html_search_regex(self.VIDEO_TITLE_RE,
85 video_thumbnail = self._search_regex(self.VIDEO_THUMB_RE,
86 webpage, u'thumbnail', fatal=False)
95 'thumbnail': video_thumbnail,
# Extractor for justin.tv / twitch.tv URLs. _real_extract distinguishes three
# URL shapes through the named groups of _VALID_URL: a channel, a single
# archived broadcast ("/b/<videoid>") and a chapter ("/c/<chapterid>").
# NOTE(review): several statements (guards, list initialisation, returns) are
# not visible in this listing; comments describe the visible lines only.
102 class JustinTVIE(InfoExtractor):
103 """Information extractor for justin.tv and twitch.tv"""
104 # TODO: One broadcast may be split into multiple videos. The key
105 # 'broadcast_id' is the same for all parts, and 'broadcast_part'
106 # starts at 1 and increases. Can we treat all parts as one video?
# Verbose-mode regex ((?x)); the closing of this literal is not visible here.
108 _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
110 (?P<channelid>[^/]+)|
111 (?:(?:[^/]+)/b/(?P<videoid>[^/]+))|
112 (?:(?:[^/]+)/c/(?P<chapterid>[^/]+))
116 _JUSTIN_PAGE_LIMIT = 100
# _JUSTIN_PAGE_LIMIT is the page size: it is used in the progress message
# and as the 'limit' query parameter of the paged API requests below.
117 IE_NAME = u'justin.tv'
119 def report_download_page(self, channel, offset):
120 """Report attempt to download a single page of videos."""
121 self.to_screen(u'%s: Downloading video information from %d to %d' %
122 (channel, offset, offset + self._JUSTIN_PAGE_LIMIT))
# Download one API page at `url`, decode it as JSON and build one entry per
# clip. Raises ExtractorError when the decoded response is not a list.
124 # Return count of items, list of *valid* items
125 def _parse_page(self, url, video_id):
126 webpage = self._download_webpage(url, video_id,
127 u'Downloading video info JSON',
128 u'unable to download video info JSON')
130 response = json.loads(webpage)
# A non-list response is treated as an error object carrying an 'error' key.
# NOTE(review): exact-type comparison; isinstance(response, list) is the
# idiomatic check.
131 if type(response) != list:
132 error_text = response.get('error', 'unknown error')
133 raise ExtractorError(u'Justin.tv API: %s' % error_text)
135 for clip in response:
136 video_url = clip['video_file_url']
# Extension = text after the last dot of the URL, without the dot itself.
138 video_extension = os.path.splitext(video_url)[1][1:]
# Keep the first 10 characters of start_time and drop the dashes
# (presumably 'YYYY-MM-DD' -> 'YYYYMMDD' -- confirm against the API).
139 video_date = re.sub('-', '', clip['start_time'][:10])
# Falls back to 'channel_id' when the clip has no 'user_id'.
140 video_uploader_id = clip.get('user_id', clip.get('channel_id'))
141 video_id = clip['id']
142 video_title = clip.get('title', video_id)
146 'title': video_title,
147 'uploader': clip.get('channel_name', video_uploader_id),
148 'uploader_id': video_uploader_id,
149 'upload_date': video_date,
150 'ext': video_extension,
# `info` is built on lines that are not visible in this listing.
152 return (len(response), info)
# Pick the API endpoint from the URL shape, then fetch the results page by page.
154 def _real_extract(self, url):
155 mobj = re.match(self._VALID_URL, url)
# presumably only reached when the match failed; the guard is not visible here
157 raise ExtractorError(u'invalid URL: %s' % url)
159 api_base = 'http://api.justin.tv'
# Channel URL: use the channel archives endpoint.
161 if mobj.group('channelid'):
163 video_id = mobj.group('channelid')
164 api = api_base + '/channel/archives/%s.json' % video_id
# Chapter URL: read the archive id from the page, then find that archive in
# the chapter XML to obtain the video file URL.
165 elif mobj.group('chapterid'):
166 chapter_id = mobj.group('chapterid')
168 webpage = self._download_webpage(url, chapter_id)
169 m = re.search(r'PP\.archive_id = "([0-9]+)";', webpage)
171 raise ExtractorError(u'Cannot find archive of a chapter')
172 archive_id = m.group(1)
174 api = api_base + '/broadcast/by_chapter/%s.xml' % chapter_id
175 chapter_info_xml = self._download_webpage(api, chapter_id,
176 note=u'Downloading chapter information',
177 errnote=u'Chapter information download failed')
178 doc = xml.etree.ElementTree.fromstring(chapter_info_xml)
179 for a in doc.findall('.//archive'):
180 if archive_id == a.find('./id').text:
183 raise ExtractorError(u'Could not find chapter in chapter information')
185 video_url = a.find('./video_file_url').text
# NOTE(review): rpartition('.')[2] is the whole URL when it contains no '.',
# so the u'flv' fallback only applies when the URL ends with a dot.
186 video_ext = video_url.rpartition('.')[2] or u'flv'
# Title, thumbnail, description and uploader come from a second (JSON) API.
188 chapter_api_url = u'https://api.twitch.tv/kraken/videos/c' + chapter_id
189 chapter_info_json = self._download_webpage(chapter_api_url, u'c' + chapter_id,
190 note='Downloading chapter metadata',
191 errnote='Download of chapter metadata failed')
192 chapter_info = json.loads(chapter_info_json)
194 bracket_start = int(doc.find('.//bracket_start').text)
195 bracket_end = int(doc.find('.//bracket_end').text)
197 # TODO determine start (and probably fix up file)
198 # youtube-dl -v http://www.twitch.tv/firmbelief/c/1757457
199 #video_url += u'?start=' + TODO:start_timestamp
200 # bracket_start is 13290, but we want 51670615
# The chapter boundaries are only reported to the user, not applied.
201 self._downloader.report_warning(u'Chapter detected, but we can just download the whole file. '
202 u'Chapter starts at %s and ends at %s' % (formatSeconds(bracket_start), formatSeconds(bracket_end)))
205 'id': u'c' + chapter_id,
208 'title': chapter_info['title'],
209 'thumbnail': chapter_info['preview'],
210 'description': chapter_info['description'],
211 'uploader': chapter_info['channel']['display_name'],
212 'uploader_id': chapter_info['channel']['name'],
# Remaining case: a single archived broadcast addressed by its video id.
216 video_id = mobj.group('videoid')
217 api = api_base + '/broadcast/by_archive/%s.json' % video_id
219 self.report_extraction(video_id)
# Paged download: `offset`, `paged` and `info` are initialised on lines that
# are not visible in this listing.
223 limit = self._JUSTIN_PAGE_LIMIT
226 self.report_download_page(video_id, offset)
227 page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
228 page_count, page_info = self._parse_page(page_url, video_id)
229 info.extend(page_info)
# Stop when not paging, or when a page's item count differs from the limit.
230 if not paged or page_count != limit:
# Extractor for funnyordie.com video pages: scrapes the video URL, title and
# description from the page HTML.
# NOTE(review): the guard before the raise and the result dict's other
# entries are not visible in this listing.
235 class FunnyOrDieIE(InfoExtractor):
# The 'id' group is the hexadecimal path segment after /videos/.
236 _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'
238 def _real_extract(self, url):
239 mobj = re.match(self._VALID_URL, url)
241 raise ExtractorError(u'invalid URL: %s' % url)
243 video_id = mobj.group('id')
244 webpage = self._download_webpage(url, video_id)
# The pattern skips the first <source> tag inside <video> and captures the
# src attribute of the second one.
246 video_url = self._html_search_regex(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"',
247 webpage, u'video URL', flags=re.DOTALL)
# Two candidate patterns are passed as a tuple: the player <h1>, then <title>
# (presumably tried in that order by the helper -- confirm).
249 title = self._html_search_regex((r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>",
250 r'<title>(?P<title>[^<]+?)</title>'), webpage, 'title', flags=re.DOTALL)
# The description lookup is passed fatal=False, unlike the URL and title lookups.
252 video_description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
253 webpage, u'description', fatal=False, flags=re.DOTALL)
260 'description': video_description,
# Extractor for store.steampowered.com pages. It loads the game's video page
# (going through the age-check URL when required) and returns the movies
# found there as one playlist.
# _VALID_URL is a multi-line pattern that both suitable() and _real_extract()
# match with re.VERBOSE.
# NOTE(review): parts of _VALID_URL are not visible in this listing, including
# the definition of the 'gameID' group that _real_extract reads.
264 class SteamIE(InfoExtractor):
265 _VALID_URL = r"""http://store\.steampowered\.com/
267 (?P<urltype>video|app)/ #If the page is only for videos or for a game
269 (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
271 _VIDEO_PAGE_TEMPLATE = 'http://store.steampowered.com/video/%s/'
# Same page requested through the age gate with a fixed birth date (1 Jan 1970).
272 _AGECHECK_TEMPLATE = 'http://store.steampowered.com/agecheck/video/%s/?snr=1_agecheck_agecheck__age-gate&ageDay=1&ageMonth=January&ageYear=1970'
# Takes `cls`; the decorator line is not visible here (presumably @classmethod).
275 def suitable(cls, url):
276 """Receives a URL and returns True if suitable for this IE."""
277 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
# Download the video page for the game and build one entry per movie found.
279 def _real_extract(self, url):
280 m = re.match(self._VALID_URL, url, re.VERBOSE)
# NOTE(review): no failed-match guard is visible before this group access.
281 gameID = m.group('gameID')
283 videourl = self._VIDEO_PAGE_TEMPLATE % gameID
284 webpage = self._download_webpage(videourl, gameID)
# If the birth-date prompt was served, fetch the page again via the age-check URL.
286 if re.search('<h2>Please enter your birth date to continue:</h2>', webpage) is not None:
287 videourl = self._AGECHECK_TEMPLATE % gameID
288 self.report_age_confirmation()
289 webpage = self._download_webpage(videourl, gameID)
291 self.report_extraction(gameID)
292 game_title = self._html_search_regex(r'<h2 class="pageheader">(.*?)</h2>',
293 webpage, 'game title')
# Three independent scans of the page: movie id + file URL, display names,
# and thumbnails.
295 urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
296 mweb = re.finditer(urlRE, webpage)
297 namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
298 titles = re.finditer(namesRE, webpage)
299 thumbsRE = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'
300 thumbs = re.finditer(thumbsRE, webpage)
# The scans are paired purely by position; zip() stops at the shortest one,
# so a missing title or thumbnail silently drops or misaligns entries.
302 for vid,vtitle,thumb in zip(mweb,titles,thumbs):
303 video_id = vid.group('videoID')
304 title = vtitle.group('videoName')
305 video_url = vid.group('videoURL')
306 video_thumb = thumb.group('thumbnail')
308 raise ExtractorError(u'Cannot find video url for %s' % video_id)
313 'title': unescapeHTML(title),
314 'thumbnail': video_thumb
# `videos` is accumulated on lines that are not visible in this listing.
317 return [self.playlist_result(videos, gameID, game_title)]
# Extractor for recorded ustream.tv videos. The media URL is built directly
# from the numeric id; the page is downloaded only for title, uploader and
# thumbnail.
319 class UstreamIE(InfoExtractor):
320 _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
323 def _real_extract(self, url):
324 m = re.match(self._VALID_URL, url)
# NOTE(review): no failed-match guard is visible before this group access.
325 video_id = m.group('videoID')
327 video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
328 webpage = self._download_webpage(url, video_id)
330 self.report_extraction(video_id)
# NOTE(review): '.+' is greedy, so the capture extends to the last '"' on the
# same line rather than the attribute's closing quote.
332 video_title = self._html_search_regex(r'data-title="(?P<title>.+)"',
# Uploader and thumbnail lookups are passed fatal=False.
335 uploader = self._html_search_regex(r'data-content-type="channel".*?>(?P<uploader>.*?)</a>',
336 webpage, u'uploader', fatal=False, flags=re.DOTALL)
338 thumbnail = self._html_search_regex(r'<link rel="image_src" href="(?P<thumb>.*?)"',
339 webpage, u'thumbnail', fatal=False)
345 'title': video_title,
346 'uploader': uploader,
347 'thumbnail': thumbnail,
# Extractor for worldstarhiphop.com / worldstarcandy.com video pages (www and
# m subdomains). The media URL comes from the so.addVariable("file", ...)
# call in the page; title and thumbnail are scraped from the HTML.
# NOTE(review): the branch bodies after the 'mp4' test and the condition
# around the candy-title fallback are not visible in this listing.
351 class WorldStarHipHopIE(InfoExtractor):
# The 'id' group takes everything after 'v='.
352 _VALID_URL = r'https?://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
353 IE_NAME = u'WorldStarHipHop'
355 def _real_extract(self, url):
356 m = re.match(self._VALID_URL, url)
357 video_id = m.group('id')
359 webpage_src = self._download_webpage(url, video_id)
361 video_url = self._search_regex(r'so\.addVariable\("file","(.*?)"\)',
362 webpage_src, u'video URL')
# Substring test on the whole URL, not on its file suffix.
364 if 'mp4' in video_url:
369 video_title = self._html_search_regex(r"<title>(.*)</title>",
370 webpage_src, u'title')
372 # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video.
373 thumbnail = self._html_search_regex(r'rel="image_src" href="(.*)" />',
374 webpage_src, u'thumbnail', fatal=False)
# Alternative title source: the text before </span> in a 'candytitles' element.
377 _title = r"""candytitles.*>(.*)</span>"""
378 mobj = re.search(_title, webpage_src)
380 video_title = mobj.group(1)
385 'title' : video_title,
386 'thumbnail' : thumbnail,
# Extractor for rbmaradio.com show pages. Metadata is read from the JSON
# object assigned to `gon.show` in the page's inline script.
391 class RBMARadioIE(InfoExtractor):
# The id is the last path segment; the trailing '$' rejects deeper paths.
392 _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'
394 def _real_extract(self, url):
395 m = re.match(self._VALID_URL, url)
396 video_id = m.group('videoID')
398 webpage = self._download_webpage(url, video_id)
# With re.MULTILINE, '$' anchors at a line end, so the capture runs up to the
# ';' that terminates a line.
400 json_data = self._search_regex(r'window\.gon.*?gon\.show=(.+?);$',
401 webpage, u'json data', flags=re.MULTILINE)
# A JSON decoding failure is re-raised as ExtractorError.
404 data = json.loads(json_data)
405 except ValueError as e:
406 raise ExtractorError(u'Invalid JSON: ' + str(e))
# '&cbr=256' is appended unconditionally to the URL from the JSON
# (presumably a bitrate selector -- confirm).
408 video_url = data['akamai_url'] + '&cbr=256'
409 url_parts = compat_urllib_parse_urlparse(video_url)
# Extension = text after the last '.' in the URL path (query string excluded).
410 video_ext = url_parts.path.rpartition('.')[2]
# Only 'akamai_url' and 'title' are read with [] (KeyError if absent); the
# other fields use .get() and may be None.
# NOTE(review): data.get('host', {}) returns None when the key is present with
# a null value, in which case the chained .get() raises AttributeError.
415 'title': data['title'],
416 'description': data.get('teaser_text'),
417 'location': data.get('country_of_origin'),
418 'uploader': data.get('host', {}).get('name'),
419 'uploader_id': data.get('host', {}).get('slug'),
420 'thumbnail': data.get('image', {}).get('large_url_2x'),
421 'duration': data.get('duration'),
# Extractor for youporn.com watch pages. Metadata comes from the JSON passed
# to `new Video(...)` in the page; the downloadable formats come from the
# links inside the "downloadList" element. The result is then narrowed
# according to the downloader's 'listformats' and 'format' parameters.
# NOTE(review): try/except headers, loop headers and the branch bodies of the
# format selection are not visible in this listing.
426 class YouPornIE(InfoExtractor):
427 """Information extractor for youporn.com."""
428 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'
# Prints a two-column table (ext, format) to stdout via print().
430 def _print_formats(self, formats):
431 """Print all available formats"""
432 print(u'Available formats:')
433 print(u'ext\t\tformat')
434 print(u'---------------------------------')
435 for format in formats:
436 print(u'%s\t\t%s' % (format['ext'], format['format']))
# Compares each entry's "format" value with req_format; the loop header and
# the return statements are not visible in this listing.
438 def _specific(self, req_format, formats):
440 if(x["format"]==req_format):
444 def _real_extract(self, url):
445 mobj = re.match(self._VALID_URL, url)
447 raise ExtractorError(u'Invalid URL: %s' % url)
448 video_id = mobj.group('videoid')
# The page is requested with an 'age_verified=1' cookie.
450 req = compat_urllib_request.Request(url)
451 req.add_header('Cookie', 'age_verified=1')
452 webpage = self._download_webpage(req, video_id)
454 # Get JSON parameters
455 json_params = self._search_regex(r'var currentVideo = new Video\((.*)\);', webpage, u'JSON parameters')
457 params = json.loads(json_params)
459 raise ExtractorError(u'Invalid JSON')
461 self.report_extraction(video_id)
463 video_title = params['title']
464 upload_date = unified_strdate(params['release_date_f'])
465 video_description = params['description']
466 video_uploader = params['submitted_by']
467 thumbnail = params['thumbnails'][0]['image']
# NOTE(review): sys.exc_info()[1] is the exception instance, not a string, so
# this '+' raises TypeError instead of the intended ExtractorError; wrapping
# it in str() would fix the message construction.
469 raise ExtractorError('Missing JSON parameter: ' + sys.exc_info()[1])
471 # Get all of the formats available
472 DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
473 download_list_html = self._search_regex(DOWNLOAD_LIST_RE,
474 webpage, u'download list').strip()
476 # Get all of the links from the page
477 LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
478 links = re.findall(LINK_RE, download_list_html)
480 raise ExtractorError(u'ERROR: no known formats available for video')
482 self.to_screen(u'Links found: %d' % len(links))
487 # A link looks like this:
488 # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
489 # A path looks like this:
490 # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
491 video_url = unescapeHTML( link )
492 path = compat_urllib_parse_urlparse( video_url ).path
493 extension = os.path.splitext( path )[1][1:]
# The format label is the first two '_'-separated tokens of the fifth path
# component (index 4), e.g. ['480p', '370k'] for the sample path above; it
# is joined with '-' below. A path with fewer components raises IndexError.
494 format = path.split('/')[4].split('_')[:2]
497 format = "-".join( format )
498 # title = u'%s-%s-%s' % (video_title, size, bitrate)
503 'uploader': video_uploader,
504 'upload_date': upload_date,
505 'title': video_title,
508 'thumbnail': thumbnail,
509 'description': video_description
# With the 'listformats' option set, the formats table is printed.
512 if self._downloader.params.get('listformats', None):
513 self._print_formats(formats)
# Format selection on the 'format' option: None/'best', 'worst',
# '-1'/'all', otherwise an exact match through _specific().
516 req_format = self._downloader.params.get('format', None)
517 self.to_screen(u'Format: %s' % req_format)
519 if req_format is None or req_format == 'best':
521 elif req_format == 'worst':
523 elif req_format in ('-1', 'all'):
526 format = self._specific( req_format, formats )
528 raise ExtractorError(u'Requested format not available')
# Extractor for pornotube.com media pages. The title is taken from the URL
# itself; the media URL and upload date are scraped from the page.
533 class PornotubeIE(InfoExtractor):
534 """Information extractor for pornotube.com."""
# Optional /c/<channel> prefix, then mandatory /m/<videoid> and /<title>.
535 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'
537 def _real_extract(self, url):
538 mobj = re.match(self._VALID_URL, url)
# presumably only reached when the match failed; the guard is not visible here
540 raise ExtractorError(u'Invalid URL: %s' % url)
542 video_id = mobj.group('videoid')
543 video_title = mobj.group('title')
545 # Get webpage content
546 webpage = self._download_webpage(url, video_id)
# NOTE(review): the dots in 'video[0-9].pornotube.com' are unescaped and so
# match any character.
549 VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
550 video_url = self._search_regex(VIDEO_URL_RE, webpage, u'video url')
551 video_url = compat_urllib_parse.unquote(video_url)
553 #Get the uploaded date
554 VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
555 upload_date = self._html_search_regex(VIDEO_UPLOADED_RE, webpage, u'upload date', fatal=False)
# The date is normalised only when the lookup returned something truthy.
556 if upload_date: upload_date = unified_strdate(upload_date)
558 info = {'id': video_id,
561 'upload_date': upload_date,
562 'title': video_title,
# Extractor for youjizz.com video pages. The title comes from the video
# page; the media URL comes from the embed page that the video page links to.
568 class YouJizzIE(InfoExtractor):
569 """Information extractor for youjizz.com."""
# NOTE(review): the '.' before 'html' is unescaped and matches any character.
570 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'
572 def _real_extract(self, url):
573 mobj = re.match(self._VALID_URL, url)
# presumably only reached when the match failed; the guard is not visible here
575 raise ExtractorError(u'Invalid URL: %s' % url)
577 video_id = mobj.group('videoid')
579 # Get webpage content
580 webpage = self._download_webpage(url, video_id)
582 # Get the video title
583 video_title = self._html_search_regex(r'<title>(?P<title>.*)</title>',
584 webpage, u'title').strip()
# Locate the embed-page URL in the page (dots in the host are unescaped here too).
587 result = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
589 raise ExtractorError(u'ERROR: unable to extract embed page')
591 embed_page_url = result.group(0).strip()
# video_id is replaced by the numeric id taken from the embed URL.
592 video_id = result.group('videoid')
# `webpage` now holds the embed page, which is searched for the media URL.
594 webpage = self._download_webpage(embed_page_url, video_id)
597 video_url = self._search_regex(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);',
598 webpage, u'video URL')
600 info = {'id': video_id,
602 'title': video_title,
605 'player_url': embed_page_url}
# Extractor for 8tracks.com mixes. The mix description is read from the
# `PAGE.mix = ...;` assignment in the page; tracks are then fetched one at a
# time from the play/next JSON API under a random session number.
# NOTE(review): the assignments of `mix_id` and of the first `next_url`, and
# the loop's exit statement, are not visible in this listing.
609 class EightTracksIE(InfoExtractor):
# NOTE(review): the '.' in '8tracks.com' is unescaped and matches any character.
611 _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'
613 def _real_extract(self, url):
614 mobj = re.match(self._VALID_URL, url)
616 raise ExtractorError(u'Invalid URL: %s' % url)
617 playlist_id = mobj.group('id')
619 webpage = self._download_webpage(url, playlist_id)
# The captured text is parsed with json.loads, so it must be valid JSON.
621 json_like = self._search_regex(r"PAGE.mix = (.*?);\n", webpage, u'trax information', flags=re.DOTALL)
622 data = json.loads(json_like)
# Random session number sent in the API URLs below.
624 session = str(random.randint(0, 1000000000))
626 track_count = data['tracks_count']
627 first_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
# Unbounded counter; the loop is driven by the API's 'at_last_track' flag.
630 for i in itertools.count():
631 api_json = self._download_webpage(next_url, playlist_id,
632 note=u'Downloading song information %s/%s' % (str(i+1), track_count),
633 errnote=u'Failed to download song information')
634 api_data = json.loads(api_json)
635 track_data = api_data[u'set']['track']
637 'id': track_data['id'],
638 'url': track_data['track_file_stream_url'],
639 'title': track_data['performer'] + u' - ' + track_data['name'],
640 'raw_title': track_data['name'],
641 'uploader_id': data['user']['login'],
645 if api_data['set']['at_last_track']:
# The next request carries the id of the track that was just returned.
647 next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
# Extractor for keek.com. Video and thumbnail URLs are built directly from
# the id on the CDN host; the page is downloaded only for title and uploader.
650 class KeekIE(InfoExtractor):
# Accepts both '/!<id>' and '/<user>/keeks/<id>' URL forms.
651 _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
654 def _real_extract(self, url):
655 m = re.match(self._VALID_URL, url)
# NOTE(review): no failed-match guard is visible before this group access.
656 video_id = m.group('videoID')
658 video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
659 thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id
660 webpage = self._download_webpage(url, video_id)
662 video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
# [\S\s]+? matches across line breaks without needing re.DOTALL.
665 uploader = self._html_search_regex(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>',
666 webpage, u'uploader', fatal=False)
672 'title': video_title,
673 'thumbnail': thumbnail,
# Extractor for myspass.de. The video id is derived from the URL path and
# used to fetch an XML metadata document, from which the download URL,
# title, format id, description and preview image are read.
# NOTE(review): several conditions and else-branches are not visible in this
# listing.
679 class MySpassIE(InfoExtractor):
# Matches any URL under www.myspass.de (the dots are unescaped).
680 _VALID_URL = r'http://www.myspass.de/.*'
682 def _real_extract(self, url):
683 META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'
685 # video id is the last path element of the URL
686 # usually there is a trailing slash, so also try the second but last
687 url_path = compat_urllib_parse_urlparse(url).path
688 url_parent_path, video_id = os.path.split(url_path)
# Fallback to the parent's last element; its guarding condition is not visible here.
690 _, video_id = os.path.split(url_parent_path)
693 metadata_url = META_DATA_URL_TEMPLATE % video_id
694 metadata_text = self._download_webpage(metadata_url, video_id)
# The downloaded text is encoded to UTF-8 bytes before XML parsing.
695 metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))
697 # extract values from metadata
# A missing <url_flv> element aborts extraction; the same message pattern is
# used for <title> below.
698 url_flv_el = metadata.find('url_flv')
699 if url_flv_el is None:
700 raise ExtractorError(u'Unable to extract download url')
701 video_url = url_flv_el.text
702 extension = os.path.splitext(video_url)[1][1:]
703 title_el = metadata.find('title')
705 raise ExtractorError(u'Unable to extract title')
706 title = title_el.text
707 format_id_el = metadata.find('format_id')
708 if format_id_el is None:
711 format = format_id_el.text
# <description> and <imagePreview> are read only when present.
712 description_el = metadata.find('description')
713 if description_el is not None:
714 description = description_el.text
717 imagePreview_el = metadata.find('imagePreview')
718 if imagePreview_el is not None:
719 thumbnail = imagePreview_el.text
728 'thumbnail': thumbnail,
729 'description': description
# Extractor for spiegel.de video pages. The title is scraped from the HTML
# page; file name and duration come from a per-video XML document on
# video2.spiegel.de.
# NOTE(review): the assignment of `last_type` is not visible in this listing.
733 class SpiegelIE(InfoExtractor):
# The id is the trailing run of digits after the last '-' in the slug.
734 _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'
736 def _real_extract(self, url):
737 m = re.match(self._VALID_URL, url)
738 video_id = m.group('videoID')
740 webpage = self._download_webpage(url, video_id)
742 video_title = self._html_search_regex(r'<div class="module-title">(.*?)</div>',
745 xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
746 xml_code = self._download_webpage(xml_url, video_id,
747 note=u'Downloading XML', errnote=u'Failed to download XML')
749 idoc = xml.etree.ElementTree.fromstring(xml_code)
# The first <filename> and <duration> children of last_type are used;
# findall(...)[0] raises IndexError if either is absent.
751 filename = last_type.findall('./filename')[0].text
752 duration = float(last_type.findall('./duration')[0].text)
754 video_url = 'http://video2.spiegel.de/flash/' + filename
# Extension = text after the last '.' of the file name.
755 video_ext = filename.rpartition('.')[2]
760 'title': video_title,
761 'duration': duration,
# Extractor for liveleak.com view pages: scrapes the media URL, title,
# description and uploader from the page.
765 class LiveLeakIE(InfoExtractor):
# NOTE(review): 'http?' makes only the final 'p' optional (it matches 'htt' or
# 'http'), so 'https://' URLs are rejected by the leading '^'. 'https?' was
# most likely intended.
767 _VALID_URL = r'^(?:http?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
768 IE_NAME = u'liveleak'
770 def _real_extract(self, url):
771 mobj = re.match(self._VALID_URL, url)
# presumably only reached when the match failed; the guard is not visible here
773 raise ExtractorError(u'Invalid URL: %s' % url)
775 video_id = mobj.group('video_id')
777 webpage = self._download_webpage(url, video_id)
779 video_url = self._search_regex(r'file: "(.*?)",',
780 webpage, u'video URL')
# The site prefix 'LiveLeak.com -' is removed from the og:title value.
782 video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
783 webpage, u'title').replace('LiveLeak.com -', '').strip()
# Description and uploader lookups are passed fatal=False.
785 video_description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
786 webpage, u'description', fatal=False)
788 video_uploader = self._html_search_regex(r'By:.*?(\w+)</a>',
789 webpage, u'uploader', fatal=False)
795 'title': video_title,
796 'description': video_description,
797 'uploader': video_uploader
# Extractor for tumblr.com post/video URLs. The page is always fetched via
# its canonical '/post/<id>/' form; the video URL and poster are found in
# JavaScript where quotes appear as the literal text '\x22'.
804 class TumblrIE(InfoExtractor):
# NOTE(review): '\d*' also matches an empty id.
805 _VALID_URL = r'http://(?P<blog_name>.*?)\.tumblr\.com/((post)|(video))/(?P<id>\d*)/(.*?)'
807 def _real_extract(self, url):
808 m_url = re.match(self._VALID_URL, url)
809 video_id = m_url.group('id')
810 blog = m_url.group('blog_name')
# Rebuild the URL in '/post/' form regardless of the form that was given.
812 url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id)
813 webpage = self._download_webpage(url, video_id)
# NOTE(review): blog and video_id are interpolated into the pattern without
# re.escape(), so regex metacharacters in the blog name would be interpreted.
815 re_video = r'src=\\x22(?P<video_url>http://%s\.tumblr\.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P<ext>.*?)\\x22' % (blog, video_id)
816 video = re.search(re_video, webpage)
# presumably only reached when the search failed; the guard is not visible here
818 raise ExtractorError(u'Unable to extract video')
819 video_url = video.group('video_url')
820 ext = video.group('ext')
822 video_thumbnail = self._search_regex(r'posters(.*?)\[\\x22(?P<thumb>.*?)\\x22',
823 webpage, u'thumbnail', fatal=False) # We pick the first poster
# Backslashes are removed from the captured poster URL.
824 if video_thumbnail: video_thumbnail = video_thumbnail.replace('\\', '')
826 # The only place where you can get a title, it's not complete,
827 # but searching in other places doesn't work for all videos
828 video_title = self._html_search_regex(r'<title>(?P<title>.*?)</title>',
829 webpage, u'title', flags=re.DOTALL)
831 return [{'id': video_id,
833 'title': video_title,
834 'thumbnail': video_thumbnail,
# Extractor for bandcamp.com track pages that offer a free download. It
# follows the page's "freeDownloadPage" link, reads the track's download
# description from that page, and asks the 'statdownload' endpoint for the
# final mp3-320 URL.
838 class BandcampIE(InfoExtractor):
# The 'title' group is the track slug; it also serves as the id for the first download.
839 _VALID_URL = r'http://.*?\.bandcamp\.com/track/(?P<title>.*)'
841 def _real_extract(self, url):
842 mobj = re.match(self._VALID_URL, url)
843 title = mobj.group('title')
844 webpage = self._download_webpage(url, title)
845 # We get the link to the free download page
846 m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage)
847 if m_download is None:
848 raise ExtractorError(u'No free songs found')
850 download_link = m_download.group(1)
# NOTE(review): `id` shadows the builtin, and re.search() returns None when
# the pattern is absent, so .group('id') would raise AttributeError rather
# than ExtractorError. The same applies to the unchecked searches below.
851 id = re.search(r'var TralbumData = {(.*?)id: (?P<id>\d*?)$',
852 webpage, re.MULTILINE|re.DOTALL).group('id')
854 download_webpage = self._download_webpage(download_link, id,
855 'Downloading free downloads page')
856 # We get the dictionary of the track from some javascrip code
857 info = re.search(r'items: (.*?),$',
858 download_webpage, re.MULTILINE).group(1)
# The captured text is a JSON list; its first element describes the track.
859 info = json.loads(info)[0]
860 # We pick mp3-320 for now, until format selection can be easily implemented.
861 mp3_info = info[u'downloads'][u'mp3-320']
862 # If we try to use this url it says the link has expired
863 initial_url = mp3_info[u'url']
# Split the initial URL into server, fsig and ts so it can be rebuilt below.
864 re_url = r'(?P<server>http://(.*?)\.bandcamp\.com)/download/track\?enc=mp3-320&fsig=(?P<fsig>.*?)&id=(?P<id>.*?)&ts=(?P<ts>.*)$'
865 m_url = re.match(re_url, initial_url)
866 #We build the url we will use to get the final track url
867 # This url is build in Bandcamp in the script download_bunde_*.js
# The '.rand' value is a fixed constant here, not a generated one.
868 request_url = '%s/statdownload/track?enc=mp3-320&fsig=%s&id=%s&ts=%s&.rand=665028774616&.vrs=1' % (m_url.group('server'), m_url.group('fsig'), id, m_url.group('ts'))
869 final_url_webpage = self._download_webpage(request_url, id, 'Requesting download url')
870 # If we could correctly generate the .rand field the url would be
871 #in the "download_url" key
872 final_url = re.search(r'"retry_url":"(.*?)"', final_url_webpage).group(1)
874 track_info = {'id':id,
875 'title' : info[u'title'],
878 'thumbnail' : info[u'thumb_url'],
879 'uploader' : info[u'artist']
# Extractor for redtube.com: scrapes the mp4 <source> URL and the title from
# the video page. The extension is fixed to 'mp4'.
884 class RedTubeIE(InfoExtractor):
885 """Information Extractor for redtube"""
# The id is the leading run of digits in the path.
886 _VALID_URL = r'(?:http://)?(?:www\.)?redtube\.com/(?P<id>[0-9]+)'
888 def _real_extract(self,url):
889 mobj = re.match(self._VALID_URL, url)
# presumably only reached when the match failed; the guard is not visible here
891 raise ExtractorError(u'Invalid URL: %s' % url)
893 video_id = mobj.group('id')
894 video_extension = 'mp4'
895 webpage = self._download_webpage(url, video_id)
897 self.report_extraction(video_id)
899 video_url = self._html_search_regex(r'<source src="(.+?)" type="video/mp4">',
900 webpage, u'video URL')
# Unlike the other patterns in this class, this one is not a raw string
# (it contains no backslashes, so the value is the same).
902 video_title = self._html_search_regex('<h1 class="videoTitle slidePanelMovable">(.+?)</h1>',
908 'ext': video_extension,
909 'title': video_title,
# Extractor for ina.fr video pages. It does not download the HTML page: it
# fetches the MRSS notice for the id from player.ina.fr and reads the mp4
# URL and title from that document.
912 class InaIE(InfoExtractor):
913 """Information Extractor for Ina.fr"""
# Ids have the form 'I' followed by digits.
914 _VALID_URL = r'(?:http://)?(?:www\.)?ina\.fr/video/(?P<id>I[0-9]+)/.*'
916 def _real_extract(self,url):
917 mobj = re.match(self._VALID_URL, url)
# NOTE(review): no failed-match guard is visible before this group access.
919 video_id = mobj.group('id')
920 mrss_url='http://player.ina.fr/notices/%s.mrss' % video_id
921 video_extension = 'mp4'
922 webpage = self._download_webpage(mrss_url, video_id)
924 self.report_extraction(video_id)
# NOTE(review): the dots in 'mp4.ina.fr' are unescaped and match any character.
926 video_url = self._html_search_regex(r'<media:player url="(?P<mp4url>http://mp4.ina.fr/[^"]+\.mp4)',
927 webpage, u'video URL')
# The title is read from inside a CDATA section.
929 video_title = self._search_regex(r'<title><!\[CDATA\[(?P<titre>.*?)]]></title>',
935 'ext': video_extension,
936 'title': video_title,
# Extractor for howcast.com videos. The page is re-requested under a
# canonical URL built from the id; the media URL, title, description and
# thumbnail are scraped from it.
939 class HowcastIE(InfoExtractor):
940 """Information Extractor for Howcast.com"""
941 _VALID_URL = r'(?:https?://)?(?:www\.)?howcast\.com/videos/(?P<id>\d+)'
943 def _real_extract(self, url):
944 mobj = re.match(self._VALID_URL, url)
# NOTE(review): no failed-match guard is visible before this group access.
946 video_id = mobj.group('id')
947 webpage_url = 'http://www.howcast.com/videos/' + video_id
948 webpage = self._download_webpage(webpage_url, video_id)
950 self.report_extraction(video_id)
# The 'file' key may or may not be quoted; only mobile-media .mp4 URLs match.
952 video_url = self._search_regex(r'\'?file\'?: "(http://mobile-media\.howcast\.com/[0-9]+\.mp4)',
953 webpage, u'video URL')
# The content attribute may use double or single quotes, so each pattern has
# two alternative capture groups (which one is returned depends on the
# helper -- confirm).
955 video_title = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') property=\'og:title\'',
958 video_description = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') name=\'description\'',
959 webpage, u'description', fatal=False)
961 thumbnail = self._html_search_regex(r'<meta content=\'(.+?)\' property=\'og:image\'',
962 webpage, u'thumbnail', fatal=False)
968 'title': video_title,
969 'description': video_description,
970 'thumbnail': thumbnail,
# Extractor for vine.co videos. The page is requested over https under a
# canonical URL; the stream URL, title, thumbnail and uploader are read
# from its meta tags and markup.
973 class VineIE(InfoExtractor):
974 """Information Extractor for Vine.co"""
975 _VALID_URL = r'(?:https?://)?(?:www\.)?vine\.co/v/(?P<id>\w+)'
977 def _real_extract(self, url):
978 mobj = re.match(self._VALID_URL, url)
# NOTE(review): no failed-match guard is visible before this group access.
980 video_id = mobj.group('id')
981 webpage_url = 'https://vine.co/v/' + video_id
982 webpage = self._download_webpage(webpage_url, video_id)
984 self.report_extraction(video_id)
986 video_url = self._html_search_regex(r'<meta property="twitter:player:stream" content="(.+?)"',
987 webpage, u'video URL')
989 video_title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',
# The optional second group swallows a query string so that group 1 holds
# the image URL without it.
992 thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)(\?.*?)?"',
993 webpage, u'thumbnail', fatal=False)
995 uploader = self._html_search_regex(r'<div class="user">.*?<h2>(.+?)</h2>',
996 webpage, u'uploader', fatal=False, flags=re.DOTALL)
1002 'title': video_title,
1003 'thumbnail': thumbnail,
1004 'uploader': uploader,
# Extractor for Flickr videos. Resolving the stream takes three requests:
# the photo page (for the secret), a first XML document (for the node id)
# and a playlist XML (for the stream APP and FULLPATH attributes).
1007 class FlickrIE(InfoExtractor):
1008 """Information Extractor for Flickr videos"""
1009 _VALID_URL = r'(?:https?://)?(?:www\.)?flickr\.com/photos/(?P<uploader_id>[\w\-_@]+)/(?P<id>\d+).*'
1011 def _real_extract(self, url):
1012 mobj = re.match(self._VALID_URL, url)
# NOTE(review): no failed-match guard is visible before this group access.
1014 video_id = mobj.group('id')
1015 video_uploader_id = mobj.group('uploader_id')
1016 webpage_url = 'http://www.flickr.com/photos/' + video_uploader_id + '/' + video_id
1017 webpage = self._download_webpage(webpage_url, video_id)
# The photo secret from the page is required by both XML requests below.
1019 secret = self._search_regex(r"photo_secret: '(\w+)'", webpage, u'secret')
1021 first_url = 'https://secure.flickr.com/apps/video/video_mtl_xml.gne?v=x&photo_id=' + video_id + '&secret=' + secret + '&bitrate=700&target=_self'
1022 first_xml = self._download_webpage(first_url, video_id, 'Downloading first data webpage')
1024 node_id = self._html_search_regex(r'<Item id="id">(\d+-\d+)</Item>',
1025 first_xml, u'node_id')
1027 second_url = 'https://secure.flickr.com/video_playlist.gne?node_id=' + node_id + '&tech=flash&mode=playlist&bitrate=700&secret=' + secret + '&rd=video.yahoo.com&noad=1'
1028 second_xml = self._download_webpage(second_url, video_id, 'Downloading second data webpage')
1030 self.report_extraction(video_id)
# `mobj` is reused here for the stream match in the playlist XML.
1032 mobj = re.search(r'<STREAM APP="(.+?)" FULLPATH="(.+?)"', second_xml)
# presumably only reached when the search failed; the guard is not visible here
1034 raise ExtractorError(u'Unable to extract video url')
# Final URL = APP value followed by the HTML-unescaped FULLPATH value.
1035 video_url = mobj.group(1) + unescapeHTML(mobj.group(2))
# The meta patterns accept double- or single-quoted content values.
1037 video_title = self._html_search_regex(r'<meta property="og:title" content=(?:"([^"]+)"|\'([^\']+)\')',
1038 webpage, u'video title')
1040 video_description = self._html_search_regex(r'<meta property="og:description" content=(?:"([^"]+)"|\'([^\']+)\')',
1041 webpage, u'description', fatal=False)
1043 thumbnail = self._html_search_regex(r'<meta property="og:image" content=(?:"([^"]+)"|\'([^\']+)\')',
1044 webpage, u'thumbnail', fatal=False)
1050 'title': video_title,
1051 'description': video_description,
1052 'thumbnail': thumbnail,
1053 'uploader_id': video_uploader_id,
# Extractor for teamcoco.com videos. The numeric video id is not in the URL:
# it is read from the page's <article data-id="..."> attribute and then used
# to fetch an XML document that contains the media file URL.
1056 class TeamcocoIE(InfoExtractor):
# 'url_title' is everything after /video/.
1057 _VALID_URL = r'http://teamcoco\.com/video/(?P<url_title>.*)'
1059 def _real_extract(self, url):
1060 mobj = re.match(self._VALID_URL, url)
# presumably only reached when the match failed; the guard is not visible here
1062 raise ExtractorError(u'Invalid URL: %s' % url)
1063 url_title = mobj.group('url_title')
# The URL slug stands in as the id for the first download.
1064 webpage = self._download_webpage(url, url_title)
1066 video_id = self._html_search_regex(r'<article class="video" data-id="(\d+?)"',
1067 webpage, u'video id')
1069 self.report_extraction(video_id)
1071 video_title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',
# Thumbnail and description lookups are passed fatal=False.
1074 thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)"',
1075 webpage, u'thumbnail', fatal=False)
1077 video_description = self._html_search_regex(r'<meta property="og:description" content="(.*?)"',
1078 webpage, u'description', fatal=False)
1080 data_url = 'http://teamcoco.com/cvp/2.0/%s.xml' % video_id
1081 data = self._download_webpage(data_url, video_id, 'Downloading data webpage')
# The media URL is the text of the <file type="high" ...> element.
1083 video_url = self._html_search_regex(r'<file type="high".*?>(.*?)</file>',
1090 'title': video_title,
1091 'thumbnail': thumbnail,
1092 'description': video_description,
class XHamsterIE(InfoExtractor):
    """Information Extractor for xHamster"""
    # The dot after "www" is escaped so that it only matches a literal ".".
    _VALID_URL = r'(?:http://)?(?:www\.)?xhamster\.com/movies/(?P<id>[0-9]+)/.*\.html'

    def _real_extract(self, url):
        """Extract the media URL and metadata for one xHamster movie page.

        Returns a one-element list holding the info dictionary.
        Raises ExtractorError when ``url`` does not match _VALID_URL or when
        the media URL cannot be located in the page.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            # Same failure mode as the other extractors in this file, rather
            # than an AttributeError on the None match object.
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')
        mrss_url = 'http://xhamster.com/movies/%s/.html' % video_id
        webpage = self._download_webpage(mrss_url, video_id)

        mobj = re.search(r'\'srv\': \'(?P<server>[^\']*)\',\s*\'file\': \'(?P<file>[^\']+)\',', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract media URL')
        if not mobj.group('server'):
            # No server given: 'file' is used on its own, URL-unquoted.
            video_url = compat_urllib_parse.unquote(mobj.group('file'))
        else:
            video_url = mobj.group('server') + '/key=' + mobj.group('file')
        video_extension = video_url.split('.')[-1]

        # NOTE(review): the field label u'title' sits on a line missing from
        # the reviewed excerpt; confirm it against the full file.
        video_title = self._html_search_regex(r'<title>(?P<title>.+?) - xHamster\.com</title>',
            webpage, u'title')

        # The description is deliberately not extracted: it does not appear
        # anywhere in the page UI.

        mobj = re.search(r'hint=\'(?P<upload_date_Y>[0-9]{4})-(?P<upload_date_m>[0-9]{2})-(?P<upload_date_d>[0-9]{2}) [0-9]{2}:[0-9]{2}:[0-9]{2} [A-Z]{3,4}\'', webpage)
        if mobj:
            # Concatenate year, month and day into a YYYYMMDD string.
            video_upload_date = mobj.group('upload_date_Y') + mobj.group('upload_date_m') + mobj.group('upload_date_d')
        else:
            video_upload_date = None
            self._downloader.report_warning(u'Unable to extract upload date')

        video_uploader_id = self._html_search_regex(r'<a href=\'/user/[^>]+>(?P<uploader_id>[^<]+)',
            webpage, u'uploader id', default=u'anonymous')

        video_thumbnail = self._search_regex(r'\'image\':\'(?P<thumbnail>[^\']+)\'',
            webpage, u'thumbnail', fatal=False)

        # NOTE(review): the 'id' and 'url' entries are not visible in the
        # reviewed excerpt; confirm them against the full file.
        return [{
            'id': video_id,
            'url': video_url,
            'ext': video_extension,
            'title': video_title,
            'upload_date': video_upload_date,
            'uploader_id': video_uploader_id,
            'thumbnail': video_thumbnail
        }]
class HypemIE(InfoExtractor):
    """Information Extractor for hypem"""
    _VALID_URL = r'(?:http://)?(?:www\.)?hypem\.com/track/([^/]+)/([^/]+)'

    def _real_extract(self, url):
        """Resolve a hypem.com track page to its media URL.

        The track page is requested with extra 'ax' and 'ts' query
        parameters; the JSON embedded in it describes the track, and a second
        request to /serve/source/ (carrying the cookie returned with the
        first response) yields the final URL.

        Raises ExtractorError when ``url`` does not match _VALID_URL or when
        either JSON document cannot be parsed.
        """
        url_match = re.match(self._VALID_URL, url)
        if url_match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        track_id = url_match.group(1)

        query = compat_urllib_parse.urlencode({ 'ax': 1, 'ts': time.time() })
        page_request = compat_urllib_request.Request(url + "?" + query)
        response, urlh = self._download_webpage_handle(
            page_request, track_id, u'Downloading webpage with the url')
        # Kept so it can be sent back with the metadata request below.
        cookie = urlh.headers.get('Set-Cookie', '')

        self.report_extraction(track_id)

        html_tracks = self._html_search_regex(
            r'<script type="application/json" id="displayList-data">(.*?)</script>',
            response, u'tracks', flags=re.MULTILINE|re.DOTALL).strip()
        # NOTE(review): the `except` clauses in this method are not visible in
        # the reviewed excerpt; ValueError is assumed -- confirm.
        try:
            track_list = json.loads(html_tracks)
            track = track_list[u'tracks'][0]
        except ValueError:
            raise ExtractorError(u'Hypemachine contained invalid JSON.')

        # NOTE(review): the assignment of `key` is not visible in the reviewed
        # excerpt; it is assumed to come from the track record -- confirm.
        key = track[u"key"]
        track_id = track[u"id"]
        artist = track[u"artist"]
        title = track[u"song"]

        serve_url = "http://hypem.com/serve/source/%s/%s" % (compat_str(track_id), compat_str(key))
        metadata_request = compat_urllib_request.Request(
            serve_url, "" , {'Content-Type': 'application/json'})
        metadata_request.add_header('cookie', cookie)
        song_data_json = self._download_webpage(
            metadata_request, track_id, u'Downloading metadata')
        try:
            song_data = json.loads(song_data_json)
        except ValueError:
            raise ExtractorError(u'Hypemachine contained invalid JSON.')
        final_url = song_data[u"url"]

        # NOTE(review): the returned dictionary is not visible in the reviewed
        # excerpt; its keys and the "mp3" extension are assumed -- confirm.
        info = {
            'id': track_id,
            'url': final_url,
            'ext': "mp3",
            'title': title,
            'artist': artist,
        }
        return [info]
class Vbox7IE(InfoExtractor):
    """Information Extractor for Vbox7"""
    _VALID_URL = r'(?:http://)?(?:www\.)?vbox7\.com/play:([^/]+)'

    def _real_extract(self, url):
        """Extract the media URL, title and thumbnail of one Vbox7 video.

        The play page answers with a script redirect, which is followed by
        hand; the media and thumbnail URLs come from a form-encoded POST to
        /play/magare.do.

        Returns a one-element list holding the info dictionary.
        Raises ExtractorError when ``url`` does not match _VALID_URL or when
        the info response cannot be split into a media URL and a thumbnail.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group(1)

        # The first page only sets window.location; append that target to the
        # URL the request ended up at and fetch the result.
        redirect_page, urlh = self._download_webpage_handle(url, video_id)
        new_location = self._search_regex(r'window\.location = \'(.*)\';', redirect_page, u'redirect location')
        redirect_url = urlh.geturl() + new_location
        webpage = self._download_webpage(redirect_url, video_id, u'Downloading redirect page')

        # Only the part of <title> before the first '/' is kept.
        title = self._html_search_regex(r'<title>(.*)</title>',
            webpage, u'title').split('/')[0].strip()

        info_url = "http://vbox7.com/play/magare.do"
        data = compat_urllib_parse.urlencode({'as3':'1','vid':video_id})
        info_request = compat_urllib_request.Request(info_url, data)
        info_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
        info_response = self._download_webpage(info_request, video_id, u'Downloading info webpage')
        if info_response is None:
            raise ExtractorError(u'Unable to extract the media url')

        # The response is two '&'-separated name=value fields. Split each
        # field on the first '=' only, so a value that itself contains '='
        # (e.g. a URL with a query string) is kept whole.
        try:
            final_url, thumbnail_url = [
                field.split('=', 1)[1] for field in info_response.split('&')]
        except (IndexError, ValueError):
            # A field without '=' or an unexpected number of fields.
            raise ExtractorError(u'Unable to extract the media url')

        # NOTE(review): the 'id', 'url', 'ext' and 'title' entries are not
        # visible in the reviewed excerpt; 'flv' in particular is assumed --
        # confirm against the full file.
        return [{
            'id': video_id,
            'url': final_url,
            'ext': 'flv',
            'title': title,
            'thumbnail': thumbnail_url,
        }]
1234 def gen_extractors():
1235 """ Return a list of an instance of every supported extractor.
1236 The order does matter; the first extractor matched is the one handling the URL.
1239 YoutubePlaylistIE(),
1264 StanfordOpenClassroomIE(),
1274 WorldStarHipHopIE(),
def get_info_extractor(ie_name):
    """Return the info extractor class for ``ie_name``.

    The class is looked up in this module's global namespace under the name
    ``ie_name + 'IE'``; a KeyError is raised when no such name exists.
    """
    class_name = ie_name + 'IE'
    module_namespace = globals()
    return module_namespace[class_name]