import xml.etree.ElementTree

from .extractor.common import InfoExtractor, SearchInfoExtractor
from .extractor.ard import ARDIE
from .extractor.arte import ArteTvIE
from .extractor.bliptv import BlipTVIE, BlipTVUserIE
from .extractor.comedycentral import ComedyCentralIE
from .extractor.collegehumor import CollegeHumorIE
from .extractor.dailymotion import DailymotionIE
from .extractor.depositfiles import DepositFilesIE
from .extractor.escapist import EscapistIE
from .extractor.facebook import FacebookIE
from .extractor.gametrailers import GametrailersIE
from .extractor.generic import GenericIE
from .extractor.googleplus import GooglePlusIE
from .extractor.googlesearch import GoogleSearchIE
from .extractor.infoq import InfoQIE
from .extractor.metacafe import MetacafeIE
from .extractor.mixcloud import MixcloudIE
from .extractor.mtv import MTVIE
from .extractor.myvideo import MyVideoIE
from .extractor.nba import NBAIE
from .extractor.statigram import StatigramIE
from .extractor.photobucket import PhotobucketIE
from .extractor.soundcloud import SoundcloudIE, SoundcloudSetIE
from .extractor.stanfordoc import StanfordOpenClassroomIE
from .extractor.steam import SteamIE
from .extractor.ted import TEDIE
from .extractor.vimeo import VimeoIE
from .extractor.xnxx import XNXXIE
from .extractor.xvideos import XVideosIE
from .extractor.yahoo import YahooIE, YahooSearchIE
from .extractor.youku import YoukuIE
from .extractor.youtube import YoutubeIE, YoutubePlaylistIE, YoutubeSearchIE, YoutubeUserIE, YoutubeChannelIE
from .extractor.zdf import ZDFIE
class JustinTVIE(InfoExtractor):
    """Information extractor for justin.tv and twitch.tv"""
    # TODO: One broadcast may be split into multiple videos. The key
    # 'broadcast_id' is the same for all parts, and 'broadcast_part'
    # starts at 1 and increases. Can we treat all parts as one video?

    _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
        (?:
            (?P<channelid>[^/]+)|
            (?:(?:[^/]+)/b/(?P<videoid>[^/]+))|
            (?:(?:[^/]+)/c/(?P<chapterid>[^/]+))
        )
        /?(?:\#.*)?$
        """
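    # Illustrative URL shapes covered by the pattern above (assumed, not exhaustive):
    #   http://www.twitch.tv/<channel>          -> channelid (paged archive listing)
    #   http://www.twitch.tv/<channel>/b/<id>   -> videoid   (single archived broadcast)
    #   http://www.twitch.tv/<channel>/c/<id>   -> chapterid (chapter of a broadcast)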
    _JUSTIN_PAGE_LIMIT = 100
    IE_NAME = u'justin.tv'
    def report_download_page(self, channel, offset):
        """Report attempt to download a single page of videos."""
        self.to_screen(u'%s: Downloading video information from %d to %d' %
                (channel, offset, offset + self._JUSTIN_PAGE_LIMIT))
    # Return count of items, list of *valid* items
    def _parse_page(self, url, video_id):
        webpage = self._download_webpage(url, video_id,
                                         u'Downloading video info JSON',
                                         u'unable to download video info JSON')

        response = json.loads(webpage)
        if type(response) != list:
            error_text = response.get('error', 'unknown error')
            raise ExtractorError(u'Justin.tv API: %s' % error_text)
        info = []
        for clip in response:
            video_url = clip['video_file_url']
            if video_url:
                video_extension = os.path.splitext(video_url)[1][1:]
                # keep only the date portion of the ISO timestamp, as YYYYMMDD
                video_date = re.sub('-', '', clip['start_time'][:10])
                video_uploader_id = clip.get('user_id', clip.get('channel_id'))
                video_id = clip['id']
                video_title = clip.get('title', video_id)
                info.append({
                    'id': video_id,
                    'url': video_url,
                    'title': video_title,
                    'uploader': clip.get('channel_name', video_uploader_id),
                    'uploader_id': video_uploader_id,
                    'upload_date': video_date,
                    'ext': video_extension,
                })
        return (len(response), info)
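    # Paging sketch (based on the loop in _real_extract below): a channel archive is
    # fetched as <api>?offset=0&limit=100, then offset=100, 200, ... until a page
    # returns fewer than _JUSTIN_PAGE_LIMIT clips.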
    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'invalid URL: %s' % url)

        api_base = 'http://api.justin.tv'
        paged = False
        if mobj.group('channelid'):
            paged = True
            video_id = mobj.group('channelid')
            api = api_base + '/channel/archives/%s.json' % video_id
        elif mobj.group('chapterid'):
            chapter_id = mobj.group('chapterid')

            webpage = self._download_webpage(url, chapter_id)
            m = re.search(r'PP\.archive_id = "([0-9]+)";', webpage)
            if m is None:
                raise ExtractorError(u'Cannot find archive of a chapter')
            archive_id = m.group(1)

            api = api_base + '/broadcast/by_chapter/%s.xml' % chapter_id
            chapter_info_xml = self._download_webpage(api, chapter_id,
                note=u'Downloading chapter information',
                errnote=u'Chapter information download failed')
            doc = xml.etree.ElementTree.fromstring(chapter_info_xml)
            for a in doc.findall('.//archive'):
                if archive_id == a.find('./id').text:
                    break
            else:
                raise ExtractorError(u'Could not find chapter in chapter information')

            video_url = a.find('./video_file_url').text
            video_ext = video_url.rpartition('.')[2] or u'flv'

            chapter_api_url = u'https://api.twitch.tv/kraken/videos/c' + chapter_id
            chapter_info_json = self._download_webpage(chapter_api_url, u'c' + chapter_id,
                note='Downloading chapter metadata',
                errnote='Download of chapter metadata failed')
            chapter_info = json.loads(chapter_info_json)

            bracket_start = int(doc.find('.//bracket_start').text)
            bracket_end = int(doc.find('.//bracket_end').text)

            # TODO determine start (and probably fix up file)
            #  youtube-dl -v http://www.twitch.tv/firmbelief/c/1757457
            #video_url += u'?start=' + TODO:start_timestamp
            # bracket_start is 13290, but we want 51670615
            self._downloader.report_warning(u'Chapter detected, but we can just download the whole file. '
                u'Chapter starts at %s and ends at %s' % (formatSeconds(bracket_start), formatSeconds(bracket_end)))

            info = {
                'id': u'c' + chapter_id,
                'url': video_url,
                'ext': video_ext,
                'title': chapter_info['title'],
                'thumbnail': chapter_info['preview'],
                'description': chapter_info['description'],
                'uploader': chapter_info['channel']['display_name'],
                'uploader_id': chapter_info['channel']['name'],
            }
            return [info]
        else:
            video_id = mobj.group('videoid')
            api = api_base + '/broadcast/by_archive/%s.json' % video_id

        self.report_extraction(video_id)

        info = []
        offset = 0
        limit = self._JUSTIN_PAGE_LIMIT
        while True:
            if paged:
                self.report_download_page(video_id, offset)
            page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
            page_count, page_info = self._parse_page(page_url, video_id)
            info.extend(page_info)
            if not paged or page_count != limit:
                break
            offset += limit
        return info
class FunnyOrDieIE(InfoExtractor):
    _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'invalid URL: %s' % url)

        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)

        video_url = self._html_search_regex(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"',
            webpage, u'video URL', flags=re.DOTALL)

        title = self._html_search_regex((r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>",
            r'<title>(?P<title>[^<]+?)</title>'), webpage, 'title', flags=re.DOTALL)

        video_description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
            webpage, u'description', fatal=False, flags=re.DOTALL)

        info = {
            'id': video_id,
            'url': video_url,
            'title': title,
            'description': video_description,
        }
        return [info]
class UstreamIE(InfoExtractor):
    _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url)
        video_id = m.group('videoID')

        video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
        webpage = self._download_webpage(url, video_id)

        self.report_extraction(video_id)

        video_title = self._html_search_regex(r'data-title="(?P<title>.+)"',
            webpage, u'title')

        uploader = self._html_search_regex(r'data-content-type="channel".*?>(?P<uploader>.*?)</a>',
            webpage, u'uploader', fatal=False, flags=re.DOTALL)

        thumbnail = self._html_search_regex(r'<link rel="image_src" href="(?P<thumb>.*?)"',
            webpage, u'thumbnail', fatal=False)

        info = {
            'id': video_id,
            'url': video_url,
            'title': video_title,
            'uploader': uploader,
            'thumbnail': thumbnail,
        }
        return [info]
class WorldStarHipHopIE(InfoExtractor):
    _VALID_URL = r'https?://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
    IE_NAME = u'WorldStarHipHop'

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url)
        video_id = m.group('id')

        webpage_src = self._download_webpage(url, video_id)

        video_url = self._search_regex(r'so\.addVariable\("file","(.*?)"\)',
            webpage_src, u'video URL')

        if 'mp4' in video_url:
            ext = 'mp4'
        else:
            ext = 'flv'

        video_title = self._html_search_regex(r"<title>(.*)</title>",
            webpage_src, u'title')

        # Get the thumbnail; if there is none, this is a WSHH candy video, so take
        # the title from the candy-specific markup instead.
        thumbnail = self._html_search_regex(r'rel="image_src" href="(.*)" />',
            webpage_src, u'thumbnail', fatal=False)
        if not thumbnail:
            _title = r"""candytitles.*>(.*)</span>"""
            mobj = re.search(_title, webpage_src)
            if mobj is not None:
                video_title = mobj.group(1)

        results = [{
            'id': video_id,
            'url': video_url,
            'title': video_title,
            'thumbnail': thumbnail,
            'ext': ext,
        }]
        return results
class RBMARadioIE(InfoExtractor):
    _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url)
        video_id = m.group('videoID')

        webpage = self._download_webpage(url, video_id)

        json_data = self._search_regex(r'window\.gon.*?gon\.show=(.+?);$',
            webpage, u'json data', flags=re.MULTILINE)

        try:
            data = json.loads(json_data)
        except ValueError as e:
            raise ExtractorError(u'Invalid JSON: ' + str(e))

        video_url = data['akamai_url'] + '&cbr=256'
        url_parts = compat_urllib_parse_urlparse(video_url)
        video_ext = url_parts.path.rpartition('.')[2]

        return [{
            'id': video_id,
            'url': video_url,
            'ext': video_ext,
            'title': data['title'],
            'description': data.get('teaser_text'),
            'location': data.get('country_of_origin'),
            'uploader': data.get('host', {}).get('name'),
            'uploader_id': data.get('host', {}).get('slug'),
            'thumbnail': data.get('image', {}).get('large_url_2x'),
            'duration': data.get('duration'),
        }]
class YouPornIE(InfoExtractor):
    """Information extractor for youporn.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'

    def _print_formats(self, formats):
        """Print all available formats"""
        print(u'Available formats:')
        print(u'ext\t\tformat')
        print(u'---------------------------------')
        for format in formats:
            print(u'%s\t\t%s' % (format['ext'], format['format']))

    def _specific(self, req_format, formats):
        for x in formats:
            if x['format'] == req_format:
                return x
        return None
    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('videoid')

        req = compat_urllib_request.Request(url)
        req.add_header('Cookie', 'age_verified=1')
        webpage = self._download_webpage(req, video_id)

        # Get JSON parameters
        json_params = self._search_regex(r'var currentVideo = new Video\((.*)\);', webpage, u'JSON parameters')
        try:
            params = json.loads(json_params)
        except ValueError:
            raise ExtractorError(u'Invalid JSON')

        self.report_extraction(video_id)
        try:
            video_title = params['title']
            upload_date = unified_strdate(params['release_date_f'])
            video_description = params['description']
            video_uploader = params['submitted_by']
            thumbnail = params['thumbnails'][0]['image']
        except KeyError as err:
            raise ExtractorError(u'Missing JSON parameter: %s' % err)

        # Get all of the formats available
        DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
        download_list_html = self._search_regex(DOWNLOAD_LIST_RE,
            webpage, u'download list').strip()

        # Get all of the links from the page
        LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
        links = re.findall(LINK_RE, download_list_html)
        if not links:
            raise ExtractorError(u'no known formats available for video')

        self.to_screen(u'Links found: %d' % len(links))

        formats = []
        for link in links:
            # A link looks like this:
            # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
            # A path looks like this:
            # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
            video_url = unescapeHTML(link)
            path = compat_urllib_parse_urlparse(video_url).path
            extension = os.path.splitext(path)[1][1:]
            format = path.split('/')[4].split('_')[:2]
            format = "-".join(format)
            # title = u'%s-%s-%s' % (video_title, size, bitrate)
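            # Illustration with the sample path above:
            #   path.split('/')[4]  -> '480p_370k_8004515'
            #   .split('_')[:2]     -> ['480p', '370k']
            #   "-".join(...)       -> '480p-370k' (used as the format label)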
            formats.append({
                'id': video_id,
                'url': video_url,
                'uploader': video_uploader,
                'upload_date': upload_date,
                'title': video_title,
                'ext': extension,
                'format': format,
                'thumbnail': thumbnail,
                'description': video_description
            })
        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        req_format = self._downloader.params.get('format', None)
        self.to_screen(u'Format: %s' % req_format)

        if req_format is None or req_format == 'best':
            return [formats[0]]
        elif req_format == 'worst':
            return [formats[-1]]
        elif req_format in ('-1', 'all'):
            return formats
        else:
            format = self._specific(req_format, formats)
            if format is None:
                raise ExtractorError(u'Requested format not available')
            return [format]
class PornotubeIE(InfoExtractor):
    """Information extractor for pornotube.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('videoid')
        video_title = mobj.group('title')

        # Get webpage content
        webpage = self._download_webpage(url, video_id)

        # Get the video URL
        VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9]\.pornotube\.com/.+\.flv)",'
        video_url = self._search_regex(VIDEO_URL_RE, webpage, u'video url')
        video_url = compat_urllib_parse.unquote(video_url)

        # Get the upload date
        VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
        upload_date = self._html_search_regex(VIDEO_UPLOADED_RE, webpage, u'upload date', fatal=False)
        if upload_date:
            upload_date = unified_strdate(upload_date)

        info = {'id': video_id,
                'url': video_url,
                'upload_date': upload_date,
                'title': video_title,
                'ext': 'flv'}
        return [info]
class YouJizzIE(InfoExtractor):
    """Information extractor for youjizz.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+)\.html$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('videoid')

        # Get webpage content
        webpage = self._download_webpage(url, video_id)

        # Get the video title
        video_title = self._html_search_regex(r'<title>(?P<title>.*)</title>',
            webpage, u'title').strip()

        # Get the embed page
        result = re.search(r'https?://www\.youjizz\.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
        if result is None:
            raise ExtractorError(u'unable to extract embed page')

        embed_page_url = result.group(0).strip()
        video_id = result.group('videoid')

        webpage = self._download_webpage(embed_page_url, video_id)

        # Get the video URL
        video_url = self._search_regex(r'so\.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);',
            webpage, u'video URL')

        info = {'id': video_id,
                'url': video_url,
                'title': video_title,
                'player_url': embed_page_url}
        return [info]
class EightTracksIE(InfoExtractor):
    _VALID_URL = r'https?://8tracks\.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        playlist_id = mobj.group('id')

        webpage = self._download_webpage(url, playlist_id)

        json_like = self._search_regex(r"PAGE.mix = (.*?);\n", webpage, u'trax information', flags=re.DOTALL)
        data = json.loads(json_like)

        session = str(random.randint(0, 1000000000))
        mix_id = data['id']
        track_count = data['tracks_count']
        first_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
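        # API flow (names taken from the URLs used here): the 'play' endpoint returns
        # the opening track, then the 'next' endpoint is polled with the previous
        # track_id until a response reports 'at_last_track'.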
        next_url = first_url
        res = []
        for i in itertools.count():
            api_json = self._download_webpage(next_url, playlist_id,
                note=u'Downloading song information %s/%s' % (str(i+1), track_count),
                errnote=u'Failed to download song information')
            api_data = json.loads(api_json)
            track_data = api_data[u'set']['track']
            info = {
                'id': track_data['id'],
                'url': track_data['track_file_stream_url'],
                'title': track_data['performer'] + u' - ' + track_data['name'],
                'raw_title': track_data['name'],
                'uploader_id': data['user']['login'],
            }
            res.append(info)
            if api_data['set']['at_last_track']:
                break
            next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
        return res
class KeekIE(InfoExtractor):
    _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url)
        video_id = m.group('videoID')

        video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
        thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id
        webpage = self._download_webpage(url, video_id)

        video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
            webpage, u'title')

        uploader = self._html_search_regex(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>',
            webpage, u'uploader', fatal=False)

        info = {
            'id': video_id,
            'url': video_url,
            'title': video_title,
            'thumbnail': thumbnail,
            'uploader': uploader,
        }
        return [info]
class MySpassIE(InfoExtractor):
    _VALID_URL = r'http://www\.myspass\.de/.*'

    def _real_extract(self, url):
        META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'

        # video id is the last path element of the URL
        # usually there is a trailing slash, so also try the second-to-last element
        url_path = compat_urllib_parse_urlparse(url).path
        url_parent_path, video_id = os.path.split(url_path)
        if not video_id:
            _, video_id = os.path.split(url_parent_path)
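        # Example (hypothetical path): '/tvshows/foo/12345/' splits into
        # ('/tvshows/foo/12345', ''), so video_id is empty and the second split
        # yields '12345' from the parent path.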
        # get metadata
        metadata_url = META_DATA_URL_TEMPLATE % video_id
        metadata_text = self._download_webpage(metadata_url, video_id)
        metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))

        # extract values from metadata
        url_flv_el = metadata.find('url_flv')
        if url_flv_el is None:
            raise ExtractorError(u'Unable to extract download url')
        video_url = url_flv_el.text
        extension = os.path.splitext(video_url)[1][1:]
        title_el = metadata.find('title')
        if title_el is None:
            raise ExtractorError(u'Unable to extract title')
        title = title_el.text
        format_id_el = metadata.find('format_id')
        if format_id_el is None:
            format = extension
        else:
            format = format_id_el.text
        description_el = metadata.find('description')
        if description_el is not None:
            description = description_el.text
        else:
            description = None
        imagePreview_el = metadata.find('imagePreview')
        if imagePreview_el is not None:
            thumbnail = imagePreview_el.text
        else:
            thumbnail = None

        info = {
            'id': video_id,
            'url': video_url,
            'title': title,
            'ext': extension,
            'format': format,
            'thumbnail': thumbnail,
            'description': description
        }
        return [info]
class SpiegelIE(InfoExtractor):
    _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url)
        video_id = m.group('videoID')

        webpage = self._download_webpage(url, video_id)

        video_title = self._html_search_regex(r'<div class="module-title">(.*?)</div>',
            webpage, u'title')

        xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
        xml_code = self._download_webpage(xml_url, video_id,
            note=u'Downloading XML', errnote=u'Failed to download XML')

        idoc = xml.etree.ElementTree.fromstring(xml_code)
        last_type = idoc[-1]
        filename = last_type.findall('./filename')[0].text
        duration = float(last_type.findall('./duration')[0].text)

        video_url = 'http://video2.spiegel.de/flash/' + filename
        video_ext = filename.rpartition('.')[2]
        info = {
            'id': video_id,
            'url': video_url,
            'ext': video_ext,
            'title': video_title,
            'duration': duration,
        }
        return [info]
class LiveLeakIE(InfoExtractor):
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
    IE_NAME = u'liveleak'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('video_id')

        webpage = self._download_webpage(url, video_id)

        video_url = self._search_regex(r'file: "(.*?)",',
            webpage, u'video URL')

        video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
            webpage, u'title').replace('LiveLeak.com -', '').strip()

        video_description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
            webpage, u'description', fatal=False)

        video_uploader = self._html_search_regex(r'By:.*?(\w+)</a>',
            webpage, u'uploader', fatal=False)

        info = {
            'id': video_id,
            'url': video_url,
            'title': video_title,
            'description': video_description,
            'uploader': video_uploader,
        }
        return [info]
class TumblrIE(InfoExtractor):
    _VALID_URL = r'http://(?P<blog_name>.*?)\.tumblr\.com/((post)|(video))/(?P<id>\d*)/(.*?)'

    def _real_extract(self, url):
        m_url = re.match(self._VALID_URL, url)
        video_id = m_url.group('id')
        blog = m_url.group('blog_name')

        url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id)
        webpage = self._download_webpage(url, video_id)

        re_video = r'src=\\x22(?P<video_url>http://%s\.tumblr\.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P<ext>.*?)\\x22' % (blog, video_id)
        video = re.search(re_video, webpage)
        if video is None:
            raise ExtractorError(u'Unable to extract video')
        video_url = video.group('video_url')
        ext = video.group('ext')

        video_thumbnail = self._search_regex(r'posters(.*?)\[\\x22(?P<thumb>.*?)\\x22',
            webpage, u'thumbnail', fatal=False)  # We pick the first poster
        if video_thumbnail:
            video_thumbnail = video_thumbnail.replace('\\', '')

        # The <title> tag is the only place to get a title; it may be incomplete,
        # but the other candidate locations are missing for many videos.
        video_title = self._html_search_regex(r'<title>(?P<title>.*?)</title>',
            webpage, u'title', flags=re.DOTALL)

        return [{'id': video_id,
                 'url': video_url,
                 'title': video_title,
                 'thumbnail': video_thumbnail,
                 'ext': ext
                 }]
class BandcampIE(InfoExtractor):
    _VALID_URL = r'http://.*?\.bandcamp\.com/track/(?P<title>.*)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        title = mobj.group('title')
        webpage = self._download_webpage(url, title)
        # We get the link to the free download page
        m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage)
        if m_download is None:
            raise ExtractorError(u'No free songs found')

        download_link = m_download.group(1)
        id = re.search(r'var TralbumData = {(.*?)id: (?P<id>\d*?)$',
            webpage, re.MULTILINE|re.DOTALL).group('id')

        download_webpage = self._download_webpage(download_link, id,
            'Downloading free downloads page')
        # We get the dictionary of the track from some javascript code
        info = re.search(r'items: (.*?),$',
            download_webpage, re.MULTILINE).group(1)
        info = json.loads(info)[0]
        # We pick mp3-320 for now, until format selection can be easily implemented.
        mp3_info = info[u'downloads'][u'mp3-320']
        # If we try to use this url it says the link has expired
        initial_url = mp3_info[u'url']
        re_url = r'(?P<server>http://(.*?)\.bandcamp\.com)/download/track\?enc=mp3-320&fsig=(?P<fsig>.*?)&id=(?P<id>.*?)&ts=(?P<ts>.*)$'
        m_url = re.match(re_url, initial_url)
        # We build the URL we will use to get the final track URL
        # This URL is built by Bandcamp in the script download_bunde_*.js
        request_url = '%s/statdownload/track?enc=mp3-320&fsig=%s&id=%s&ts=%s&.rand=665028774616&.vrs=1' % (m_url.group('server'), m_url.group('fsig'), id, m_url.group('ts'))
        final_url_webpage = self._download_webpage(request_url, id, 'Requesting download url')
        # If we could correctly generate the .rand field the URL would be
        # in the "download_url" key
        final_url = re.search(r'"retry_url":"(.*?)"', final_url_webpage).group(1)

        track_info = {'id': id,
                      'title': info[u'title'],
                      'ext': 'mp3',
                      'url': final_url,
                      'thumbnail': info[u'thumb_url'],
                      'uploader': info[u'artist']
                      }
        return [track_info]
class RedTubeIE(InfoExtractor):
    """Information Extractor for redtube"""
    _VALID_URL = r'(?:http://)?(?:www\.)?redtube\.com/(?P<id>[0-9]+)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')
        video_extension = 'mp4'
        webpage = self._download_webpage(url, video_id)

        self.report_extraction(video_id)

        video_url = self._html_search_regex(r'<source src="(.+?)" type="video/mp4">',
            webpage, u'video URL')

        video_title = self._html_search_regex('<h1 class="videoTitle slidePanelMovable">(.+?)</h1>',
            webpage, u'title')

        return [{
            'id': video_id,
            'url': video_url,
            'ext': video_extension,
            'title': video_title,
        }]
class InaIE(InfoExtractor):
    """Information Extractor for Ina.fr"""
    _VALID_URL = r'(?:http://)?(?:www\.)?ina\.fr/video/(?P<id>I[0-9]+)/.*'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)

        video_id = mobj.group('id')
        mrss_url = 'http://player.ina.fr/notices/%s.mrss' % video_id
        video_extension = 'mp4'
        webpage = self._download_webpage(mrss_url, video_id)

        self.report_extraction(video_id)

        video_url = self._html_search_regex(r'<media:player url="(?P<mp4url>http://mp4\.ina\.fr/[^"]+\.mp4)',
            webpage, u'video URL')

        video_title = self._search_regex(r'<title><!\[CDATA\[(?P<titre>.*?)]]></title>',
            webpage, u'title')

        return [{
            'id': video_id,
            'url': video_url,
            'ext': video_extension,
            'title': video_title,
        }]
class HowcastIE(InfoExtractor):
    """Information Extractor for Howcast.com"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?howcast\.com/videos/(?P<id>\d+)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)

        video_id = mobj.group('id')
        webpage_url = 'http://www.howcast.com/videos/' + video_id
        webpage = self._download_webpage(webpage_url, video_id)

        self.report_extraction(video_id)

        video_url = self._search_regex(r'\'?file\'?: "(http://mobile-media\.howcast\.com/[0-9]+\.mp4)',
            webpage, u'video URL')

        video_title = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') property=\'og:title\'',
            webpage, u'title')

        video_description = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') name=\'description\'',
            webpage, u'description', fatal=False)

        thumbnail = self._html_search_regex(r'<meta content=\'(.+?)\' property=\'og:image\'',
            webpage, u'thumbnail', fatal=False)

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': video_title,
            'description': video_description,
            'thumbnail': thumbnail,
        }]
class VineIE(InfoExtractor):
    """Information Extractor for Vine.co"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?vine\.co/v/(?P<id>\w+)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)

        video_id = mobj.group('id')
        webpage_url = 'https://vine.co/v/' + video_id
        webpage = self._download_webpage(webpage_url, video_id)

        self.report_extraction(video_id)

        video_url = self._html_search_regex(r'<meta property="twitter:player:stream" content="(.+?)"',
            webpage, u'video URL')

        video_title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',
            webpage, u'title')

        thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)(\?.*?)?"',
            webpage, u'thumbnail', fatal=False)

        uploader = self._html_search_regex(r'<div class="user">.*?<h2>(.+?)</h2>',
            webpage, u'uploader', fatal=False, flags=re.DOTALL)

        return [{
            'id': video_id,
            'url': video_url,
            'title': video_title,
            'thumbnail': thumbnail,
            'uploader': uploader,
        }]
class FlickrIE(InfoExtractor):
    """Information Extractor for Flickr videos"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?flickr\.com/photos/(?P<uploader_id>[\w\-_@]+)/(?P<id>\d+).*'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)

        video_id = mobj.group('id')
        video_uploader_id = mobj.group('uploader_id')
        webpage_url = 'http://www.flickr.com/photos/' + video_uploader_id + '/' + video_id
        webpage = self._download_webpage(webpage_url, video_id)

        secret = self._search_regex(r"photo_secret: '(\w+)'", webpage, u'secret')

        first_url = 'https://secure.flickr.com/apps/video/video_mtl_xml.gne?v=x&photo_id=' + video_id + '&secret=' + secret + '&bitrate=700&target=_self'
        first_xml = self._download_webpage(first_url, video_id, 'Downloading first data webpage')

        node_id = self._html_search_regex(r'<Item id="id">(\d+-\d+)</Item>',
            first_xml, u'node_id')

        second_url = 'https://secure.flickr.com/video_playlist.gne?node_id=' + node_id + '&tech=flash&mode=playlist&bitrate=700&secret=' + secret + '&rd=video.yahoo.com&noad=1'
        second_xml = self._download_webpage(second_url, video_id, 'Downloading second data webpage')
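        # Two-step lookup: the first XML response supplies a playlist node id, and the
        # second resolves it to a <STREAM> element whose APP and FULLPATH attributes
        # are concatenated into the final video URL below.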
        self.report_extraction(video_id)

        mobj = re.search(r'<STREAM APP="(.+?)" FULLPATH="(.+?)"', second_xml)
        if mobj is None:
            raise ExtractorError(u'Unable to extract video url')
        video_url = mobj.group(1) + unescapeHTML(mobj.group(2))

        video_title = self._html_search_regex(r'<meta property="og:title" content=(?:"([^"]+)"|\'([^\']+)\')',
            webpage, u'video title')

        video_description = self._html_search_regex(r'<meta property="og:description" content=(?:"([^"]+)"|\'([^\']+)\')',
            webpage, u'description', fatal=False)

        thumbnail = self._html_search_regex(r'<meta property="og:image" content=(?:"([^"]+)"|\'([^\']+)\')',
            webpage, u'thumbnail', fatal=False)

        return [{
            'id': video_id,
            'url': video_url,
            'title': video_title,
            'description': video_description,
            'thumbnail': thumbnail,
            'uploader_id': video_uploader_id,
        }]
class TeamcocoIE(InfoExtractor):
    _VALID_URL = r'http://teamcoco\.com/video/(?P<url_title>.*)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        url_title = mobj.group('url_title')
        webpage = self._download_webpage(url, url_title)

        video_id = self._html_search_regex(r'<article class="video" data-id="(\d+?)"',
            webpage, u'video id')

        self.report_extraction(video_id)

        video_title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',
            webpage, u'title')

        thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)"',
            webpage, u'thumbnail', fatal=False)

        video_description = self._html_search_regex(r'<meta property="og:description" content="(.*?)"',
            webpage, u'description', fatal=False)

        data_url = 'http://teamcoco.com/cvp/2.0/%s.xml' % video_id
        data = self._download_webpage(data_url, video_id, 'Downloading data webpage')

        video_url = self._html_search_regex(r'<file type="high".*?>(.*?)</file>',
            data, u'video URL')

        return [{
            'id': video_id,
            'url': video_url,
            'title': video_title,
            'thumbnail': thumbnail,
            'description': video_description,
        }]
class XHamsterIE(InfoExtractor):
    """Information Extractor for xHamster"""
    _VALID_URL = r'(?:http://)?(?:www\.)?xhamster\.com/movies/(?P<id>[0-9]+)/.*\.html'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)

        video_id = mobj.group('id')
        mrss_url = 'http://xhamster.com/movies/%s/.html' % video_id
        webpage = self._download_webpage(mrss_url, video_id)

        mobj = re.search(r'\'srv\': \'(?P<server>[^\']*)\',\s*\'file\': \'(?P<file>[^\']+)\',', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract media URL')
        if len(mobj.group('server')) == 0:
            video_url = compat_urllib_parse.unquote(mobj.group('file'))
        else:
            video_url = mobj.group('server')+'/key='+mobj.group('file')
        video_extension = video_url.split('.')[-1]

        video_title = self._html_search_regex(r'<title>(?P<title>.+?) - xHamster\.com</title>',
            webpage, u'title')

        # Can't see the description anywhere in the UI
        # video_description = self._html_search_regex(r'<span>Description: </span>(?P<description>[^<]+)',
        #     webpage, u'description', fatal=False)
        # if video_description: video_description = unescapeHTML(video_description)

        mobj = re.search(r'hint=\'(?P<upload_date_Y>[0-9]{4})-(?P<upload_date_m>[0-9]{2})-(?P<upload_date_d>[0-9]{2}) [0-9]{2}:[0-9]{2}:[0-9]{2} [A-Z]{3,4}\'', webpage)
        if mobj:
            video_upload_date = mobj.group('upload_date_Y')+mobj.group('upload_date_m')+mobj.group('upload_date_d')
        else:
            video_upload_date = None
            self._downloader.report_warning(u'Unable to extract upload date')

        video_uploader_id = self._html_search_regex(r'<a href=\'/user/[^>]+>(?P<uploader_id>[^<]+)',
            webpage, u'uploader id', default=u'anonymous')

        video_thumbnail = self._search_regex(r'\'image\':\'(?P<thumbnail>[^\']+)\'',
            webpage, u'thumbnail', fatal=False)

        return [{
            'id': video_id,
            'url': video_url,
            'ext': video_extension,
            'title': video_title,
            # 'description': video_description,
            'upload_date': video_upload_date,
            'uploader_id': video_uploader_id,
            'thumbnail': video_thumbnail,
        }]
class HypemIE(InfoExtractor):
    """Information Extractor for hypem"""
    _VALID_URL = r'(?:http://)?(?:www\.)?hypem\.com/track/([^/]+)/([^/]+)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        track_id = mobj.group(1)

        data = {'ax': 1, 'ts': time.time()}
        data_encoded = compat_urllib_parse.urlencode(data)
        complete_url = url + "?" + data_encoded
        request = compat_urllib_request.Request(complete_url)
        response, urlh = self._download_webpage_handle(request, track_id, u'Downloading webpage with the URL')
        cookie = urlh.headers.get('Set-Cookie', '')
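        # The Set-Cookie value captured above is replayed on the serve/source request
        # below; the API appears to require it before handing out the track URL.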
        self.report_extraction(track_id)

        html_tracks = self._html_search_regex(r'<script type="application/json" id="displayList-data">(.*?)</script>',
            response, u'tracks', flags=re.MULTILINE|re.DOTALL).strip()
        try:
            track_list = json.loads(html_tracks)
            track = track_list[u'tracks'][0]
        except ValueError:
            raise ExtractorError(u'Hypemachine contained invalid JSON.')

        track_id = track[u"id"]
        artist = track[u"artist"]
        title = track[u"song"]
        key = track[u"key"]

        serve_url = "http://hypem.com/serve/source/%s/%s" % (compat_str(track_id), compat_str(key))
        request = compat_urllib_request.Request(serve_url, "", {'Content-Type': 'application/json'})
        request.add_header('cookie', cookie)
        song_data_json = self._download_webpage(request, track_id, u'Downloading metadata')
        try:
            song_data = json.loads(song_data_json)
        except ValueError:
            raise ExtractorError(u'Hypemachine contained invalid JSON.')
        final_url = song_data[u"url"]

        return [{
            'id': track_id,
            'url': final_url,
            'title': title,
            'artist': artist,
        }]
class Vbox7IE(InfoExtractor):
    """Information Extractor for Vbox7"""
    _VALID_URL = r'(?:http://)?(?:www\.)?vbox7\.com/play:([^/]+)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group(1)

        redirect_page, urlh = self._download_webpage_handle(url, video_id)
        new_location = self._search_regex(r'window\.location = \'(.*)\';', redirect_page, u'redirect location')
        redirect_url = urlh.geturl() + new_location
        webpage = self._download_webpage(redirect_url, video_id, u'Downloading redirect page')

        title = self._html_search_regex(r'<title>(.*)</title>',
            webpage, u'title').split('/')[0].strip()

        info_url = "http://vbox7.com/play/magare.do"
        data = compat_urllib_parse.urlencode({'as3': '1', 'vid': video_id})
        info_request = compat_urllib_request.Request(info_url, data)
        info_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
        info_response = self._download_webpage(info_request, video_id, u'Downloading info webpage')
        if info_response is None:
            raise ExtractorError(u'Unable to extract the media url')
        (final_url, thumbnail_url) = map(lambda x: x.split('=')[1], info_response.split('&'))
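        # info_response is a short query string, assumed to look like
        # 'final_url=...&thumbnail_url=...', so splitting on '&' and '=' yields both URLs.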
        return [{
            'id': video_id,
            'url': final_url,
            'title': title,
            'thumbnail': thumbnail_url,
        }]
def gen_extractors():
    """ Return a list containing an instance of every supported extractor.
        The order does matter; the first extractor matched is the one handling the URL.
    """
    return [
        YoutubePlaylistIE(),
        StanfordOpenClassroomIE(),
        WorldStarHipHopIE(),


def get_info_extractor(ie_name):
    """Returns the info extractor class with the given ie_name"""
    return globals()[ie_name + 'IE']
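# Example: get_info_extractor(u'Youtube') returns the YoutubeIE class itself (not an
# instance), while gen_extractors() returns ready-made instances in matching order.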