10 import xml.etree.ElementTree
19 from .extractor.common import InfoExtractor, SearchInfoExtractor
21 from .extractor.ard import ARDIE
22 from .extractor.arte import ArteTvIE
23 from .extractor.bliptv import BlipTVIE, BlipTVUserIE
24 from .extractor.comedycentral import ComedyCentralIE
25 from .extractor.collegehumor import CollegeHumorIE
26 from .extractor.dailymotion import DailymotionIE
27 from .extractor.depositfiles import DepositFilesIE
28 from .extractor.escapist import EscapistIE
29 from .extractor.facebook import FacebookIE
30 from .extractor.gametrailers import GametrailersIE
31 from .extractor.generic import GenericIE
32 from .extractor.googleplus import GooglePlusIE
33 from .extractor.googlesearch import GoogleSearchIE
34 from .extractor.infoq import InfoQIE
35 from .extractor.metacafe import MetacafeIE
36 from .extractor.mixcloud import MixcloudIE
37 from .extractor.mtv import MTVIE
38 from .extractor.myvideo import MyVideoIE
39 from .extractor.nba import NBAIE
40 from .extractor.statigram import StatigramIE
41 from .extractor.photobucket import PhotobucketIE
42 from .extractor.soundcloud import SoundcloudIE, SoundcloudSetIE
43 from .extractor.stanfordoc import StanfordOpenClassroomIE
44 from .extractor.steam import SteamIE
45 from .extractor.ted import TEDIE
46 from .extractor.vimeo import VimeoIE
47 from .extractor.worldstarhiphop import WorldStarHipHopIE
48 from .extractor.xnxx import XNXXIE
49 from .extractor.xvideos import XVideosIE
50 from .extractor.yahoo import YahooIE, YahooSearchIE
51 from .extractor.youku import YoukuIE
52 from .extractor.youtube import YoutubeIE, YoutubePlaylistIE, YoutubeSearchIE, YoutubeUserIE, YoutubeChannelIE
53 from .extractor.zdf import ZDFIE
class JustinTVIE(InfoExtractor):
    """Information extractor for justin.tv and twitch.tv"""
    # NOTE(review): this extract is missing several original lines
    # (None-guards, `break`/`else`, the `info`/`return` dict literals and the
    # tail of _VALID_URL) — confirm every gap against the full source file.
    # TODO: One broadcast may be split into multiple videos. The key
    # 'broadcast_id' is the same for all parts, and 'broadcast_part'
    # starts at 1 and increases. Can we treat all parts as one video?
    # Verbose regex: matches /<user>/b/<videoid> and /<user>/c/<chapterid>
    # (closing triple-quote of the raw string is not present in this extract).
    _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
        (?:(?:[^/]+)/b/(?P<videoid>[^/]+))|
        (?:(?:[^/]+)/c/(?P<chapterid>[^/]+))
    _JUSTIN_PAGE_LIMIT = 100  # page size used by the paged archive API
    IE_NAME = u'justin.tv'

    def report_download_page(self, channel, offset):
        """Report attempt to download a single page of videos."""
        self.to_screen(u'%s: Downloading video information from %d to %d' %
                (channel, offset, offset + self._JUSTIN_PAGE_LIMIT))

    # Return count of items, list of *valid* items
    def _parse_page(self, url, video_id):
        """Fetch one JSON page of clips and convert each clip to an info dict."""
        webpage = self._download_webpage(url, video_id,
            u'Downloading video info JSON',
            u'unable to download video info JSON')
        response = json.loads(webpage)
        # The API returns a dict (not a list) on error, with an 'error' key.
        if type(response) != list:
            error_text = response.get('error', 'unknown error')
            raise ExtractorError(u'Justin.tv API: %s' % error_text)
        for clip in response:
            video_url = clip['video_file_url']
            # NOTE(review): an `if video_url:` guard and the `info.append({...})`
            # literal appear to be missing from this extract.
            video_extension = os.path.splitext(video_url)[1][1:]
            # start_time is ISO-like; keep YYYYMMDD by stripping the dashes.
            video_date = re.sub('-', '', clip['start_time'][:10])
            video_uploader_id = clip.get('user_id', clip.get('channel_id'))
            video_id = clip['id']
            video_title = clip.get('title', video_id)
            'title': video_title,
            'uploader': clip.get('channel_name', video_uploader_id),
            'uploader_id': video_uploader_id,
            'upload_date': video_date,
            'ext': video_extension,
        return (len(response), info)

    def _real_extract(self, url):
        """Dispatch on URL type: channel archive, broadcast, or chapter."""
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): the `if mobj is None:` guard line is missing here.
        raise ExtractorError(u'invalid URL: %s' % url)
        api_base = 'http://api.justin.tv'
        if mobj.group('channelid'):
            video_id = mobj.group('channelid')
            api = api_base + '/channel/archives/%s.json' % video_id
        elif mobj.group('chapterid'):
            chapter_id = mobj.group('chapterid')
            webpage = self._download_webpage(url, chapter_id)
            m = re.search(r'PP\.archive_id = "([0-9]+)";', webpage)
            # NOTE(review): the `if m is None:` guard line is missing here.
            raise ExtractorError(u'Cannot find archive of a chapter')
            archive_id = m.group(1)
            api = api_base + '/broadcast/by_chapter/%s.xml' % chapter_id
            chapter_info_xml = self._download_webpage(api, chapter_id,
                note=u'Downloading chapter information',
                errnote=u'Chapter information download failed')
            doc = xml.etree.ElementTree.fromstring(chapter_info_xml)
            # Find the <archive> whose <id> matches the page's archive_id.
            for a in doc.findall('.//archive'):
                if archive_id == a.find('./id').text:
            # NOTE(review): the loop's `break` and the `else:` clause appear
            # to be missing from this extract.
            raise ExtractorError(u'Could not find chapter in chapter information')
            video_url = a.find('./video_file_url').text
            video_ext = video_url.rpartition('.')[2] or u'flv'
            chapter_api_url = u'https://api.twitch.tv/kraken/videos/c' + chapter_id
            chapter_info_json = self._download_webpage(chapter_api_url, u'c' + chapter_id,
                note='Downloading chapter metadata',
                errnote='Download of chapter metadata failed')
            chapter_info = json.loads(chapter_info_json)
            bracket_start = int(doc.find('.//bracket_start').text)
            bracket_end = int(doc.find('.//bracket_end').text)
            # TODO determine start (and probably fix up file)
            # youtube-dl -v http://www.twitch.tv/firmbelief/c/1757457
            #video_url += u'?start=' + TODO:start_timestamp
            # bracket_start is 13290, but we want 51670615
            self._downloader.report_warning(u'Chapter detected, but we can just download the whole file. '
                u'Chapter starts at %s and ends at %s' % (formatSeconds(bracket_start), formatSeconds(bracket_end)))
            # NOTE(review): the `info = {` opening of this dict literal and its
            # closing/`return` are missing from this extract.
            'id': u'c' + chapter_id,
            'title': chapter_info['title'],
            'thumbnail': chapter_info['preview'],
            'description': chapter_info['description'],
            'uploader': chapter_info['channel']['display_name'],
            'uploader_id': chapter_info['channel']['name'],
        # NOTE(review): the `else:` introducing the broadcast branch is missing.
            video_id = mobj.group('videoid')
            api = api_base + '/broadcast/by_archive/%s.json' % video_id
        self.report_extraction(video_id)
        limit = self._JUSTIN_PAGE_LIMIT
        # NOTE(review): the paging loop header and `info = []`/offset setup
        # are missing from this extract.
            self.report_download_page(video_id, offset)
            page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
            page_count, page_info = self._parse_page(page_url, video_id)
            info.extend(page_info)
            # A short page means we reached the end of the archive.
            if not paged or page_count != limit:
class FunnyOrDieIE(InfoExtractor):
    """Information extractor for funnyordie.com video pages."""
    # NOTE(review): this extract is missing lines (the None-guard before the
    # raise, and the return dict around 'description') — confirm against the
    # full source file.
    _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): the `if mobj is None:` guard line is missing here.
        raise ExtractorError(u'invalid URL: %s' % url)
        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)
        video_url = self._html_search_regex(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"',
            webpage, u'video URL', flags=re.DOTALL)
        # Try the player heading first, then fall back to the <title> tag.
        title = self._html_search_regex((r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>",
            r'<title>(?P<title>[^<]+?)</title>'), webpage, 'title', flags=re.DOTALL)
        video_description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
            webpage, u'description', fatal=False, flags=re.DOTALL)
        # NOTE(review): the enclosing info-dict literal is missing here.
        'description': video_description,
class UstreamIE(InfoExtractor):
    """Information extractor for recorded ustream.tv videos."""
    # NOTE(review): lines are missing in this extract (the tail of the title
    # regex call and the return dict opening) — confirm against full source.
    _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url)
        video_id = m.group('videoID')
        # Direct CDN URL built from the recorded-video id.
        video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
        webpage = self._download_webpage(url, video_id)
        self.report_extraction(video_id)
        # NOTE(review): the remaining arguments of this call are missing here.
        video_title = self._html_search_regex(r'data-title="(?P<title>.+)"',
        uploader = self._html_search_regex(r'data-content-type="channel".*?>(?P<uploader>.*?)</a>',
            webpage, u'uploader', fatal=False, flags=re.DOTALL)
        thumbnail = self._html_search_regex(r'<link rel="image_src" href="(?P<thumb>.*?)"',
            webpage, u'thumbnail', fatal=False)
        # NOTE(review): the enclosing info-dict literal is missing here.
        'title': video_title,
        'uploader': uploader,
        'thumbnail': thumbnail,
class RBMARadioIE(InfoExtractor):
    """Information extractor for rbmaradio.com shows."""
    # NOTE(review): this extract is missing the `try:` header before the
    # json.loads call and the return-dict opening — confirm against source.
    _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url)
        video_id = m.group('videoID')
        webpage = self._download_webpage(url, video_id)
        # The show metadata is embedded as JS: `gon.show=<json>;`
        json_data = self._search_regex(r'window\.gon.*?gon\.show=(.+?);$',
            webpage, u'json data', flags=re.MULTILINE)
        # NOTE(review): the `try:` line guarding this parse is missing here.
            data = json.loads(json_data)
        except ValueError as e:
            raise ExtractorError(u'Invalid JSON: ' + str(e))
        # Append a bitrate query parameter to the Akamai stream URL.
        video_url = data['akamai_url'] + '&cbr=256'
        url_parts = compat_urllib_parse_urlparse(video_url)
        video_ext = url_parts.path.rpartition('.')[2]
        # NOTE(review): the enclosing info-dict literal is missing here.
        'title': data['title'],
        'description': data.get('teaser_text'),
        'location': data.get('country_of_origin'),
        'uploader': data.get('host', {}).get('name'),
        'uploader_id': data.get('host', {}).get('slug'),
        'thumbnail': data.get('image', {}).get('large_url_2x'),
        'duration': data.get('duration'),
class YouPornIE(InfoExtractor):
    """Information extractor for youporn.com."""
    # NOTE(review): this extract is missing many lines (guards, `try:`/loop
    # headers, dict literals, format-selection bodies) — confirm each marked
    # gap against the full source file.
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'

    def _print_formats(self, formats):
        """Print all available formats"""
        print(u'Available formats:')
        print(u'ext\t\tformat')
        print(u'---------------------------------')
        for format in formats:
            print(u'%s\t\t%s' % (format['ext'], format['format']))

    def _specific(self, req_format, formats):
        # NOTE(review): the `for x in formats:` header and the return lines
        # are missing from this extract.
            if(x["format"]==req_format):

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): the `if mobj is None:` guard line is missing here.
        raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('videoid')
        # Age gate is bypassed with a cookie on the request.
        req = compat_urllib_request.Request(url)
        req.add_header('Cookie', 'age_verified=1')
        webpage = self._download_webpage(req, video_id)
        # Get JSON parameters
        json_params = self._search_regex(r'var currentVideo = new Video\((.*)\);', webpage, u'JSON parameters')
        # NOTE(review): the `try:` line guarding this parse is missing here.
        params = json.loads(json_params)
        raise ExtractorError(u'Invalid JSON')
        self.report_extraction(video_id)
        # NOTE(review): another `try:` header is missing around this metadata
        # access block (the KeyError handler below belongs to it).
        video_title = params['title']
        upload_date = unified_strdate(params['release_date_f'])
        video_description = params['description']
        video_uploader = params['submitted_by']
        thumbnail = params['thumbnails'][0]['image']
        raise ExtractorError('Missing JSON parameter: ' + sys.exc_info()[1])
        # Get all of the formats available
        DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
        download_list_html = self._search_regex(DOWNLOAD_LIST_RE,
            webpage, u'download list').strip()
        # Get all of the links from the page
        LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
        links = re.findall(LINK_RE, download_list_html)
        # NOTE(review): the `if not links:` guard line is missing here.
        raise ExtractorError(u'ERROR: no known formats available for video')
        self.to_screen(u'Links found: %d' % len(links))
        # NOTE(review): the `for link in links:` loop header is missing here.
        # A link looks like this:
        # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
        # A path looks like this:
        # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
        video_url = unescapeHTML( link )
        path = compat_urllib_parse_urlparse( video_url ).path
        extension = os.path.splitext( path )[1][1:]
        # Path segment 4 encodes resolution and bitrate, e.g. 480p_370k_...
        format = path.split('/')[4].split('_')[:2]
        format = "-".join( format )
        # title = u'%s-%s-%s' % (video_title, size, bitrate)
        # NOTE(review): the `formats.append({` literal opening is missing here.
        'uploader': video_uploader,
        'upload_date': upload_date,
        'title': video_title,
        'thumbnail': thumbnail,
        'description': video_description
        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
        req_format = self._downloader.params.get('format', None)
        self.to_screen(u'Format: %s' % req_format)
        # NOTE(review): the bodies of these branches (returns) are missing.
        if req_format is None or req_format == 'best':
        elif req_format == 'worst':
        elif req_format in ('-1', 'all'):
            format = self._specific( req_format, formats )
        # NOTE(review): the `if result is None:` guard line is missing here.
        raise ExtractorError(u'Requested format not available')
class PornotubeIE(InfoExtractor):
    """Information extractor for pornotube.com."""
    # NOTE(review): this extract is missing lines (None-guard, the tail of the
    # info dict and return) — confirm against the full source file.
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): the `if mobj is None:` guard line is missing here.
        raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('videoid')
        video_title = mobj.group('title')
        # Get webpage content
        webpage = self._download_webpage(url, video_id)
        # The flv URL is percent-encoded inside the player config.
        VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
        video_url = self._search_regex(VIDEO_URL_RE, webpage, u'video url')
        video_url = compat_urllib_parse.unquote(video_url)
        #Get the uploaded date
        VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
        upload_date = self._html_search_regex(VIDEO_UPLOADED_RE, webpage, u'upload date', fatal=False)
        if upload_date: upload_date = unified_strdate(upload_date)
        info = {'id': video_id,
                # NOTE(review): intermediate dict entries are missing here.
                'upload_date': upload_date,
                'title': video_title,
class YouJizzIE(InfoExtractor):
    """Information extractor for youjizz.com."""
    # NOTE(review): this extract is missing lines (None-guards and parts of
    # the info dict) — confirm against the full source file.
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): the `if mobj is None:` guard line is missing here.
        raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('videoid')
        # Get webpage content
        webpage = self._download_webpage(url, video_id)
        # Get the video title
        video_title = self._html_search_regex(r'<title>(?P<title>.*)</title>',
            webpage, u'title').strip()
        # The real media info lives on a separate embed page.
        result = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
        # NOTE(review): the `if result is None:` guard line is missing here.
        raise ExtractorError(u'ERROR: unable to extract embed page')
        embed_page_url = result.group(0).strip()
        video_id = result.group('videoid')
        webpage = self._download_webpage(embed_page_url, video_id)
        video_url = self._search_regex(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);',
            webpage, u'video URL')
        info = {'id': video_id,
                # NOTE(review): the 'url'/'ext' entries appear to be missing.
                'title': video_title,
                'player_url': embed_page_url}
class EightTracksIE(InfoExtractor):
    """Information extractor for 8tracks.com mixes (playlists of songs)."""
    # NOTE(review): this extract is missing lines (None-guard, mix_id/next_url
    # setup, the per-track dict opening, and the final return) — confirm
    # against the full source file.
    _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): the `if mobj is None:` guard line is missing here.
        raise ExtractorError(u'Invalid URL: %s' % url)
        playlist_id = mobj.group('id')
        webpage = self._download_webpage(url, playlist_id)
        json_like = self._search_regex(r"PAGE.mix = (.*?);\n", webpage, u'trax information', flags=re.DOTALL)
        data = json.loads(json_like)
        # Random session id used by the play API.
        session = str(random.randint(0, 1000000000))
        # NOTE(review): the `mix_id = data['id']` line is missing here.
        track_count = data['tracks_count']
        first_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
        # NOTE(review): `next_url = first_url` and the result-list setup are
        # missing from this extract.
        for i in itertools.count():
            api_json = self._download_webpage(next_url, playlist_id,
                note=u'Downloading song information %s/%s' % (str(i+1), track_count),
                errnote=u'Failed to download song information')
            api_data = json.loads(api_json)
            track_data = api_data[u'set']['track']
            # NOTE(review): the `info = {` literal opening is missing here.
            'id': track_data['id'],
            'url': track_data['track_file_stream_url'],
            'title': track_data['performer'] + u' - ' + track_data['name'],
            'raw_title': track_data['name'],
            'uploader_id': data['user']['login'],
            # Stop once the API signals the last track of the set.
            if api_data['set']['at_last_track']:
            next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
class KeekIE(InfoExtractor):
    """Information extractor for keek.com short videos."""
    # NOTE(review): this extract is missing lines (the tail of the title
    # regex call and the return-dict opening) — confirm against full source.
    _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url)
        video_id = m.group('videoID')
        # Media and thumbnail URLs are derived directly from the video id.
        video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
        thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id
        webpage = self._download_webpage(url, video_id)
        # NOTE(review): the remaining arguments of this call are missing here.
        video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
        uploader = self._html_search_regex(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>',
            webpage, u'uploader', fatal=False)
        # NOTE(review): the enclosing info-dict literal is missing here.
        'title': video_title,
        'thumbnail': thumbnail,
class MySpassIE(InfoExtractor):
    """Information extractor for myspass.de (metadata comes from an XML API)."""
    # NOTE(review): this extract is missing lines (the id-fallback guard,
    # a couple of `if ... is None` bodies, and the return dict) — confirm
    # against the full source file.
    _VALID_URL = r'http://www.myspass.de/.*'

    def _real_extract(self, url):
        META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'
        # video id is the last path element of the URL
        # usually there is a trailing slash, so also try the second but last
        url_path = compat_urllib_parse_urlparse(url).path
        url_parent_path, video_id = os.path.split(url_path)
        # NOTE(review): the `if not video_id:` guard line is missing here.
            _, video_id = os.path.split(url_parent_path)
        # get metadata
        metadata_url = META_DATA_URL_TEMPLATE % video_id
        metadata_text = self._download_webpage(metadata_url, video_id)
        metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))
        # extract values from metadata
        url_flv_el = metadata.find('url_flv')
        if url_flv_el is None:
            raise ExtractorError(u'Unable to extract download url')
        video_url = url_flv_el.text
        extension = os.path.splitext(video_url)[1][1:]
        title_el = metadata.find('title')
        # NOTE(review): the `if title_el is None:` guard line is missing here.
        raise ExtractorError(u'Unable to extract title')
        title = title_el.text
        format_id_el = metadata.find('format_id')
        if format_id_el is None:
            # NOTE(review): the fallback assignment (`format = ext`) and the
            # `else:` line are missing from this extract.
        format = format_id_el.text
        description_el = metadata.find('description')
        if description_el is not None:
            description = description_el.text
        # NOTE(review): the `else: description = None` fallback is missing.
        imagePreview_el = metadata.find('imagePreview')
        if imagePreview_el is not None:
            thumbnail = imagePreview_el.text
        # NOTE(review): the enclosing info-dict literal is missing here.
        'thumbnail': thumbnail,
        'description': description
class SpiegelIE(InfoExtractor):
    """Information extractor for spiegel.de videos (metadata via a flash XML)."""
    # NOTE(review): this extract is missing lines (the tail of the title
    # regex call, the `last_type` selection from the XML, and the return
    # dict) — confirm against the full source file.
    _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url)
        video_id = m.group('videoID')
        webpage = self._download_webpage(url, video_id)
        # NOTE(review): the remaining arguments of this call are missing here.
        video_title = self._html_search_regex(r'<div class="module-title">(.*?)</div>',
        xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
        xml_code = self._download_webpage(xml_url, video_id,
            note=u'Downloading XML', errnote=u'Failed to download XML')
        idoc = xml.etree.ElementTree.fromstring(xml_code)
        # NOTE(review): the line selecting `last_type` from idoc is missing.
        filename = last_type.findall('./filename')[0].text
        duration = float(last_type.findall('./duration')[0].text)
        video_url = 'http://video2.spiegel.de/flash/' + filename
        video_ext = filename.rpartition('.')[2]
        # NOTE(review): the enclosing info-dict literal is missing here.
        'title': video_title,
        'duration': duration,
class LiveLeakIE(InfoExtractor):
    """Information extractor for liveleak.com view pages."""
    # NOTE(review): this extract is missing lines (None-guard and the
    # return-dict opening) — confirm against the full source file.
    _VALID_URL = r'^(?:http?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
    IE_NAME = u'liveleak'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): the `if mobj is None:` guard line is missing here.
        raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('video_id')
        webpage = self._download_webpage(url, video_id)
        video_url = self._search_regex(r'file: "(.*?)",',
            webpage, u'video URL')
        # Strip the site-name prefix LiveLeak puts in og:title.
        video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
            webpage, u'title').replace('LiveLeak.com -', '').strip()
        video_description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
            webpage, u'description', fatal=False)
        video_uploader = self._html_search_regex(r'By:.*?(\w+)</a>',
            webpage, u'uploader', fatal=False)
        # NOTE(review): the enclosing info-dict literal is missing here.
        'title': video_title,
        'description': video_description,
        'uploader': video_uploader
class TumblrIE(InfoExtractor):
    """Information extractor for video posts on *.tumblr.com blogs."""
    # NOTE(review): this extract is missing lines (the `if not video:` guard
    # and part of the returned dict) — confirm against the full source file.
    _VALID_URL = r'http://(?P<blog_name>.*?)\.tumblr\.com/((post)|(video))/(?P<id>\d*)/(.*?)'

    def _real_extract(self, url):
        m_url = re.match(self._VALID_URL, url)
        video_id = m_url.group('id')
        blog = m_url.group('blog_name')
        # Normalize to the canonical post URL before downloading.
        url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id)
        webpage = self._download_webpage(url, video_id)
        # The player markup is JS-escaped (\x22 == double quote).
        re_video = r'src=\\x22(?P<video_url>http://%s\.tumblr\.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P<ext>.*?)\\x22' % (blog, video_id)
        video = re.search(re_video, webpage)
        # NOTE(review): the `if video is None:` guard line is missing here.
        raise ExtractorError(u'Unable to extract video')
        video_url = video.group('video_url')
        ext = video.group('ext')
        video_thumbnail = self._search_regex(r'posters(.*?)\[\\x22(?P<thumb>.*?)\\x22',
            webpage, u'thumbnail', fatal=False)  # We pick the first poster
        if video_thumbnail: video_thumbnail = video_thumbnail.replace('\\', '')
        # The only place where you can get a title, it's not complete,
        # but searching in other places doesn't work for all videos
        video_title = self._html_search_regex(r'<title>(?P<title>.*?)</title>',
            webpage, u'title', flags=re.DOTALL)
        return [{'id': video_id,
                 # NOTE(review): the 'url'/'ext' entries appear to be missing.
                 'title': video_title,
                 'thumbnail': video_thumbnail,
class BandcampIE(InfoExtractor):
    """Information extractor for free Bandcamp track downloads."""
    # NOTE(review): this extract is missing lines (parts of the returned
    # track_info dict and the final return) — confirm against full source.
    _VALID_URL = r'http://.*?\.bandcamp\.com/track/(?P<title>.*)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        title = mobj.group('title')
        webpage = self._download_webpage(url, title)
        # We get the link to the free download page
        m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage)
        if m_download is None:
            raise ExtractorError(u'No free songs found')
        download_link = m_download.group(1)
        # Track id is embedded in the TralbumData JS object.
        id = re.search(r'var TralbumData = {(.*?)id: (?P<id>\d*?)$',
                       webpage, re.MULTILINE|re.DOTALL).group('id')
        download_webpage = self._download_webpage(download_link, id,
                                                  'Downloading free downloads page')
        # We get the dictionary of the track from some javascrip code
        info = re.search(r'items: (.*?),$',
                         download_webpage, re.MULTILINE).group(1)
        info = json.loads(info)[0]
        # We pick mp3-320 for now, until format selection can be easily implemented.
        mp3_info = info[u'downloads'][u'mp3-320']
        # If we try to use this url it says the link has expired
        initial_url = mp3_info[u'url']
        re_url = r'(?P<server>http://(.*?)\.bandcamp\.com)/download/track\?enc=mp3-320&fsig=(?P<fsig>.*?)&id=(?P<id>.*?)&ts=(?P<ts>.*)$'
        m_url = re.match(re_url, initial_url)
        #We build the url we will use to get the final track url
        # This url is build in Bandcamp in the script download_bunde_*.js
        request_url = '%s/statdownload/track?enc=mp3-320&fsig=%s&id=%s&ts=%s&.rand=665028774616&.vrs=1' % (m_url.group('server'), m_url.group('fsig'), id, m_url.group('ts'))
        final_url_webpage = self._download_webpage(request_url, id, 'Requesting download url')
        # If we could correctly generate the .rand field the url would be
        #in the "download_url" key
        final_url = re.search(r'"retry_url":"(.*?)"', final_url_webpage).group(1)
        track_info = {'id':id,
                      'title' : info[u'title'],
                      # NOTE(review): the 'ext'/'url' entries appear missing.
                      'thumbnail' : info[u'thumb_url'],
                      'uploader' : info[u'artist']
class RedTubeIE(InfoExtractor):
    """Information Extractor for redtube"""
    # NOTE(review): this extract is missing lines (None-guard, tail of the
    # title regex call, return dict) — confirm against the full source file.
    _VALID_URL = r'(?:http://)?(?:www\.)?redtube\.com/(?P<id>[0-9]+)'

    def _real_extract(self,url):
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): the `if mobj is None:` guard line is missing here.
        raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('id')
        video_extension = 'mp4'  # the site serves a single mp4 source
        webpage = self._download_webpage(url, video_id)
        self.report_extraction(video_id)
        video_url = self._html_search_regex(r'<source src="(.+?)" type="video/mp4">',
            webpage, u'video URL')
        # NOTE(review): the remaining arguments of this call are missing here.
        video_title = self._html_search_regex('<h1 class="videoTitle slidePanelMovable">(.+?)</h1>',
        # NOTE(review): the enclosing info-dict literal is missing here.
        'ext': video_extension,
        'title': video_title,
class InaIE(InfoExtractor):
    """Information Extractor for Ina.fr"""
    # NOTE(review): this extract is missing lines (tail of the title regex
    # call and the return dict) — confirm against the full source file.
    _VALID_URL = r'(?:http://)?(?:www\.)?ina\.fr/video/(?P<id>I[0-9]+)/.*'

    def _real_extract(self,url):
        mobj = re.match(self._VALID_URL, url)
        video_id = mobj.group('id')
        # Metadata comes from the player's MRSS feed, not the HTML page.
        mrss_url='http://player.ina.fr/notices/%s.mrss' % video_id
        video_extension = 'mp4'
        webpage = self._download_webpage(mrss_url, video_id)
        self.report_extraction(video_id)
        video_url = self._html_search_regex(r'<media:player url="(?P<mp4url>http://mp4.ina.fr/[^"]+\.mp4)',
            webpage, u'video URL')
        # NOTE(review): the remaining arguments of this call are missing here.
        video_title = self._search_regex(r'<title><!\[CDATA\[(?P<titre>.*?)]]></title>',
        # NOTE(review): the enclosing info-dict literal is missing here.
        'ext': video_extension,
        'title': video_title,
class HowcastIE(InfoExtractor):
    """Information Extractor for Howcast.com"""
    # NOTE(review): this extract is missing lines (tail of the title regex
    # call and the return-dict opening) — confirm against the full source.
    _VALID_URL = r'(?:https?://)?(?:www\.)?howcast\.com/videos/(?P<id>\d+)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        video_id = mobj.group('id')
        # Rebuild a canonical page URL from the id.
        webpage_url = 'http://www.howcast.com/videos/' + video_id
        webpage = self._download_webpage(webpage_url, video_id)
        self.report_extraction(video_id)
        video_url = self._search_regex(r'\'?file\'?: "(http://mobile-media\.howcast\.com/[0-9]+\.mp4)',
            webpage, u'video URL')
        # NOTE(review): the remaining arguments of this call are missing here.
        video_title = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') property=\'og:title\'',
        video_description = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') name=\'description\'',
            webpage, u'description', fatal=False)
        thumbnail = self._html_search_regex(r'<meta content=\'(.+?)\' property=\'og:image\'',
            webpage, u'thumbnail', fatal=False)
        # NOTE(review): the enclosing info-dict literal is missing here.
        'title': video_title,
        'description': video_description,
        'thumbnail': thumbnail,
class VineIE(InfoExtractor):
    """Information Extractor for Vine.co"""
    # NOTE(review): this extract is missing lines (tail of the title regex
    # call and the return-dict opening) — confirm against the full source.
    _VALID_URL = r'(?:https?://)?(?:www\.)?vine\.co/v/(?P<id>\w+)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        video_id = mobj.group('id')
        webpage_url = 'https://vine.co/v/' + video_id
        webpage = self._download_webpage(webpage_url, video_id)
        self.report_extraction(video_id)
        # The raw stream URL is exposed via a twitter:player meta tag.
        video_url = self._html_search_regex(r'<meta property="twitter:player:stream" content="(.+?)"',
            webpage, u'video URL')
        # NOTE(review): the remaining arguments of this call are missing here.
        video_title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',
        thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)(\?.*?)?"',
            webpage, u'thumbnail', fatal=False)
        uploader = self._html_search_regex(r'<div class="user">.*?<h2>(.+?)</h2>',
            webpage, u'uploader', fatal=False, flags=re.DOTALL)
        # NOTE(review): the enclosing info-dict literal is missing here.
        'title': video_title,
        'thumbnail': thumbnail,
        'uploader': uploader,
class FlickrIE(InfoExtractor):
    """Information Extractor for Flickr videos"""
    # NOTE(review): this extract is missing lines (None-guard and the
    # return-dict opening) — confirm against the full source file.
    _VALID_URL = r'(?:https?://)?(?:www\.)?flickr\.com/photos/(?P<uploader_id>[\w\-_@]+)/(?P<id>\d+).*'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        video_id = mobj.group('id')
        video_uploader_id = mobj.group('uploader_id')
        webpage_url = 'http://www.flickr.com/photos/' + video_uploader_id + '/' + video_id
        webpage = self._download_webpage(webpage_url, video_id)
        # The per-photo secret is needed for the video XML endpoints below.
        secret = self._search_regex(r"photo_secret: '(\w+)'", webpage, u'secret')
        first_url = 'https://secure.flickr.com/apps/video/video_mtl_xml.gne?v=x&photo_id=' + video_id + '&secret=' + secret + '&bitrate=700&target=_self'
        first_xml = self._download_webpage(first_url, video_id, 'Downloading first data webpage')
        node_id = self._html_search_regex(r'<Item id="id">(\d+-\d+)</Item>',
            first_xml, u'node_id')
        second_url = 'https://secure.flickr.com/video_playlist.gne?node_id=' + node_id + '&tech=flash&mode=playlist&bitrate=700&secret=' + secret + '&rd=video.yahoo.com&noad=1'
        second_xml = self._download_webpage(second_url, video_id, 'Downloading second data webpage')
        self.report_extraction(video_id)
        mobj = re.search(r'<STREAM APP="(.+?)" FULLPATH="(.+?)"', second_xml)
        # NOTE(review): the `if mobj is None:` guard line is missing here.
        raise ExtractorError(u'Unable to extract video url')
        # Final URL = APP prefix + HTML-unescaped FULLPATH.
        video_url = mobj.group(1) + unescapeHTML(mobj.group(2))
        video_title = self._html_search_regex(r'<meta property="og:title" content=(?:"([^"]+)"|\'([^\']+)\')',
            webpage, u'video title')
        video_description = self._html_search_regex(r'<meta property="og:description" content=(?:"([^"]+)"|\'([^\']+)\')',
            webpage, u'description', fatal=False)
        thumbnail = self._html_search_regex(r'<meta property="og:image" content=(?:"([^"]+)"|\'([^\']+)\')',
            webpage, u'thumbnail', fatal=False)
        # NOTE(review): the enclosing info-dict literal is missing here.
        'title': video_title,
        'description': video_description,
        'thumbnail': thumbnail,
        'uploader_id': video_uploader_id,
class TeamcocoIE(InfoExtractor):
    """Information extractor for teamcoco.com videos."""
    # NOTE(review): this extract is missing lines (None-guard, tails of two
    # regex calls, the return dict) — confirm against the full source file.
    _VALID_URL = r'http://teamcoco\.com/video/(?P<url_title>.*)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): the `if mobj is None:` guard line is missing here.
        raise ExtractorError(u'Invalid URL: %s' % url)
        url_title = mobj.group('url_title')
        webpage = self._download_webpage(url, url_title)
        # The numeric id is read off the page, not from the URL slug.
        video_id = self._html_search_regex(r'<article class="video" data-id="(\d+?)"',
            webpage, u'video id')
        self.report_extraction(video_id)
        # NOTE(review): the remaining arguments of this call are missing here.
        video_title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',
        thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)"',
            webpage, u'thumbnail', fatal=False)
        video_description = self._html_search_regex(r'<meta property="og:description" content="(.*?)"',
            webpage, u'description', fatal=False)
        data_url = 'http://teamcoco.com/cvp/2.0/%s.xml' % video_id
        data = self._download_webpage(data_url, video_id, 'Downloading data webpage')
        # NOTE(review): the remaining arguments of this call are missing here.
        video_url = self._html_search_regex(r'<file type="high".*?>(.*?)</file>',
        # NOTE(review): the enclosing info-dict literal is missing here.
        'title': video_title,
        'thumbnail': thumbnail,
        'description': video_description,
class XHamsterIE(InfoExtractor):
    """Information Extractor for xHamster"""
    # NOTE(review): this extract is missing lines (None-guard, `else:` of the
    # server check, tail of the title regex, the return-dict opening) —
    # confirm against the full source file.
    _VALID_URL = r'(?:http://)?(?:www.)?xhamster\.com/movies/(?P<id>[0-9]+)/.*\.html'

    def _real_extract(self,url):
        mobj = re.match(self._VALID_URL, url)
        video_id = mobj.group('id')
        # Canonical movie URL rebuilt from the id.
        mrss_url = 'http://xhamster.com/movies/%s/.html' % video_id
        webpage = self._download_webpage(mrss_url, video_id)
        mobj = re.search(r'\'srv\': \'(?P<server>[^\']*)\',\s*\'file\': \'(?P<file>[^\']+)\',', webpage)
        # NOTE(review): the `if mobj is None:` guard line is missing here.
        raise ExtractorError(u'Unable to extract media URL')
        if len(mobj.group('server')) == 0:
            # Empty server means 'file' is already a full (urlencoded) URL.
            video_url = compat_urllib_parse.unquote(mobj.group('file'))
        # NOTE(review): the `else:` line is missing here.
            video_url = mobj.group('server')+'/key='+mobj.group('file')
        video_extension = video_url.split('.')[-1]
        # NOTE(review): the remaining arguments of this call are missing here.
        video_title = self._html_search_regex(r'<title>(?P<title>.+?) - xHamster\.com</title>',
        # Can't see the description anywhere in the UI
        # video_description = self._html_search_regex(r'<span>Description: </span>(?P<description>[^<]+)',
        # webpage, u'description', fatal=False)
        # if video_description: video_description = unescapeHTML(video_description)
        mobj = re.search(r'hint=\'(?P<upload_date_Y>[0-9]{4})-(?P<upload_date_m>[0-9]{2})-(?P<upload_date_d>[0-9]{2}) [0-9]{2}:[0-9]{2}:[0-9]{2} [A-Z]{3,4}\'', webpage)
        # NOTE(review): the `if mobj:` guard line is missing here.
            video_upload_date = mobj.group('upload_date_Y')+mobj.group('upload_date_m')+mobj.group('upload_date_d')
        # NOTE(review): the `else:` line is missing here.
            video_upload_date = None
            self._downloader.report_warning(u'Unable to extract upload date')
        video_uploader_id = self._html_search_regex(r'<a href=\'/user/[^>]+>(?P<uploader_id>[^<]+)',
            webpage, u'uploader id', default=u'anonymous')
        video_thumbnail = self._search_regex(r'\'image\':\'(?P<thumbnail>[^\']+)\'',
            webpage, u'thumbnail', fatal=False)
        # NOTE(review): the enclosing info-dict literal is missing here.
        'ext': video_extension,
        'title': video_title,
        # 'description': video_description,
        'upload_date': video_upload_date,
        'uploader_id': video_uploader_id,
        'thumbnail': video_thumbnail
class HypemIE(InfoExtractor):
    """Information Extractor for hypem"""
    # NOTE(review): this extract is missing lines (None-guard, two `try:`
    # headers, the `key` extraction, and the return) — confirm against the
    # full source file.
    _VALID_URL = r'(?:http://)?(?:www\.)?hypem\.com/track/([^/]+)/([^/]+)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): the `if mobj is None:` guard line is missing here.
        raise ExtractorError(u'Invalid URL: %s' % url)
        track_id = mobj.group(1)
        # 'ax'/'ts' query parameters are required by the site.
        data = { 'ax': 1, 'ts': time.time() }
        data_encoded = compat_urllib_parse.urlencode(data)
        complete_url = url + "?" + data_encoded
        request = compat_urllib_request.Request(complete_url)
        response, urlh = self._download_webpage_handle(request, track_id, u'Downloading webpage with the url')
        # The session cookie must be replayed on the /serve request below.
        cookie = urlh.headers.get('Set-Cookie', '')
        self.report_extraction(track_id)
        html_tracks = self._html_search_regex(r'<script type="application/json" id="displayList-data">(.*?)</script>',
            response, u'tracks', flags=re.MULTILINE|re.DOTALL).strip()
        # NOTE(review): the `try:` line guarding this parse is missing here.
            track_list = json.loads(html_tracks)
            track = track_list[u'tracks'][0]
        raise ExtractorError(u'Hypemachine contained invalid JSON.')
        # NOTE(review): the `key = track[u"key"]` line appears to be missing.
        track_id = track[u"id"]
        artist = track[u"artist"]
        title = track[u"song"]
        serve_url = "http://hypem.com/serve/source/%s/%s" % (compat_str(track_id), compat_str(key))
        request = compat_urllib_request.Request(serve_url, "" , {'Content-Type': 'application/json'})
        request.add_header('cookie', cookie)
        song_data_json = self._download_webpage(request, track_id, u'Downloading metadata')
        # NOTE(review): the `try:` line guarding this parse is missing here.
            song_data = json.loads(song_data_json)
        raise ExtractorError(u'Hypemachine contained invalid JSON.')
        final_url = song_data[u"url"]
class Vbox7IE(InfoExtractor):
    """Information Extractor for Vbox7"""
    # NOTE(review): this extract is missing lines (None-guard and the
    # return-dict opening) — confirm against the full source file.
    _VALID_URL = r'(?:http://)?(?:www\.)?vbox7\.com/play:([^/]+)'

    def _real_extract(self,url):
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): the `if mobj is None:` guard line is missing here.
        raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group(1)
        # The play page JS-redirects; follow the scripted location manually.
        redirect_page, urlh = self._download_webpage_handle(url, video_id)
        new_location = self._search_regex(r'window\.location = \'(.*)\';', redirect_page, u'redirect location')
        redirect_url = urlh.geturl() + new_location
        webpage = self._download_webpage(redirect_url, video_id, u'Downloading redirect page')
        title = self._html_search_regex(r'<title>(.*)</title>',
            webpage, u'title').split('/')[0].strip()
        # Media URL comes from a form-encoded POST to magare.do.
        info_url = "http://vbox7.com/play/magare.do"
        data = compat_urllib_parse.urlencode({'as3':'1','vid':video_id})
        info_request = compat_urllib_request.Request(info_url, data)
        info_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
        info_response = self._download_webpage(info_request, video_id, u'Downloading info webpage')
        if info_response is None:
            raise ExtractorError(u'Unable to extract the media url')
        # Response body is `url=<final>&thumb=<thumbnail>`.
        (final_url, thumbnail_url) = map(lambda x: x.split('=')[1], info_response.split('&'))
        # NOTE(review): the enclosing info-dict literal is missing here.
        'thumbnail': thumbnail_url,
def gen_extractors():
    """ Return a list of an instance of every supported extractor.
    The order does matter; the first extractor matched is the one handling the URL.
    # NOTE(review): the docstring closing, the `return [` opening and most of
    # the extractor list entries are missing from this extract — only three
    # sampled entries remain below; confirm against the full source file.
        YoutubePlaylistIE(),
        StanfordOpenClassroomIE(),
        WorldStarHipHopIE(),
def get_info_extractor(ie_name):
    """Returns the info extractor class with the given ie_name"""
    # Extractor classes follow the <Name>IE naming convention; resolve the
    # class from this module's namespace by appending the suffix.
    class_name = '%sIE' % ie_name
    return globals()[class_name]