10 import xml.etree.ElementTree
19 from .extractor.common import InfoExtractor, SearchInfoExtractor
21 from .extractor.ard import ARDIE
22 from .extractor.arte import ArteTvIE
23 from .extractor.bliptv import BlipTVIE, BlipTVUserIE
24 from .extractor.comedycentral import ComedyCentralIE
25 from .extractor.collegehumor import CollegeHumorIE
26 from .extractor.dailymotion import DailymotionIE
27 from .extractor.depositfiles import DepositFilesIE
28 from .extractor.escapist import EscapistIE
29 from .extractor.facebook import FacebookIE
30 from .extractor.gametrailers import GametrailersIE
31 from .extractor.generic import GenericIE
32 from .extractor.googleplus import GooglePlusIE
33 from .extractor.googlesearch import GoogleSearchIE
34 from .extractor.metacafe import MetacafeIE
35 from .extractor.myvideo import MyVideoIE
36 from .extractor.statigram import StatigramIE
37 from .extractor.photobucket import PhotobucketIE
38 from .extractor.soundcloud import SoundcloudIE, SoundcloudSetIE
39 from .extractor.vimeo import VimeoIE
40 from .extractor.xvideos import XVideosIE
41 from .extractor.yahoo import YahooIE, YahooSearchIE
42 from .extractor.youtube import YoutubeIE, YoutubePlaylistIE, YoutubeSearchIE, YoutubeUserIE, YoutubeChannelIE
43 from .extractor.zdf import ZDFIE
class InfoQIE(InfoExtractor):
    """Information extractor for infoq.com"""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'

    def _real_extract(self, url):
        """Extract the stream URL and metadata of one InfoQ page.

        Returns a one-element list holding the info dictionary.
        Raises ExtractorError when the URL does not match, when the page
        carries no stream reference, or when the stream file name has no
        extension.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        webpage = self._download_webpage(url, video_id=url)
        self.report_extraction(url)

        # Extract video URL: the page stores it base64-encoded and URL-quoted.
        mobj = re.search(r"jsclassref ?= ?'([^']*)'", webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract video url')
        encoded_id = mobj.group(1).encode('ascii')
        real_id = compat_urllib_parse.unquote(base64.b64decode(encoded_id).decode('utf-8'))
        video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id

        # Extract title
        video_title = self._search_regex(r'contentTitle = "(.*?)";',
            webpage, u'title')

        # Extract description
        video_description = self._html_search_regex(r'<meta name="description" content="(.*)"(?:\s*/)?>',
            webpage, u'description', fatal=False)

        # Split on the LAST dot only, so ids that contain dots themselves do
        # not break the unpacking.
        video_filename = video_url.split('/')[-1]
        video_id, dot, extension = video_filename.rpartition('.')
        if not dot:
            raise ExtractorError(u'Unable to extract video url')

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': extension, # Extension is always(?) mp4, but seems to be flv
            'thumbnail': None,
            'description': video_description,
        }

        return [info]
class MixcloudIE(InfoExtractor):
    """Information extractor for www.mixcloud.com"""

    _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
    _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
    # NOTE(review): this attribute line is not visible in the damaged listing
    # and was restored from context - confirm against the pristine file.
    IE_NAME = u'mixcloud'

    def report_download_json(self, file_id):
        """Report JSON download."""
        self.to_screen(u'Downloading json')

    def get_urls(self, jsonData, fmt, bitrate='best'):
        """Get urls from 'audio_formats' section in json"""
        try:
            bitrate_list = jsonData[fmt]
            if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
                bitrate = max(bitrate_list) # select highest

            url_list = jsonData[fmt][bitrate]
        except TypeError: # we have no bitrate info.
            url_list = jsonData[fmt]
        return url_list

    def check_urls(self, url_list):
        """Returns 1st active url from list"""
        for url in url_list:
            try:
                # Only reachability matters; release the connection at once.
                compat_urllib_request.urlopen(url).close()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error):
                continue
            return url

        return None

    def _print_formats(self, formats):
        """Print every format/bitrate combination with its file extension."""
        print('Available formats:')
        for fmt in formats.keys():
            for b in formats[fmt]:
                try:
                    ext = formats[fmt][b][0]
                    print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
                except TypeError: # we have no bitrate info
                    ext = formats[fmt][0]
                    print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
                    break

    def _real_extract(self, url):
        """Resolve a cloudcast URL to the first reachable audio file.

        Returns a one-element list with the info dictionary, or None when
        only a format listing was requested.  Raises ExtractorError when the
        URL is invalid, the API cannot be read, the requested format is
        missing, or no stream URL answers.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        # extract uploader & filename from url
        # (regex groups are already text, so no .decode() is needed)
        uploader = mobj.group(1)
        file_id = uploader + "-" + mobj.group(2)

        # construct API request
        api_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
        # retrieve .json file with links to files
        request = compat_urllib_request.Request(api_url)
        try:
            self.report_download_json(api_url)
            jsonData = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to retrieve file: %s' % compat_str(err))

        # parse JSON
        if isinstance(jsonData, bytes):
            jsonData = jsonData.decode('utf-8')
        json_data = json.loads(jsonData)
        player_url = json_data['player_swf_url']
        formats = dict(json_data['audio_formats'])

        req_format = self._downloader.params.get('format', None)

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        # Kept separate from api_url so that an empty format table can never
        # leak the API address out as the media URL.
        file_url = None
        format_param = None
        if req_format is None or req_format == 'best':
            for candidate in formats.keys():
                format_param = candidate
                url_list = self.get_urls(formats, candidate)
                # check urls
                file_url = self.check_urls(url_list)
                if file_url is not None:
                    break # got it!
        else:
            if req_format not in formats:
                raise ExtractorError(u'Format is not available')

            url_list = self.get_urls(formats, req_format)
            file_url = self.check_urls(url_list)
            format_param = req_format

        if file_url is None:
            raise ExtractorError(u'Unable to retrieve file: no active URL found')

        return [{
            'id': file_id,
            'url': file_url,
            'uploader': uploader,
            'upload_date': None,
            'title': json_data['name'],
            'ext': file_url.split('.')[-1],
            'format': u'NA' if format_param is None else format_param,
            'thumbnail': json_data['thumbnail_url'],
            'description': json_data['description'],
            'player_url': player_url,
        }]
class StanfordOpenClassroomIE(InfoExtractor):
    """Information extractor for Stanford's Open ClassRoom"""

    _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
    IE_NAME = u'stanfordoc'

    def _extract_references(self, page, link_re):
        """Run self.extract() on every distinct link found in *page*.

        *link_re* must capture the href relative to /MainFolder/.  The
        results of all sub-extractions are concatenated in page order.
        """
        results = []
        for relative in orderedSet(re.findall(link_re, page)):
            target = 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(relative)
            results += self.extract(target)
        return results

    def _real_extract(self, url):
        """Dispatch on the URL shape: single video, course page or root page.

        A video URL yields a one-element list; course and root pages yield
        the concatenated results of every page they link to.  Raises
        ExtractorError on an invalid URL or unusable metadata.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        if mobj.group('course') and mobj.group('video'): # A specific video
            course = mobj.group('course')
            video = mobj.group('video')
            info = {
                'id': course + '_' + video,
                'uploader': None,
                'upload_date': None,
            }

            self.report_extraction(info['id'])
            baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
            xmlUrl = baseUrl + video + '.xml'
            try:
                metaXml = compat_urllib_request.urlopen(xmlUrl).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
            mdoc = xml.etree.ElementTree.fromstring(metaXml)
            try:
                info['title'] = mdoc.findall('./title')[0].text
                info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
            except IndexError:
                raise ExtractorError(u'Invalid metadata XML file')
            info['ext'] = info['url'].rpartition('.')[2]
            return [info]

        if mobj.group('course'): # A course page
            course = mobj.group('course')
            coursepage = self._download_webpage(url, course,
                note='Downloading course info page',
                errnote='Unable to download course info page')

            # Title and description are looked up for their progress and
            # warning output only; the result is the list of video entries.
            self._html_search_regex('<h1>([^<]+)</h1>', coursepage, 'title', default=course)
            self._html_search_regex('<description>([^<]+)</description>',
                coursepage, u'description', fatal=False)

            return self._extract_references(
                coursepage, r'<a href="(VideoPage.php\?[^"]+)">')

        # Root page.  It goes through _download_webpage like the course page,
        # so the body arrives as text and can be searched with a str pattern.
        rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
        rootpage = self._download_webpage(rootURL, 'Stanford OpenClassroom',
            errnote='Unable to download course info page')
        return self._extract_references(
            rootpage, r'<a href="(CoursePage.php\?[^"]+)">')
class MTVIE(InfoExtractor):
    """Information extractor for MTV.com"""

    _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
    # NOTE(review): this attribute line is not visible in the damaged listing
    # and was restored from context - confirm against the pristine file.
    _WORKING = False

    def _real_extract(self, url):
        """Extract the last listed rendition of an MTV video page.

        Returns a one-element list with the info dictionary.  Raises
        ExtractorError on an invalid URL, on missing page metadata, on a
        failed metadata download, or on a malformed rendition entry.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        if not mobj.group('proto'):
            url = 'http://' + url
        video_id = mobj.group('videoid')

        webpage = self._download_webpage(url, video_id)

        # Looked up for its warning output only; not part of the result.
        self._html_search_regex(r'<meta name="mtv_vt" content="([^"]+)"/>',
            webpage, u'song name', fatal=False)

        video_title = self._html_search_regex(r'<meta name="mtv_an" content="([^"]+)"/>',
            webpage, u'title')
        # NOTE(review): the result dictionary referenced an undefined name
        # `performer`.  mtv_an presumably holds the artist name, so it is
        # reused here - confirm.
        performer = video_title

        mtvn_uri = self._html_search_regex(r'<meta name="mtvn_uri" content="([^"]+)"/>',
            webpage, u'mtvn_uri', fatal=False)

        content_id = self._search_regex(r'MTVN.Player.defaultPlaylistId = ([0-9]+);',
            webpage, u'content id', fatal=False)

        # Both values are concatenated into the request below; fail with a
        # clear message instead of a TypeError on None.
        if mtvn_uri is None or content_id is None:
            raise ExtractorError(u'Unable to download video metadata: missing mtvn_uri or content id')

        videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
        self.report_extraction(video_id)
        request = compat_urllib_request.Request(videogen_url)
        try:
            metadataXml = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to download video metadata: %s' % compat_str(err))

        mdoc = xml.etree.ElementTree.fromstring(metadataXml)
        renditions = mdoc.findall('.//rendition')
        if not renditions:
            raise ExtractorError('Invalid rendition field.')

        # For now, always pick the highest quality.
        rendition = renditions[-1]

        try:
            _, _, ext = rendition.attrib['type'].partition('/')
            format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
            video_url = rendition.find('./src').text
        except (KeyError, AttributeError):
            # AttributeError: the rendition has no <src> child.
            raise ExtractorError('Invalid rendition field.')

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': performer,
            'upload_date': None,
            'title': video_title,
            'ext': ext,
            'format': format,
        }

        return [info]
class YoukuIE(InfoExtractor):
    """Information extractor for v.youku.com (segmented FLV/MP4 streams)."""
    _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'

    def _gen_sid(self):
        """Build a session id from the time in milliseconds and two random numbers."""
        nowTime = int(time.time() * 1000)
        random1 = random.randint(1000,1998)
        random2 = random.randint(1000,9999)

        return "%d%d%d" %(nowTime,random1,random2)

    def _get_file_ID_mix_string(self, seed):
        """Return the alphabet shuffled by the seed-driven generator, as a list."""
        mixed = []
        # The backslash is part of the alphabet; it is written escaped so the
        # literal carries no invalid "\:" escape sequence.
        source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\\:._-1234567890")
        seed = float(seed)
        for _ in range(len(source)):
            seed = (seed * 211 + 30031) % 65536
            index = int(math.floor(seed / 65536 * len(source)))
            # Characters are unique, so popping by position equals the
            # former remove-by-value.
            mixed.append(source.pop(index))
        return mixed

    def _get_file_id(self, fileId, seed):
        """Decode a '*'-separated index string through the shuffled alphabet."""
        mixed = self._get_file_ID_mix_string(seed)
        return ''.join(mixed[int(ch)] for ch in fileId.split('*') if ch)

    def _real_extract(self, url):
        """Return one info dictionary per segment of the requested video.

        Raises ExtractorError on an invalid URL or when the playlist JSON
        lacks the expected fields.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('ID')

        info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id

        jsondata = self._download_webpage(info_url, video_id)

        self.report_extraction(video_id)
        try:
            config = json.loads(jsondata)

            video_title = config['data'][0]['title']
            seed = config['data'][0]['seed']

            format = self._downloader.params.get('format', None)
            supported_format = list(config['data'][0]['streamfileids'].keys())

            if format is None or format == 'best':
                if 'hd2' in supported_format:
                    format = 'hd2'
                else:
                    format = 'flv'
                ext = u'flv'
            elif format == 'worst':
                format = 'mp4'
                ext = u'mp4'
            else:
                format = 'flv'
                ext = u'flv'

            fileid = config['data'][0]['streamfileids'][format]
            keys = [s['k'] for s in config['data'][0]['segs'][format]]
        except (UnicodeDecodeError, ValueError, KeyError):
            raise ExtractorError(u'Unable to extract info section')

        files_info = []
        sid = self._gen_sid()
        fileid = self._get_file_id(fileid, seed)

        #column 8,9 of fileid represent the segment number
        #fileid[7:9] should be changed
        for index, key in enumerate(keys):
            temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
            download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)

            info = {
                'id': '%s_part%02d' % (video_id, index),
                'url': download_url,
                'uploader': None,
                'upload_date': None,
                'title': video_title,
                'ext': ext,
            }
            files_info.append(info)

        return files_info
class XNXXIE(InfoExtractor):
    """Information extractor for xnxx.com"""

    _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
    # NOTE(review): this attribute line is not visible in the damaged listing
    # and was restored from context - confirm against the pristine file.
    IE_NAME = u'xnxx'
    VIDEO_URL_RE = r'flv_url=(.*?)&'
    VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
    VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&'

    def _real_extract(self, url):
        """Extract the FLV URL, title and thumbnail of one video page.

        Returns a one-element list with the info dictionary.  Raises
        ExtractorError on an invalid URL or when the URL or title cannot be
        found; a missing thumbnail is tolerated.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group(1)

        # Get webpage content
        webpage = self._download_webpage(url, video_id)

        video_url = self._search_regex(self.VIDEO_URL_RE,
            webpage, u'video URL')
        video_url = compat_urllib_parse.unquote(video_url)

        video_title = self._html_search_regex(self.VIDEO_TITLE_RE,
            webpage, u'title')

        video_thumbnail = self._search_regex(self.VIDEO_THUMB_RE,
            webpage, u'thumbnail', fatal=False)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }]
class NBAIE(InfoExtractor):
    """Information extractor for video pages on nba.com."""
    _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*?)(?:/index\.html)?(?:\?.*)?$'
    # NOTE(review): this attribute line is not visible in the damaged listing
    # and was restored from context - confirm against the pristine file.
    IE_NAME = u'nba'

    def _real_extract(self, url):
        """Build the CDN URL from the page path and read title/description.

        Returns a one-element list with the info dictionary.  Raises
        ExtractorError on an invalid URL.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group(1)

        webpage = self._download_webpage(url, video_id)

        video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'

        # The last path component serves as id and as fallback title.
        shortened_video_id = video_id.rpartition('/')[2]
        title = self._html_search_regex(r'<meta property="og:title" content="(.*?)"',
            webpage, 'title', default=shortened_video_id).replace('NBA.com: ', '')

        # It isn't there in the HTML it returns to us
        # uploader_date = self._html_search_regex(r'<b>Date:</b> (.*?)</div>', webpage, 'upload_date', fatal=False)

        description = self._html_search_regex(r'<meta name="description" (?:content|value)="(.*?)" />', webpage, 'description', fatal=False)

        info = {
            'id': shortened_video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            # 'uploader_date': uploader_date,
            'description': description,
        }
        return [info]
class JustinTVIE(InfoExtractor):
    """Information extractor for justin.tv and twitch.tv"""
    # TODO: One broadcast may be split into multiple videos. The key
    # 'broadcast_id' is the same for all parts, and 'broadcast_part'
    # starts at 1 and increases. Can we treat all parts as one video?

    _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
        (?:
            (?P<channelid>[^/]+)|
            (?:(?:[^/]+)/b/(?P<videoid>[^/]+))|
            (?:(?:[^/]+)/c/(?P<chapterid>[^/]+))
        )
        /?(?:\#.*)?$
        """
    _JUSTIN_PAGE_LIMIT = 100
    IE_NAME = u'justin.tv'

    def report_download_page(self, channel, offset):
        """Report attempt to download a single page of videos."""
        self.to_screen(u'%s: Downloading video information from %d to %d' %
                (channel, offset, offset + self._JUSTIN_PAGE_LIMIT))

    # Return count of items, list of *valid* items
    def _parse_page(self, url, video_id):
        """Download one API page and convert its clips to info dictionaries.

        Returns (number of clips in the response, info dictionaries of the
        clips that carry a video URL).  Raises ExtractorError when the API
        answers with something other than a list.
        """
        webpage = self._download_webpage(url, video_id,
                                         u'Downloading video info JSON',
                                         u'unable to download video info JSON')

        response = json.loads(webpage)
        if not isinstance(response, list):
            error_text = response.get('error', 'unknown error')
            raise ExtractorError(u'Justin.tv API: %s' % error_text)
        info = []
        for clip in response:
            video_url = clip['video_file_url']
            if not video_url:
                continue
            video_uploader_id = clip.get('user_id', clip.get('channel_id'))
            video_id = clip['id']
            info.append({
                'id': video_id,
                'url': video_url,
                'title': clip.get('title', video_id),
                'uploader': clip.get('channel_name', video_uploader_id),
                'uploader_id': video_uploader_id,
                'upload_date': re.sub('-', '', clip['start_time'][:10]),
                'ext': os.path.splitext(video_url)[1][1:],
            })
        return (len(response), info)

    def _extract_chapter(self, url, chapter_id):
        """Resolve a /c/ chapter URL to the archive file that contains it."""
        webpage = self._download_webpage(url, chapter_id)
        m = re.search(r'PP\.archive_id = "([0-9]+)";', webpage)
        if not m:
            raise ExtractorError(u'Cannot find archive of a chapter')
        archive_id = m.group(1)

        api = 'http://api.justin.tv' + '/broadcast/by_chapter/%s.xml' % chapter_id
        chapter_info_xml = self._download_webpage(api, chapter_id,
                                         note=u'Downloading chapter information',
                                         errnote=u'Chapter information download failed')
        doc = xml.etree.ElementTree.fromstring(chapter_info_xml)
        for a in doc.findall('.//archive'):
            if archive_id == a.find('./id').text:
                break
        else:
            raise ExtractorError(u'Could not find chapter in chapter information')

        video_url = a.find('./video_file_url').text
        # rpartition() hands back the whole string as its last element when
        # there is no dot, so the separator must be checked for the 'flv'
        # fallback to be reachable at all.
        _, dot, suffix = video_url.rpartition('.')
        video_ext = suffix if (dot and suffix) else u'flv'

        chapter_api_url = u'https://api.twitch.tv/kraken/videos/c' + chapter_id
        chapter_info_json = self._download_webpage(chapter_api_url, u'c' + chapter_id,
                                   note='Downloading chapter metadata',
                                   errnote='Download of chapter metadata failed')
        chapter_info = json.loads(chapter_info_json)

        bracket_start = int(doc.find('.//bracket_start').text)
        bracket_end = int(doc.find('.//bracket_end').text)

        # TODO determine start (and probably fix up file)
        #  youtube-dl -v http://www.twitch.tv/firmbelief/c/1757457
        #video_url += u'?start=' + TODO:start_timestamp
        # bracket_start is 13290, but we want 51670615
        self._downloader.report_warning(u'Chapter detected, but we can just download the whole file. '
                                        u'Chapter starts at %s and ends at %s' % (formatSeconds(bracket_start), formatSeconds(bracket_end)))

        info = {
            'id': u'c' + chapter_id,
            'url': video_url,
            'ext': video_ext,
            'title': chapter_info['title'],
            'thumbnail': chapter_info['preview'],
            'description': chapter_info['description'],
            'uploader': chapter_info['channel']['display_name'],
            'uploader_id': chapter_info['channel']['name'],
        }
        return [info]

    def _real_extract(self, url):
        """Extract a channel archive (paged), a single broadcast or a chapter.

        Returns a list of info dictionaries.  Raises ExtractorError on an
        invalid URL or when the API reports an error.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'invalid URL: %s' % url)

        api_base = 'http://api.justin.tv'
        paged = False
        if mobj.group('channelid'):
            paged = True
            video_id = mobj.group('channelid')
            api = api_base + '/channel/archives/%s.json' % video_id
        elif mobj.group('chapterid'):
            return self._extract_chapter(url, mobj.group('chapterid'))
        else:
            video_id = mobj.group('videoid')
            api = api_base + '/broadcast/by_archive/%s.json' % video_id

        self.report_extraction(video_id)

        info = []
        offset = 0
        limit = self._JUSTIN_PAGE_LIMIT
        while True:
            if paged:
                self.report_download_page(video_id, offset)
            page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
            page_count, page_info = self._parse_page(page_url, video_id)
            info.extend(page_info)
            # A short page is the last page.
            if not paged or page_count != limit:
                break
            offset += limit
        return info
class FunnyOrDieIE(InfoExtractor):
    """Information extractor for funnyordie.com video pages."""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'

    def _real_extract(self, url):
        """Extract the MP4 source, title and description of one video page.

        Returns a one-element list with the info dictionary.  Raises
        ExtractorError on an invalid URL or when the URL or title cannot be
        found; a missing description is tolerated.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'invalid URL: %s' % url)

        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)

        # The pattern skips the first <source> tag and reads the second.
        video_url = self._html_search_regex(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"',
            webpage, u'video URL', flags=re.DOTALL)

        # Page heading first, <title> tag as fallback.
        title = self._html_search_regex((r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>",
            r'<title>(?P<title>[^<]+?)</title>'), webpage, 'title', flags=re.DOTALL)

        video_description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
            webpage, u'description', fatal=False, flags=re.DOTALL)

        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'description': video_description,
        }
        return [info]
class SteamIE(InfoExtractor):
    """Information extractor for the trailers of a Steam store page."""
    _VALID_URL = r"""http://store\.steampowered\.com/
                (agecheck/)?
                (?P<urltype>video|app)/ #If the page is only for videos or for a game
                (?P<gameID>\d+)/?
                (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
                """
    _VIDEO_PAGE_TEMPLATE = 'http://store.steampowered.com/video/%s/'
    _AGECHECK_TEMPLATE = 'http://store.steampowered.com/agecheck/video/%s/?snr=1_agecheck_agecheck__age-gate&ageDay=1&ageMonth=January&ageYear=1970'

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is written in verbose syntax, hence the explicit flag.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        """Collect every trailer of the game as one playlist.

        Returns a one-element list holding the playlist result.  Raises
        ExtractorError on an invalid URL or when a trailer has no URL.
        """
        m = re.match(self._VALID_URL, url, re.VERBOSE)
        if m is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        gameID = m.group('gameID')

        videourl = self._VIDEO_PAGE_TEMPLATE % gameID
        webpage = self._download_webpage(videourl, gameID)

        if re.search('<h2>Please enter your birth date to continue:</h2>', webpage) is not None:
            # Age gate: ask again with a fixed birth date in the query string.
            videourl = self._AGECHECK_TEMPLATE % gameID
            self.report_age_confirmation()
            webpage = self._download_webpage(videourl, gameID)

        self.report_extraction(gameID)
        game_title = self._html_search_regex(r'<h2 class="pageheader">(.*?)</h2>',
                                             webpage, 'game title')

        urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
        mweb = re.finditer(urlRE, webpage)
        namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
        titles = re.finditer(namesRE, webpage)
        thumbsRE = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'
        thumbs = re.finditer(thumbsRE, webpage)
        videos = []
        # The three match streams are paired purely by their order on the page.
        for vid, vtitle, thumb in zip(mweb, titles, thumbs):
            video_id = vid.group('videoID')
            title = vtitle.group('videoName')
            video_url = vid.group('videoURL')
            video_thumb = thumb.group('thumbnail')
            if not video_url:
                raise ExtractorError(u'Cannot find video url for %s' % video_id)
            info = {
                'id': video_id,
                'url': video_url,
                'ext': 'flv',
                'title': unescapeHTML(title),
                'thumbnail': video_thumb
            }
            videos.append(info)
        return [self.playlist_result(videos, gameID, game_title)]
class UstreamIE(InfoExtractor):
    """Information extractor for recorded ustream.tv videos."""
    _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
    # NOTE(review): this attribute line is not visible in the damaged listing
    # and was restored from context - confirm against the pristine file.
    IE_NAME = u'ustream'

    def _real_extract(self, url):
        """Build the CDN URL from the id and read the metadata from the page.

        Returns the info dictionary.  Raises ExtractorError on an invalid
        URL or a missing title; uploader and thumbnail are optional.
        """
        m = re.match(self._VALID_URL, url)
        if m is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = m.group('videoID')

        video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
        webpage = self._download_webpage(url, video_id)

        self.report_extraction(video_id)

        video_title = self._html_search_regex(r'data-title="(?P<title>.+)"',
            webpage, u'title')

        uploader = self._html_search_regex(r'data-content-type="channel".*?>(?P<uploader>.*?)</a>',
            webpage, u'uploader', fatal=False, flags=re.DOTALL)

        thumbnail = self._html_search_regex(r'<link rel="image_src" href="(?P<thumb>.*?)"',
            webpage, u'thumbnail', fatal=False)

        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'flv',
            'title': video_title,
            'uploader': uploader,
            'thumbnail': thumbnail,
        }
        # NOTE(review): the return statement is not visible in the damaged
        # listing; a bare dictionary is assumed here - confirm.
        return info
class WorldStarHipHopIE(InfoExtractor):
    """Information extractor for worldstarhiphop.com and worldstarcandy.com."""
    _VALID_URL = r'https?://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
    IE_NAME = u'WorldStarHipHop'

    def _real_extract(self, url):
        """Extract the media URL, title and thumbnail of one video page.

        Returns a one-element list with the info dictionary.  Raises
        ExtractorError on an invalid URL or when the URL or title cannot be
        found.
        """
        m = re.match(self._VALID_URL, url)
        if m is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = m.group('id')

        webpage_src = self._download_webpage(url, video_id)

        video_url = self._search_regex(r'so\.addVariable\("file","(.*?)"\)',
            webpage_src, u'video URL')

        if 'mp4' in video_url:
            ext = 'mp4'
        else:
            ext = 'flv'

        video_title = self._html_search_regex(r"<title>(.*)</title>",
            webpage_src, u'title')

        # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video.
        thumbnail = self._html_search_regex(r'rel="image_src" href="(.*)" />',
            webpage_src, u'thumbnail', fatal=False)

        if not thumbnail:
            _title = r"""candytitles.*>(.*)</span>"""
            mobj = re.search(_title, webpage_src)
            if mobj is not None:
                video_title = mobj.group(1)

        results = [{
            'id': video_id,
            'url' : video_url,
            'title' : video_title,
            'thumbnail' : thumbnail,
            'ext' : ext,
        }]
        return results
class RBMARadioIE(InfoExtractor):
    """Information extractor for rbmaradio.com shows."""
    _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'

    def _real_extract(self, url):
        """Read the show description embedded in the page as JSON.

        Returns a one-element list with the info dictionary.  Raises
        ExtractorError on an invalid URL or when the embedded data is
        missing or not valid JSON.
        """
        m = re.match(self._VALID_URL, url)
        if m is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = m.group('videoID')

        webpage = self._download_webpage(url, video_id)

        json_data = self._search_regex(r'window\.gon.*?gon\.show=(.+?);$',
            webpage, u'json data', flags=re.MULTILINE)

        try:
            data = json.loads(json_data)
        except ValueError as e:
            raise ExtractorError(u'Invalid JSON: ' + str(e))

        video_url = data['akamai_url'] + '&cbr=256'
        # The extension comes from the path alone, so the query string
        # appended above cannot leak into it.
        url_parts = compat_urllib_parse_urlparse(video_url)
        video_ext = url_parts.path.rpartition('.')[2]
        host = data.get('host', {})
        info = {
            'id': video_id,
            'url': video_url,
            'ext': video_ext,
            'title': data['title'],
            'description': data.get('teaser_text'),
            'location': data.get('country_of_origin'),
            'uploader': host.get('name'),
            'uploader_id': host.get('slug'),
            'thumbnail': data.get('image', {}).get('large_url_2x'),
            'duration': data.get('duration'),
        }
        return [info]
class YouPornIE(InfoExtractor):
    """Information extractor for youporn.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'

    def _print_formats(self, formats):
        """Print all available formats"""
        print(u'Available formats:')
        print(u'ext\t\tformat')
        print(u'---------------------------------')
        for format in formats:
            print(u'%s\t\t%s' % (format['ext'], format['format']))

    def _specific(self, req_format, formats):
        """Return the first entry whose 'format' equals req_format, else None."""
        for x in formats:
            if x["format"] == req_format:
                return x
        return None

    def _real_extract(self, url):
        """Extract every download link of a video and pick the requested one.

        Returns a list of info dictionaries (one entry unless all formats
        were requested), or None when only a format listing was requested.
        Raises ExtractorError on an invalid URL, unusable page data, or an
        unavailable format.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('videoid')

        req = compat_urllib_request.Request(url)
        req.add_header('Cookie', 'age_verified=1')
        webpage = self._download_webpage(req, video_id)

        # Get JSON parameters
        json_params = self._search_regex(r'var currentVideo = new Video\((.*)\);', webpage, u'JSON parameters')
        try:
            params = json.loads(json_params)
        except ValueError:
            raise ExtractorError(u'Invalid JSON')

        self.report_extraction(video_id)
        try:
            video_title = params['title']
            upload_date = unified_strdate(params['release_date_f'])
            video_description = params['description']
            video_uploader = params['submitted_by']
            thumbnail = params['thumbnails'][0]['image']
        except KeyError as err:
            # An exception cannot be concatenated to a str; format it.
            raise ExtractorError('Missing JSON parameter: %s' % err)

        # Get all of the formats available
        DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
        download_list_html = self._search_regex(DOWNLOAD_LIST_RE,
            webpage, u'download list').strip()

        # Get all of the links from the page
        LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
        links = re.findall(LINK_RE, download_list_html)
        if not links:
            raise ExtractorError(u'ERROR: no known formats available for video')

        self.to_screen(u'Links found: %d' % len(links))

        formats = []
        for link in links:
            # A link looks like this:
            # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
            # A path looks like this:
            # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
            video_url = unescapeHTML(link)
            path = compat_urllib_parse_urlparse(video_url).path
            extension = os.path.splitext(path)[1][1:]
            # The fourth directory is "<size>_<bitrate>_<id>"; its first two
            # fields name the format.
            format = "-".join(path.split('/')[4].split('_')[:2])

            formats.append({
                'id': video_id,
                'url': video_url,
                'uploader': video_uploader,
                'upload_date': upload_date,
                'title': video_title,
                'ext': extension,
                'format': format,
                'thumbnail': thumbnail,
                'description': video_description
            })

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        req_format = self._downloader.params.get('format', None)
        self.to_screen(u'Format: %s' % req_format)

        if req_format is None or req_format == 'best':
            return [formats[0]]
        elif req_format == 'worst':
            return [formats[-1]]
        elif req_format in ('-1', 'all'):
            return formats
        else:
            format = self._specific(req_format, formats)
            if format is None:
                raise ExtractorError(u'Requested format not available')
            return [format]
class PornotubeIE(InfoExtractor):
    """Information extractor for pornotube.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'

    def _real_extract(self, url):
        """Extract the FLV URL and upload date; id and title come from the URL.

        Returns a one-element list with the info dictionary.  Raises
        ExtractorError on an invalid URL or a missing video URL; a missing
        upload date is tolerated.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('videoid')
        video_title = mobj.group('title')

        # Get webpage content
        webpage = self._download_webpage(url, video_id)

        # Get the video URL
        VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
        video_url = self._search_regex(VIDEO_URL_RE, webpage, u'video url')
        video_url = compat_urllib_parse.unquote(video_url)

        #Get the uploaded date
        VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
        upload_date = self._html_search_regex(VIDEO_UPLOADED_RE, webpage, u'upload date', fatal=False)
        if upload_date:
            upload_date = unified_strdate(upload_date)

        info = {'id': video_id,
                'url': video_url,
                'uploader': None,
                'upload_date': upload_date,
                'title': video_title,
                'ext': 'flv',
                'format': 'flv'}

        return [info]
class YouJizzIE(InfoExtractor):
    """Information extractor for youjizz.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'

    def _real_extract(self, url):
        """Follow the embed page of a video and extract its stream URL.

        Returns a one-element list with the info dictionary.  Raises
        ExtractorError on an invalid URL or when the title, embed page or
        stream URL cannot be found.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('videoid')

        # Get webpage content
        webpage = self._download_webpage(url, video_id)

        # Get the video title
        video_title = self._html_search_regex(r'<title>(?P<title>.*)</title>',
            webpage, u'title').strip()

        # Get the embed page (dots escaped so only the real host matches)
        result = re.search(r'https?://www\.youjizz\.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
        if result is None:
            raise ExtractorError(u'ERROR: unable to extract embed page')

        embed_page_url = result.group(0).strip()
        # From here on the numeric id of the embed page replaces the slug.
        video_id = result.group('videoid')

        webpage = self._download_webpage(embed_page_url, video_id)

        # Get the video URL
        video_url = self._search_regex(r'so\.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);',
            webpage, u'video URL')

        info = {'id': video_id,
                'url': video_url,
                'title': video_title,
                'ext': 'flv',
                'format': 'flv',
                'player_url': embed_page_url}

        return [info]
class EightTracksIE(InfoExtractor):
    """Information extractor for 8tracks.com mixes."""
    # NOTE(review): this attribute line is not visible in the damaged listing
    # and was restored from context - confirm against the pristine file.
    IE_NAME = '8tracks'
    _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'

    def _real_extract(self, url):
        """Walk a mix track by track through the play/next API.

        Returns one info dictionary per track, in playback order.  Raises
        ExtractorError on an invalid URL or when the mix data is missing.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        playlist_id = mobj.group('id')

        webpage = self._download_webpage(url, playlist_id)

        json_like = self._search_regex(r"PAGE\.mix = (.*?);\n", webpage, u'trax information', flags=re.DOTALL)
        data = json.loads(json_like)

        # A random play-session token is used for the whole walk.
        session = str(random.randint(0, 1000000000))
        mix_id = data['id']
        track_count = data['tracks_count']
        first_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
        next_url = first_url
        res = []
        for i in itertools.count():
            api_json = self._download_webpage(next_url, playlist_id,
                note=u'Downloading song information %s/%s' % (str(i+1), track_count),
                errnote=u'Failed to download song information')
            api_data = json.loads(api_json)
            track_data = api_data[u'set']['track']
            info = {
                'id': track_data['id'],
                'url': track_data['track_file_stream_url'],
                'title': track_data['performer'] + u' - ' + track_data['name'],
                'raw_title': track_data['name'],
                'uploader_id': data['user']['login'],
                'ext': 'm4a',
            }
            res.append(info)
            if api_data['set']['at_last_track']:
                break
            next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
        return res
class KeekIE(InfoExtractor):
    """Information extractor for keek.com."""
    _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
    # NOTE(review): this attribute line is not visible in the damaged listing
    # and was restored from context - confirm against the pristine file.
    IE_NAME = u'keek'

    def _real_extract(self, url):
        """Build the CDN URLs from the id and read title/uploader from the page.

        Returns a one-element list with the info dictionary.  Raises
        ExtractorError on an invalid URL or a missing title; the uploader
        is optional.
        """
        m = re.match(self._VALID_URL, url)
        if m is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = m.group('videoID')

        video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
        thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id
        webpage = self._download_webpage(url, video_id)

        video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
            webpage, u'title')

        uploader = self._html_search_regex(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>',
            webpage, u'uploader', fatal=False)

        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': video_title,
            'thumbnail': thumbnail,
            'uploader': uploader
        }
        return [info]
class TEDIE(InfoExtractor):
    """Information extractor for ted.com talks and playlists."""
    _VALID_URL=r'''http://www\.ted\.com/
                   (
                        ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
                        |
                        ((?P<type_talk>talks)) # We have a simple talk
                   )
                   (/lang/(.*?))? # The url may contain the language
                   /(?P<name>\w+) # Here goes the name and then ".html"
                   '''

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is written in verbose syntax, hence the explicit flag.
        matched = re.match(cls._VALID_URL, url, re.VERBOSE)
        return matched is not None

    def _real_extract(self, url):
        """Dispatch to talk or playlist extraction depending on the URL type."""
        parts = re.match(self._VALID_URL, url, re.VERBOSE)
        if not parts.group('type_talk'):
            playlist_id = parts.group('playlist_id')
            name = parts.group('name')
            self.to_screen(u'Getting info of playlist %s: "%s"' % (playlist_id,name))
            return [self._playlist_videos_info(url, name, playlist_id)]
        return [self._talk_info(url)]

    def _playlist_videos_info(self,url,name,playlist_id=0):
        '''Returns the videos of the playlist'''
        item_pattern = r'''
                     <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
                     ([.\s]*?)data-playlist_item_id="(\d+)"
                     ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
                     '''
        link_pattern = r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
        page = self._download_webpage(url, playlist_id, 'Downloading playlist webpage')

        heading = self._html_search_regex(
            r'div class="headline">\s*?<h1>\s*?<span>(.*?)</span>',
            page, 'playlist title')

        item_matches = re.finditer(item_pattern, page, re.VERBOSE)
        link_matches = re.finditer(link_pattern, page)
        # zip() stops at the shorter iterator, so a talk link is only emitted
        # when it can be paired with a playlist item match.
        entries = [
            self.url_result('http://www.ted.com%s' % link.group('talk_url'), 'TED')
            for _item, link in zip(item_matches, link_matches)
        ]
        return self.playlist_result(entries, playlist_id = playlist_id, playlist_title = heading)

    def _talk_info(self, url, video_id=0):
        """Return the video for the talk in the url"""
        talk_name = re.match(self._VALID_URL, url, re.VERBOSE).group('name')
        page = self._download_webpage(url, video_id, 'Downloading \"%s\" page' % talk_name)
        self.report_extraction(talk_name)
        # If the url includes the language we get the title translated
        title = self._html_search_regex(
            r'<span id="altHeadline" >(?P<title>.*)</span>',
            page, 'title')
        raw_details = self._search_regex(
            r'<script.*?>var talkDetails = ({.*?})</script>',
            page, 'json data')
        details = json.loads(raw_details)
        summary = self._html_search_regex(
            r'<div class="talk-intro">.*?<p.*?>(.*?)</p>',
            page, 'description', flags = re.DOTALL)
        poster = self._search_regex(
            r'</span>[\s.]*</div>[\s.]*<img src="(.*?)"',
            page, 'thumbnail')

        return {
            'id': details['id'],
            # The last entry of the stream list is the one that gets used.
            'url': details['htmlStreams'][-1]['file'],
            # NOTE(review): container assumed to be mp4 -- confirm.
            'ext': 'mp4',
            'title': title,
            'thumbnail': poster,
            'description': summary,
        }
class MySpassIE(InfoExtractor):
    """Information extractor for myspass.de, based on the site's XML metadata."""
    # Dots are escaped so that only the literal host name is accepted.
    _VALID_URL = r'http://www\.myspass\.de/.*'

    def _real_extract(self, url):
        """Return a one-element list with the info dict for the video at *url*.

        Raises ExtractorError when the metadata document carries no download
        URL or no title.
        """
        META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'

        # video id is the last path element of the URL
        # usually there is a trailing slash, so also try the second but last
        url_path = compat_urllib_parse_urlparse(url).path
        url_parent_path, video_id = os.path.split(url_path)
        if not video_id:
            _, video_id = os.path.split(url_parent_path)

        # get metadata
        metadata_url = META_DATA_URL_TEMPLATE % video_id
        metadata_text = self._download_webpage(metadata_url, video_id)
        metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))

        # extract values from metadata; the download URL and title are mandatory
        url_flv_el = metadata.find('url_flv')
        if url_flv_el is None:
            raise ExtractorError(u'Unable to extract download url')
        video_url = url_flv_el.text
        extension = os.path.splitext(video_url)[1][1:]

        title_el = metadata.find('title')
        if title_el is None:
            raise ExtractorError(u'Unable to extract title')
        title = title_el.text

        # Without an explicit format id, fall back to the file extension
        # computed above (a name that is guaranteed to be defined here).
        format_id_el = metadata.find('format_id')
        video_format = extension if format_id_el is None else format_id_el.text

        # Description and preview image are optional.
        description_el = metadata.find('description')
        description = None if description_el is None else description_el.text
        image_preview_el = metadata.find('imagePreview')
        thumbnail = None if image_preview_el is None else image_preview_el.text

        return [{
            'id': video_id,
            'url': video_url,
            'title': title,
            'ext': extension,
            'format': video_format,
            'thumbnail': thumbnail,
            'description': description
        }]
class SpiegelIE(InfoExtractor):
    """Information extractor for spiegel.de video pages."""
    _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'

    def _real_extract(self, url):
        """Return a one-element list with the info dict for the video at *url*.

        The title comes from the HTML page; file name and duration come from
        the per-video XML document on video2.spiegel.de.
        """
        m = re.match(self._VALID_URL, url)
        video_id = m.group('videoID')

        webpage = self._download_webpage(url, video_id)

        video_title = self._html_search_regex(r'<div class="module-title">(.*?)</div>',
            webpage, u'title')

        xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
        xml_code = self._download_webpage(xml_url, video_id,
            note=u'Downloading XML', errnote=u'Failed to download XML')

        # Hand the parser bytes rather than decoded text, as MySpassIE does;
        # decoded text with non-ASCII characters is not parsed reliably on
        # Python 2.
        idoc = xml.etree.ElementTree.fromstring(xml_code.encode('utf-8'))
        # The last child of the root element is the variant that gets used.
        last_type = idoc[-1]
        filename = last_type.findall('./filename')[0].text
        duration = float(last_type.findall('./duration')[0].text)

        video_url = 'http://video2.spiegel.de/flash/' + filename
        # Everything after the final dot of the file name is the extension.
        video_ext = filename.rpartition('.')[2]

        return [{
            'id': video_id,
            'url': video_url,
            'ext': video_ext,
            'title': video_title,
            'duration': duration,
        }]
class LiveLeakIE(InfoExtractor):
    """Information extractor for liveleak.com view pages."""

    # The scheme is optional; 'https?' accepts both http and https (the
    # previous 'http?' made the 'p' optional instead and rejected https).
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
    IE_NAME = u'liveleak'

    def _real_extract(self, url):
        """Return a one-element list with the info dict for the video at *url*.

        Raises ExtractorError when *url* does not match ``_VALID_URL``.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('video_id')

        webpage = self._download_webpage(url, video_id)

        video_url = self._search_regex(r'file: "(.*?)",',
            webpage, u'video URL')

        # The og:title value carries a site prefix that is stripped here.
        video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
            webpage, u'title').replace('LiveLeak.com -', '').strip()

        video_description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
            webpage, u'description', fatal=False)

        video_uploader = self._html_search_regex(r'By:.*?(\w+)</a>',
            webpage, u'uploader', fatal=False)

        return [{
            'id': video_id,
            'url': video_url,
            # NOTE(review): container assumed to be mp4 -- confirm.
            'ext': 'mp4',
            'title': video_title,
            'description': video_description,
            'uploader': video_uploader
        }]
class TumblrIE(InfoExtractor):
    """Information extractor for videos embedded in tumblr.com posts."""
    _VALID_URL = r'http://(?P<blog_name>.*?)\.tumblr\.com/((post)|(video))/(?P<id>\d*)/(.*?)'

    def _real_extract(self, url):
        """Return a one-element list with the info dict for the post at *url*.

        Raises ExtractorError when no video source is found in the post page.
        """
        m_url = re.match(self._VALID_URL, url)
        video_id = m_url.group('id')
        blog = m_url.group('blog_name')

        # Both /post/ and /video/ links are normalised to the post page.
        url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id)
        webpage = self._download_webpage(url, video_id)

        # The player markup is embedded with its quotes written as \x22.
        re_video = r'src=\\x22(?P<video_url>http://%s\.tumblr\.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P<ext>.*?)\\x22' % (blog, video_id)
        video = re.search(re_video, webpage)
        if video is None:
            raise ExtractorError(u'Unable to extract video')
        video_url = video.group('video_url')
        ext = video.group('ext')

        # We pick the first poster.  The text between "posters" and the opening
        # bracket is matched without capturing, so the poster URL is the only
        # (and therefore first) group the search helper can return.
        video_thumbnail = self._search_regex(r'posters.*?\[\\x22(?P<thumb>.*?)\\x22',
            webpage, u'thumbnail', fatal=False)
        if video_thumbnail:
            video_thumbnail = video_thumbnail.replace('\\', '')

        # The only place where you can get a title, it's not complete,
        # but searching in other places doesn't work for all videos
        video_title = self._html_search_regex(r'<title>(?P<title>.*?)</title>',
            webpage, u'title', flags=re.DOTALL)

        return [{'id': video_id,
                 'url': video_url,
                 'title': video_title,
                 'thumbnail': video_thumbnail,
                 'ext': ext
                 }]
class BandcampIE(InfoExtractor):
    """Information extractor for freely downloadable bandcamp.com tracks."""
    _VALID_URL = r'http://.*?\.bandcamp\.com/track/(?P<title>.*)'

    def _real_extract(self, url):
        """Return a one-element list describing the mp3-320 download of a track.

        Raises ExtractorError when the track offers no free download page or
        when an expected fragment cannot be located in a fetched page.
        """
        mobj = re.match(self._VALID_URL, url)
        title = mobj.group('title')
        webpage = self._download_webpage(url, title)
        # We get the link to the free download page
        m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage)
        if m_download is None:
            raise ExtractorError(u'No free songs found')
        download_link = m_download.group(1)

        m_id = re.search(r'var TralbumData = {(.*?)id: (?P<id>\d*?)$',
                         webpage, re.MULTILINE|re.DOTALL)
        if m_id is None:
            raise ExtractorError(u'Unable to extract track id')
        track_id = m_id.group('id')

        download_webpage = self._download_webpage(download_link, track_id,
                                                  'Downloading free downloads page')
        # We get the dictionary of the track from some javascript code
        m_items = re.search(r'items: (.*?),$', download_webpage, re.MULTILINE)
        if m_items is None:
            raise ExtractorError(u'Unable to extract track information')
        info = json.loads(m_items.group(1))[0]
        # We pick mp3-320 for now, until format selection can be easily implemented.
        mp3_info = info[u'downloads'][u'mp3-320']
        # If we try to use this url it says the link has expired
        initial_url = mp3_info[u'url']
        re_url = r'(?P<server>http://(.*?)\.bandcamp\.com)/download/track\?enc=mp3-320&fsig=(?P<fsig>.*?)&id=(?P<id>.*?)&ts=(?P<ts>.*)$'
        m_url = re.match(re_url, initial_url)
        if m_url is None:
            raise ExtractorError(u'Unexpected download url: %s' % initial_url)
        # We build the url we will use to get the final track url
        # This url is build in Bandcamp in the script download_bunde_*.js
        request_url = '%s/statdownload/track?enc=mp3-320&fsig=%s&id=%s&ts=%s&.rand=665028774616&.vrs=1' % (
            m_url.group('server'), m_url.group('fsig'), track_id, m_url.group('ts'))
        final_url_webpage = self._download_webpage(request_url, track_id, 'Requesting download url')
        # If we could correctly generate the .rand field the url would be
        # in the "download_url" key
        m_final = re.search(r'"retry_url":"(.*?)"', final_url_webpage)
        if m_final is None:
            raise ExtractorError(u'Unable to extract final download url')
        final_url = m_final.group(1)

        track_info = {'id': track_id,
                      'title': info[u'title'],
                      'ext': 'mp3',
                      'url': final_url,
                      'thumbnail': info[u'thumb_url'],
                      'uploader': info[u'artist']
                      }

        return [track_info]
class RedTubeIE(InfoExtractor):
    """Information Extractor for redtube"""
    _VALID_URL = r'(?:http://)?(?:www\.)?redtube\.com/(?P<id>[0-9]+)'

    def _real_extract(self,url):
        """Return a one-element list with the info dict for the video at *url*."""
        url_match = re.match(self._VALID_URL, url)
        if url_match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        clip_id = url_match.group('id')

        page = self._download_webpage(url, clip_id)
        self.report_extraction(clip_id)

        # The mp4 <source> element of the player carries the media location.
        media_url = self._html_search_regex(
            r'<source src="(.+?)" type="video/mp4">',
            page, u'video URL')
        heading = self._html_search_regex(
            '<h1 class="videoTitle slidePanelMovable">(.+?)</h1>',
            page, u'title')

        return [{
            'id': clip_id,
            'url': media_url,
            'ext': 'mp4',
            'title': heading,
        }]
class InaIE(InfoExtractor):
    """Information Extractor for Ina.fr"""
    _VALID_URL = r'(?:http://)?(?:www\.)?ina\.fr/video/(?P<id>I[0-9]+)/.*'

    def _real_extract(self,url):
        """Return a one-element list with the info dict for the video at *url*.

        All data is read from the MRSS document published for the notice id.
        """
        mobj = re.match(self._VALID_URL, url)

        video_id = mobj.group('id')
        mrss_url = 'http://player.ina.fr/notices/%s.mrss' % video_id
        video_extension = 'mp4'
        webpage = self._download_webpage(mrss_url, video_id)

        self.report_extraction(video_id)

        # Host dots are escaped so that only the literal mp4.ina.fr matches.
        video_url = self._html_search_regex(r'<media:player url="(?P<mp4url>http://mp4\.ina\.fr/[^"]+\.mp4)',
            webpage, u'video URL')

        video_title = self._search_regex(r'<title><!\[CDATA\[(?P<titre>.*?)]]></title>',
            webpage, u'title')

        return [{
            'id': video_id,
            'url': video_url,
            'ext': video_extension,
            'title': video_title,
        }]
class HowcastIE(InfoExtractor):
    """Information Extractor for Howcast.com"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?howcast\.com/videos/(?P<id>\d+)'

    def _real_extract(self, url):
        """Return a one-element list with the info dict for the video at *url*."""
        clip_id = re.match(self._VALID_URL, url).group('id')

        # The canonical page is rebuilt from the id rather than reusing *url*.
        page = self._download_webpage('http://www.howcast.com/videos/' + clip_id, clip_id)
        self.report_extraction(clip_id)

        media_url = self._search_regex(
            r'\'?file\'?: "(http://mobile-media\.howcast\.com/[0-9]+\.mp4)',
            page, u'video URL')

        # The meta content may be quoted either way, hence the two alternatives.
        title = self._html_search_regex(
            r'<meta content=(?:"([^"]+)"|\'([^\']+)\') property=\'og:title\'',
            page, u'title')
        summary = self._html_search_regex(
            r'<meta content=(?:"([^"]+)"|\'([^\']+)\') name=\'description\'',
            page, u'description', fatal=False)
        poster = self._html_search_regex(
            r'<meta content=\'(.+?)\' property=\'og:image\'',
            page, u'thumbnail', fatal=False)

        return [{
            'id': clip_id,
            'url': media_url,
            'ext': 'mp4',
            'title': title,
            'description': summary,
            'thumbnail': poster,
        }]
class VineIE(InfoExtractor):
    """Information Extractor for Vine.co"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?vine\.co/v/(?P<id>\w+)'

    def _real_extract(self, url):
        """Return a one-element list with the info dict for the vine at *url*."""
        vine_id = re.match(self._VALID_URL, url).group('id')

        # Always fetch the https page built from the id.
        page = self._download_webpage('https://vine.co/v/' + vine_id, vine_id)
        self.report_extraction(vine_id)

        stream_url = self._html_search_regex(
            r'<meta property="twitter:player:stream" content="(.+?)"',
            page, u'video URL')
        title = self._html_search_regex(
            r'<meta property="og:title" content="(.+?)"',
            page, u'title')
        # The optional second group keeps any query string out of the result.
        poster = self._html_search_regex(
            r'<meta property="og:image" content="(.+?)(\?.*?)?"',
            page, u'thumbnail', fatal=False)
        author = self._html_search_regex(
            r'<div class="user">.*?<h2>(.+?)</h2>',
            page, u'uploader', fatal=False, flags=re.DOTALL)

        return [{
            'id': vine_id,
            'url': stream_url,
            # NOTE(review): container assumed to be mp4 -- confirm.
            'ext': 'mp4',
            'title': title,
            'thumbnail': poster,
            'uploader': author,
        }]
class FlickrIE(InfoExtractor):
    """Information Extractor for Flickr videos"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?flickr\.com/photos/(?P<uploader_id>[\w\-_@]+)/(?P<id>\d+).*'

    def _real_extract(self, url):
        """Return a one-element list with the info dict for the video at *url*.

        Raises ExtractorError when the playlist document has no stream entry.
        """
        url_match = re.match(self._VALID_URL, url)
        photo_id = url_match.group('id')
        owner_id = url_match.group('uploader_id')

        page = self._download_webpage(
            'http://www.flickr.com/photos/' + owner_id + '/' + photo_id, photo_id)

        # The page secret is needed by both follow-up requests.
        secret = self._search_regex(r"photo_secret: '(\w+)'", page, u'secret')

        # Step 1: resolve the node id of the video.
        first_xml = self._download_webpage(
            'https://secure.flickr.com/apps/video/video_mtl_xml.gne?v=x&photo_id=' + photo_id + '&secret=' + secret + '&bitrate=700&target=_self',
            photo_id, 'Downloading first data webpage')
        node_id = self._html_search_regex(
            r'<Item id="id">(\d+-\d+)</Item>',
            first_xml, u'node_id')

        # Step 2: fetch the playlist that names the actual stream.
        second_xml = self._download_webpage(
            'https://secure.flickr.com/video_playlist.gne?node_id=' + node_id + '&tech=flash&mode=playlist&bitrate=700&secret=' + secret + '&rd=video.yahoo.com&noad=1',
            photo_id, 'Downloading second data webpage')

        self.report_extraction(photo_id)

        stream = re.search(r'<STREAM APP="(.+?)" FULLPATH="(.+?)"', second_xml)
        if stream is None:
            raise ExtractorError(u'Unable to extract video url')
        app, full_path = stream.group(1), stream.group(2)
        media_url = app + unescapeHTML(full_path)

        title = self._html_search_regex(
            r'<meta property="og:title" content=(?:"([^"]+)"|\'([^\']+)\')',
            page, u'video title')
        summary = self._html_search_regex(
            r'<meta property="og:description" content=(?:"([^"]+)"|\'([^\']+)\')',
            page, u'description', fatal=False)
        poster = self._html_search_regex(
            r'<meta property="og:image" content=(?:"([^"]+)"|\'([^\']+)\')',
            page, u'thumbnail', fatal=False)

        return [{
            'id': photo_id,
            'url': media_url,
            # NOTE(review): container assumed to be mp4 -- confirm.
            'ext': 'mp4',
            'title': title,
            'description': summary,
            'thumbnail': poster,
            'uploader_id': owner_id,
        }]
class TeamcocoIE(InfoExtractor):
    """Information extractor for teamcoco.com video pages."""
    _VALID_URL = r'http://teamcoco\.com/video/(?P<url_title>.*)'

    def _real_extract(self, url):
        """Return a one-element list with the info dict for the video at *url*.

        Raises ExtractorError when *url* does not match ``_VALID_URL``.
        """
        url_match = re.match(self._VALID_URL, url)
        if url_match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        slug = url_match.group('url_title')
        page = self._download_webpage(url, slug)

        # The numeric id is only available inside the page markup.
        clip_id = self._html_search_regex(
            r'<article class="video" data-id="(\d+?)"',
            page, u'video id')

        self.report_extraction(clip_id)

        title = self._html_search_regex(
            r'<meta property="og:title" content="(.+?)"',
            page, u'title')
        poster = self._html_search_regex(
            r'<meta property="og:image" content="(.+?)"',
            page, u'thumbnail', fatal=False)
        summary = self._html_search_regex(
            r'<meta property="og:description" content="(.*?)"',
            page, u'description', fatal=False)

        # The media location is listed in a separate per-video XML document.
        cvp_xml = self._download_webpage(
            'http://teamcoco.com/cvp/2.0/%s.xml' % clip_id,
            clip_id, 'Downloading data webpage')
        media_url = self._html_search_regex(
            r'<file type="high".*?>(.*?)</file>',
            cvp_xml, u'video URL')

        return [{
            'id': clip_id,
            'url': media_url,
            # NOTE(review): container assumed to be mp4 -- confirm.
            'ext': 'mp4',
            'title': title,
            'thumbnail': poster,
            'description': summary,
        }]
class XHamsterIE(InfoExtractor):
    """Information Extractor for xHamster"""
    # The dot after the optional "www" is escaped so it only matches a dot.
    _VALID_URL = r'(?:http://)?(?:www\.)?xhamster\.com/movies/(?P<id>[0-9]+)/.*\.html'

    def _real_extract(self,url):
        """Return a one-element list with the info dict for the video at *url*.

        Raises ExtractorError when the player configuration cannot be found.
        A missing upload date only produces a warning.
        """
        mobj = re.match(self._VALID_URL, url)

        video_id = mobj.group('id')
        mrss_url = 'http://xhamster.com/movies/%s/.html' % video_id
        webpage = self._download_webpage(mrss_url, video_id)

        mobj = re.search(r'\'srv\': \'(?P<server>[^\']*)\',\s*\'file\': \'(?P<file>[^\']+)\',', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract media URL')
        if not mobj.group('server'):
            # With an empty server entry, 'file' holds the complete quoted URL.
            video_url = compat_urllib_parse.unquote(mobj.group('file'))
        else:
            video_url = mobj.group('server')+'/key='+mobj.group('file')
        video_extension = video_url.split('.')[-1]

        video_title = self._html_search_regex(r'<title>(?P<title>.+?) - xHamster\.com</title>',
            webpage, u'title')

        # The description is not visible anywhere in the UI, so it is not extracted.

        mobj = re.search(r'hint=\'(?P<upload_date_Y>[0-9]{4})-(?P<upload_date_m>[0-9]{2})-(?P<upload_date_d>[0-9]{2}) [0-9]{2}:[0-9]{2}:[0-9]{2} [A-Z]{3,4}\'', webpage)
        if mobj:
            # Year, month and day are concatenated without separators.
            video_upload_date = mobj.group('upload_date_Y')+mobj.group('upload_date_m')+mobj.group('upload_date_d')
        else:
            video_upload_date = None
            self._downloader.report_warning(u'Unable to extract upload date')

        video_uploader_id = self._html_search_regex(r'<a href=\'/user/[^>]+>(?P<uploader_id>[^<]+)',
            webpage, u'uploader id', default=u'anonymous')

        video_thumbnail = self._search_regex(r'\'image\':\'(?P<thumbnail>[^\']+)\'',
            webpage, u'thumbnail', fatal=False)

        return [{
            'id': video_id,
            'url': video_url,
            'ext': video_extension,
            'title': video_title,
            'upload_date': video_upload_date,
            'uploader_id': video_uploader_id,
            'thumbnail': video_thumbnail
        }]
class HypemIE(InfoExtractor):
    """Information Extractor for hypem"""
    _VALID_URL = r'(?:http://)?(?:www\.)?hypem\.com/track/([^/]+)/([^/]+)'

    def _real_extract(self, url):
        """Return a one-element list with the info dict for the track at *url*.

        Raises ExtractorError for a non-matching URL or when either of the two
        JSON payloads cannot be decoded.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        track_id = mobj.group(1)

        data = {'ax': 1, 'ts': time.time()}
        data_encoded = compat_urllib_parse.urlencode(data)
        complete_url = url + "?" + data_encoded
        request = compat_urllib_request.Request(complete_url)
        response, urlh = self._download_webpage_handle(request, track_id, u'Downloading webpage with the url')
        # The cookie set by the track page is replayed on the serve request below.
        cookie = urlh.headers.get('Set-Cookie', '')

        self.report_extraction(track_id)

        html_tracks = self._html_search_regex(r'<script type="application/json" id="displayList-data">(.*?)</script>',
            response, u'tracks', flags=re.MULTILINE|re.DOTALL).strip()
        try:
            track_list = json.loads(html_tracks)
            track = track_list[u'tracks'][0]
        except ValueError:
            raise ExtractorError(u'Hypemachine contained invalid JSON.')

        key = track[u"key"]
        track_id = track[u"id"]
        artist = track[u"artist"]
        title = track[u"song"]

        serve_url = "http://hypem.com/serve/source/%s/%s" % (compat_str(track_id), compat_str(key))
        # An empty body still turns the request into a POST; it has to be a
        # bytes literal because Python 3 rejects text as request data.
        request = compat_urllib_request.Request(serve_url, b"", {'Content-Type': 'application/json'})
        request.add_header('cookie', cookie)
        song_data_json = self._download_webpage(request, track_id, u'Downloading metadata')
        try:
            song_data = json.loads(song_data_json)
        except ValueError:
            raise ExtractorError(u'Hypemachine contained invalid JSON.')
        final_url = song_data[u"url"]

        return [{
            'id': track_id,
            'url': final_url,
            'ext': "mp3",
            'title': title,
            'artist': artist,
        }]
class Vbox7IE(InfoExtractor):
    """Information Extractor for Vbox7"""
    _VALID_URL = r'(?:http://)?(?:www\.)?vbox7\.com/play:([^/]+)'

    def _real_extract(self,url):
        """Return a one-element list with the info dict for the video at *url*.

        Raises ExtractorError for a non-matching URL or when the info request
        yields no response.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group(1)

        # The first page only contains a script-driven redirect; follow it by hand.
        redirect_page, urlh = self._download_webpage_handle(url, video_id)
        new_location = self._search_regex(r'window\.location = \'(.*)\';', redirect_page, u'redirect location')
        redirect_url = urlh.geturl() + new_location
        webpage = self._download_webpage(redirect_url, video_id, u'Downloading redirect page')

        # Only the part of the page title before the first slash is kept.
        title = self._html_search_regex(r'<title>(.*)</title>',
            webpage, u'title').split('/')[0].strip()

        # NOTE(review): container assumed to be flv -- confirm.
        ext = "flv"
        info_url = "http://vbox7.com/play/magare.do"
        # POST data has to be bytes on Python 3, so the form is encoded explicitly.
        data = compat_urllib_parse.urlencode({'as3':'1','vid':video_id}).encode('utf-8')
        info_request = compat_urllib_request.Request(info_url, data)
        info_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
        info_response = self._download_webpage(info_request, video_id, u'Downloading info webpage')
        if info_response is None:
            raise ExtractorError(u'Unable to extract the media url')
        # The response is unpacked as two '&'-separated key=value pairs; split
        # on the first '=' only, so a value containing '=' stays intact.
        (final_url, thumbnail_url) = [pair.split('=', 1)[1] for pair in info_response.split('&')]

        return [{
            'id': video_id,
            'url': final_url,
            'ext': ext,
            'title': title,
            'thumbnail': thumbnail_url,
        }]
1728 def gen_extractors():
1729 """ Return a list of an instance of every supported extractor.
1730 The order does matter; the first extractor matched is the one handling the URL.
1733 YoutubePlaylistIE(),
1758 StanfordOpenClassroomIE(),
1768 WorldStarHipHopIE(),
def get_info_extractor(ie_name):
    """Look up the extractor class named ``<ie_name>IE`` in this module's globals.

    Raises KeyError when no global of that name exists.
    """
    class_name = ie_name + 'IE'
    module_namespace = globals()
    return module_namespace[class_name]