10 import xml.etree.ElementTree
19 from .extractor.common import InfoExtractor, SearchInfoExtractor
21 from .extractor.ard import ARDIE
22 from .extractor.arte import ArteTvIE
23 from .extractor.bliptv import BlipTVIE, BlipTVUserIE
24 from .extractor.comedycentral import ComedyCentralIE
25 from .extractor.collegehumor import CollegeHumorIE
26 from .extractor.dailymotion import DailymotionIE
27 from .extractor.depositfiles import DepositFilesIE
28 from .extractor.escapist import EscapistIE
29 from .extractor.facebook import FacebookIE
30 from .extractor.gametrailers import GametrailersIE
31 from .extractor.generic import GenericIE
32 from .extractor.googleplus import GooglePlusIE
33 from .extractor.googlesearch import GoogleSearchIE
34 from .extractor.metacafe import MetacafeIE
35 from .extractor.myvideo import MyVideoIE
36 from .extractor.statigram import StatigramIE
37 from .extractor.photobucket import PhotobucketIE
38 from .extractor.soundcloud import SoundcloudIE, SoundcloudSetIE
39 from .extractor.vimeo import VimeoIE
40 from .extractor.yahoo import YahooIE, YahooSearchIE
41 from .extractor.youtube import YoutubeIE, YoutubePlaylistIE, YoutubeSearchIE, YoutubeUserIE, YoutubeChannelIE
42 from .extractor.zdf import ZDFIE
45 class XVideosIE(InfoExtractor):
46 """Information extractor for xvideos.com"""
48 _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
51 def _real_extract(self, url):
52 mobj = re.match(self._VALID_URL, url)
54 raise ExtractorError(u'Invalid URL: %s' % url)
55 video_id = mobj.group(1)
57 webpage = self._download_webpage(url, video_id)
59 self.report_extraction(video_id)
62 video_url = compat_urllib_parse.unquote(self._search_regex(r'flv_url=(.+?)&',
63 webpage, u'video URL'))
66 video_title = self._html_search_regex(r'<title>(.*?)\s+-\s+XVID',
69 # Extract video thumbnail
70 video_thumbnail = self._search_regex(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)',
71 webpage, u'thumbnail', fatal=False)
80 'thumbnail': video_thumbnail,
89 class InfoQIE(InfoExtractor):
90 """Information extractor for infoq.com"""
91 _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
93 def _real_extract(self, url):
94 mobj = re.match(self._VALID_URL, url)
96 raise ExtractorError(u'Invalid URL: %s' % url)
98 webpage = self._download_webpage(url, video_id=url)
99 self.report_extraction(url)
102 mobj = re.search(r"jsclassref ?= ?'([^']*)'", webpage)
104 raise ExtractorError(u'Unable to extract video url')
105 real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
106 video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id
109 video_title = self._search_regex(r'contentTitle = "(.*?)";',
112 # Extract description
113 video_description = self._html_search_regex(r'<meta name="description" content="(.*)"(?:\s*/)?>',
114 webpage, u'description', fatal=False)
116 video_filename = video_url.split('/')[-1]
117 video_id, extension = video_filename.split('.')
124 'title': video_title,
125 'ext': extension, # Extension is always(?) mp4, but seems to be flv
127 'description': video_description,
132 class MixcloudIE(InfoExtractor):
133 """Information extractor for www.mixcloud.com"""
135 _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
136 _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
137 IE_NAME = u'mixcloud'
def report_download_json(self, file_id):
    """Announce on screen that the JSON document is being downloaded.

    file_id is accepted for interface compatibility; the message itself
    is fixed and does not include it.
    """
    notice = u'Downloading json'
    self.to_screen(notice)
143 def get_urls(self, jsonData, fmt, bitrate='best'):
144 """Get urls from 'audio_formats' section in json"""
147 bitrate_list = jsonData[fmt]
148 if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
149 bitrate = max(bitrate_list) # select highest
151 url_list = jsonData[fmt][bitrate]
152 except TypeError: # we have no bitrate info.
153 url_list = jsonData[fmt]
156 def check_urls(self, url_list):
157 """Returns 1st active url from list"""
160 compat_urllib_request.urlopen(url)
162 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
167 def _print_formats(self, formats):
168 print('Available formats:')
169 for fmt in formats.keys():
170 for b in formats[fmt]:
172 ext = formats[fmt][b][0]
173 print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
174 except TypeError: # we have no bitrate info
175 ext = formats[fmt][0]
176 print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
179 def _real_extract(self, url):
180 mobj = re.match(self._VALID_URL, url)
182 raise ExtractorError(u'Invalid URL: %s' % url)
183 # extract uploader & filename from url
184 uploader = mobj.group(1).decode('utf-8')
185 file_id = uploader + "-" + mobj.group(2).decode('utf-8')
187 # construct API request
188 file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
189 # retrieve .json file with links to files
190 request = compat_urllib_request.Request(file_url)
192 self.report_download_json(file_url)
193 jsonData = compat_urllib_request.urlopen(request).read()
194 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
195 raise ExtractorError(u'Unable to retrieve file: %s' % compat_str(err))
198 json_data = json.loads(jsonData)
199 player_url = json_data['player_swf_url']
200 formats = dict(json_data['audio_formats'])
202 req_format = self._downloader.params.get('format', None)
205 if self._downloader.params.get('listformats', None):
206 self._print_formats(formats)
209 if req_format is None or req_format == 'best':
210 for format_param in formats.keys():
211 url_list = self.get_urls(formats, format_param)
213 file_url = self.check_urls(url_list)
214 if file_url is not None:
217 if req_format not in formats:
218 raise ExtractorError(u'Format is not available')
220 url_list = self.get_urls(formats, req_format)
221 file_url = self.check_urls(url_list)
222 format_param = req_format
225 'id': file_id.decode('utf-8'),
226 'url': file_url.decode('utf-8'),
227 'uploader': uploader.decode('utf-8'),
229 'title': json_data['name'],
230 'ext': file_url.split('.')[-1].decode('utf-8'),
231 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
232 'thumbnail': json_data['thumbnail_url'],
233 'description': json_data['description'],
234 'player_url': player_url.decode('utf-8'),
237 class StanfordOpenClassroomIE(InfoExtractor):
238 """Information extractor for Stanford's Open ClassRoom"""
240 _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
241 IE_NAME = u'stanfordoc'
243 def _real_extract(self, url):
244 mobj = re.match(self._VALID_URL, url)
246 raise ExtractorError(u'Invalid URL: %s' % url)
248 if mobj.group('course') and mobj.group('video'): # A specific video
249 course = mobj.group('course')
250 video = mobj.group('video')
252 'id': course + '_' + video,
257 self.report_extraction(info['id'])
258 baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
259 xmlUrl = baseUrl + video + '.xml'
261 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
262 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
263 raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
264 mdoc = xml.etree.ElementTree.fromstring(metaXml)
266 info['title'] = mdoc.findall('./title')[0].text
267 info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
269 raise ExtractorError(u'Invalid metadata XML file')
270 info['ext'] = info['url'].rpartition('.')[2]
272 elif mobj.group('course'): # A course page
273 course = mobj.group('course')
281 coursepage = self._download_webpage(url, info['id'],
282 note='Downloading course info page',
283 errnote='Unable to download course info page')
285 info['title'] = self._html_search_regex('<h1>([^<]+)</h1>', coursepage, 'title', default=info['id'])
287 info['description'] = self._html_search_regex('<description>([^<]+)</description>',
288 coursepage, u'description', fatal=False)
290 links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
294 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
298 for entry in info['list']:
299 assert entry['type'] == 'reference'
300 results += self.extract(entry['url'])
304 'id': 'Stanford OpenClassroom',
310 self.report_download_webpage(info['id'])
311 rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
313 rootpage = compat_urllib_request.urlopen(rootURL).read()
314 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
315 raise ExtractorError(u'Unable to download course info page: ' + compat_str(err))
317 info['title'] = info['id']
319 links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
323 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
328 for entry in info['list']:
329 assert entry['type'] == 'reference'
330 results += self.extract(entry['url'])
333 class MTVIE(InfoExtractor):
334 """Information extractor for MTV.com"""
336 _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
339 def _real_extract(self, url):
340 mobj = re.match(self._VALID_URL, url)
342 raise ExtractorError(u'Invalid URL: %s' % url)
343 if not mobj.group('proto'):
344 url = 'http://' + url
345 video_id = mobj.group('videoid')
347 webpage = self._download_webpage(url, video_id)
349 song_name = self._html_search_regex(r'<meta name="mtv_vt" content="([^"]+)"/>',
350 webpage, u'song name', fatal=False)
352 video_title = self._html_search_regex(r'<meta name="mtv_an" content="([^"]+)"/>',
355 mtvn_uri = self._html_search_regex(r'<meta name="mtvn_uri" content="([^"]+)"/>',
356 webpage, u'mtvn_uri', fatal=False)
358 content_id = self._search_regex(r'MTVN.Player.defaultPlaylistId = ([0-9]+);',
359 webpage, u'content id', fatal=False)
361 videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
362 self.report_extraction(video_id)
363 request = compat_urllib_request.Request(videogen_url)
365 metadataXml = compat_urllib_request.urlopen(request).read()
366 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
367 raise ExtractorError(u'Unable to download video metadata: %s' % compat_str(err))
369 mdoc = xml.etree.ElementTree.fromstring(metadataXml)
370 renditions = mdoc.findall('.//rendition')
372 # For now, always pick the highest quality.
373 rendition = renditions[-1]
376 _,_,ext = rendition.attrib['type'].partition('/')
377 format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
378 video_url = rendition.find('./src').text
380 raise ExtractorError('Invalid rendition field.')
385 'uploader': performer,
387 'title': video_title,
395 class YoukuIE(InfoExtractor):
396 _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'
399 nowTime = int(time.time() * 1000)
400 random1 = random.randint(1000,1998)
401 random2 = random.randint(1000,9999)
403 return "%d%d%d" %(nowTime,random1,random2)
405 def _get_file_ID_mix_string(self, seed):
407 source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
409 for i in range(len(source)):
410 seed = (seed * 211 + 30031 ) % 65536
411 index = math.floor(seed / 65536 * len(source) )
412 mixed.append(source[int(index)])
413 source.remove(source[int(index)])
414 #return ''.join(mixed)
417 def _get_file_id(self, fileId, seed):
418 mixed = self._get_file_ID_mix_string(seed)
419 ids = fileId.split('*')
423 realId.append(mixed[int(ch)])
424 return ''.join(realId)
426 def _real_extract(self, url):
427 mobj = re.match(self._VALID_URL, url)
429 raise ExtractorError(u'Invalid URL: %s' % url)
430 video_id = mobj.group('ID')
432 info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id
434 jsondata = self._download_webpage(info_url, video_id)
436 self.report_extraction(video_id)
438 config = json.loads(jsondata)
440 video_title = config['data'][0]['title']
441 seed = config['data'][0]['seed']
443 format = self._downloader.params.get('format', None)
444 supported_format = list(config['data'][0]['streamfileids'].keys())
446 if format is None or format == 'best':
447 if 'hd2' in supported_format:
452 elif format == 'worst':
460 fileid = config['data'][0]['streamfileids'][format]
461 keys = [s['k'] for s in config['data'][0]['segs'][format]]
462 except (UnicodeDecodeError, ValueError, KeyError):
463 raise ExtractorError(u'Unable to extract info section')
466 sid = self._gen_sid()
467 fileid = self._get_file_id(fileid, seed)
469 #column 8,9 of fileid represent the segment number
470 #fileid[7:9] should be changed
471 for index, key in enumerate(keys):
473 temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
474 download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)
477 'id': '%s_part%02d' % (video_id, index),
481 'title': video_title,
484 files_info.append(info)
489 class XNXXIE(InfoExtractor):
490 """Information extractor for xnxx.com"""
492 _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
494 VIDEO_URL_RE = r'flv_url=(.*?)&'
495 VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
496 VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&'
498 def _real_extract(self, url):
499 mobj = re.match(self._VALID_URL, url)
501 raise ExtractorError(u'Invalid URL: %s' % url)
502 video_id = mobj.group(1)
504 # Get webpage content
505 webpage = self._download_webpage(url, video_id)
507 video_url = self._search_regex(self.VIDEO_URL_RE,
508 webpage, u'video URL')
509 video_url = compat_urllib_parse.unquote(video_url)
511 video_title = self._html_search_regex(self.VIDEO_TITLE_RE,
514 video_thumbnail = self._search_regex(self.VIDEO_THUMB_RE,
515 webpage, u'thumbnail', fatal=False)
522 'title': video_title,
524 'thumbnail': video_thumbnail,
530 class NBAIE(InfoExtractor):
531 _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*?)(?:/index\.html)?(?:\?.*)?$'
534 def _real_extract(self, url):
535 mobj = re.match(self._VALID_URL, url)
537 raise ExtractorError(u'Invalid URL: %s' % url)
539 video_id = mobj.group(1)
541 webpage = self._download_webpage(url, video_id)
543 video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'
545 shortened_video_id = video_id.rpartition('/')[2]
546 title = self._html_search_regex(r'<meta property="og:title" content="(.*?)"',
547 webpage, 'title', default=shortened_video_id).replace('NBA.com: ', '')
549 # It isn't there in the HTML it returns to us
550 # uploader_date = self._html_search_regex(r'<b>Date:</b> (.*?)</div>', webpage, 'upload_date', fatal=False)
552 description = self._html_search_regex(r'<meta name="description" (?:content|value)="(.*?)" />', webpage, 'description', fatal=False)
555 'id': shortened_video_id,
559 # 'uploader_date': uploader_date,
560 'description': description,
564 class JustinTVIE(InfoExtractor):
565 """Information extractor for justin.tv and twitch.tv"""
566 # TODO: One broadcast may be split into multiple videos. The key
567 # 'broadcast_id' is the same for all parts, and 'broadcast_part'
568 # starts at 1 and increases. Can we treat all parts as one video?
570 _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
572 (?P<channelid>[^/]+)|
573 (?:(?:[^/]+)/b/(?P<videoid>[^/]+))|
574 (?:(?:[^/]+)/c/(?P<chapterid>[^/]+))
578 _JUSTIN_PAGE_LIMIT = 100
579 IE_NAME = u'justin.tv'
def report_download_page(self, channel, offset):
    """Announce on screen which slice of videos is about to be fetched.

    The reported range starts at offset and ends at
    offset + self._JUSTIN_PAGE_LIMIT.
    """
    upper_bound = offset + self._JUSTIN_PAGE_LIMIT
    template = u'%s: Downloading video information from %d to %d'
    self.to_screen(template % (channel, offset, upper_bound))
586 # Return count of items, list of *valid* items
587 def _parse_page(self, url, video_id):
588 webpage = self._download_webpage(url, video_id,
589 u'Downloading video info JSON',
590 u'unable to download video info JSON')
592 response = json.loads(webpage)
593 if type(response) != list:
594 error_text = response.get('error', 'unknown error')
595 raise ExtractorError(u'Justin.tv API: %s' % error_text)
597 for clip in response:
598 video_url = clip['video_file_url']
600 video_extension = os.path.splitext(video_url)[1][1:]
601 video_date = re.sub('-', '', clip['start_time'][:10])
602 video_uploader_id = clip.get('user_id', clip.get('channel_id'))
603 video_id = clip['id']
604 video_title = clip.get('title', video_id)
608 'title': video_title,
609 'uploader': clip.get('channel_name', video_uploader_id),
610 'uploader_id': video_uploader_id,
611 'upload_date': video_date,
612 'ext': video_extension,
614 return (len(response), info)
616 def _real_extract(self, url):
617 mobj = re.match(self._VALID_URL, url)
619 raise ExtractorError(u'invalid URL: %s' % url)
621 api_base = 'http://api.justin.tv'
623 if mobj.group('channelid'):
625 video_id = mobj.group('channelid')
626 api = api_base + '/channel/archives/%s.json' % video_id
627 elif mobj.group('chapterid'):
628 chapter_id = mobj.group('chapterid')
630 webpage = self._download_webpage(url, chapter_id)
631 m = re.search(r'PP\.archive_id = "([0-9]+)";', webpage)
633 raise ExtractorError(u'Cannot find archive of a chapter')
634 archive_id = m.group(1)
636 api = api_base + '/broadcast/by_chapter/%s.xml' % chapter_id
637 chapter_info_xml = self._download_webpage(api, chapter_id,
638 note=u'Downloading chapter information',
639 errnote=u'Chapter information download failed')
640 doc = xml.etree.ElementTree.fromstring(chapter_info_xml)
641 for a in doc.findall('.//archive'):
642 if archive_id == a.find('./id').text:
645 raise ExtractorError(u'Could not find chapter in chapter information')
647 video_url = a.find('./video_file_url').text
648 video_ext = video_url.rpartition('.')[2] or u'flv'
650 chapter_api_url = u'https://api.twitch.tv/kraken/videos/c' + chapter_id
651 chapter_info_json = self._download_webpage(chapter_api_url, u'c' + chapter_id,
652 note='Downloading chapter metadata',
653 errnote='Download of chapter metadata failed')
654 chapter_info = json.loads(chapter_info_json)
656 bracket_start = int(doc.find('.//bracket_start').text)
657 bracket_end = int(doc.find('.//bracket_end').text)
659 # TODO determine start (and probably fix up file)
660 # youtube-dl -v http://www.twitch.tv/firmbelief/c/1757457
661 #video_url += u'?start=' + TODO:start_timestamp
662 # bracket_start is 13290, but we want 51670615
663 self._downloader.report_warning(u'Chapter detected, but we can just download the whole file. '
664 u'Chapter starts at %s and ends at %s' % (formatSeconds(bracket_start), formatSeconds(bracket_end)))
667 'id': u'c' + chapter_id,
670 'title': chapter_info['title'],
671 'thumbnail': chapter_info['preview'],
672 'description': chapter_info['description'],
673 'uploader': chapter_info['channel']['display_name'],
674 'uploader_id': chapter_info['channel']['name'],
678 video_id = mobj.group('videoid')
679 api = api_base + '/broadcast/by_archive/%s.json' % video_id
681 self.report_extraction(video_id)
685 limit = self._JUSTIN_PAGE_LIMIT
688 self.report_download_page(video_id, offset)
689 page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
690 page_count, page_info = self._parse_page(page_url, video_id)
691 info.extend(page_info)
692 if not paged or page_count != limit:
697 class FunnyOrDieIE(InfoExtractor):
698 _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'
700 def _real_extract(self, url):
701 mobj = re.match(self._VALID_URL, url)
703 raise ExtractorError(u'invalid URL: %s' % url)
705 video_id = mobj.group('id')
706 webpage = self._download_webpage(url, video_id)
708 video_url = self._html_search_regex(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"',
709 webpage, u'video URL', flags=re.DOTALL)
711 title = self._html_search_regex((r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>",
712 r'<title>(?P<title>[^<]+?)</title>'), webpage, 'title', flags=re.DOTALL)
714 video_description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
715 webpage, u'description', fatal=False, flags=re.DOTALL)
722 'description': video_description,
726 class SteamIE(InfoExtractor):
727 _VALID_URL = r"""http://store\.steampowered\.com/
729 (?P<urltype>video|app)/ #If the page is only for videos or for a game
731 (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
733 _VIDEO_PAGE_TEMPLATE = 'http://store.steampowered.com/video/%s/'
734 _AGECHECK_TEMPLATE = 'http://store.steampowered.com/agecheck/video/%s/?snr=1_agecheck_agecheck__age-gate&ageDay=1&ageMonth=January&ageYear=1970'
def suitable(cls, url):
    """Return True when url matches this extractor's _VALID_URL."""
    # The pattern is written in multi-line form with inline comments,
    # so it has to be matched with re.VERBOSE.
    matched = re.match(cls._VALID_URL, url, re.VERBOSE)
    return matched is not None
741 def _real_extract(self, url):
742 m = re.match(self._VALID_URL, url, re.VERBOSE)
743 gameID = m.group('gameID')
745 videourl = self._VIDEO_PAGE_TEMPLATE % gameID
746 webpage = self._download_webpage(videourl, gameID)
748 if re.search('<h2>Please enter your birth date to continue:</h2>', webpage) is not None:
749 videourl = self._AGECHECK_TEMPLATE % gameID
750 self.report_age_confirmation()
751 webpage = self._download_webpage(videourl, gameID)
753 self.report_extraction(gameID)
754 game_title = self._html_search_regex(r'<h2 class="pageheader">(.*?)</h2>',
755 webpage, 'game title')
757 urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
758 mweb = re.finditer(urlRE, webpage)
759 namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
760 titles = re.finditer(namesRE, webpage)
761 thumbsRE = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'
762 thumbs = re.finditer(thumbsRE, webpage)
764 for vid,vtitle,thumb in zip(mweb,titles,thumbs):
765 video_id = vid.group('videoID')
766 title = vtitle.group('videoName')
767 video_url = vid.group('videoURL')
768 video_thumb = thumb.group('thumbnail')
770 raise ExtractorError(u'Cannot find video url for %s' % video_id)
775 'title': unescapeHTML(title),
776 'thumbnail': video_thumb
779 return [self.playlist_result(videos, gameID, game_title)]
781 class UstreamIE(InfoExtractor):
782 _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
785 def _real_extract(self, url):
786 m = re.match(self._VALID_URL, url)
787 video_id = m.group('videoID')
789 video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
790 webpage = self._download_webpage(url, video_id)
792 self.report_extraction(video_id)
794 video_title = self._html_search_regex(r'data-title="(?P<title>.+)"',
797 uploader = self._html_search_regex(r'data-content-type="channel".*?>(?P<uploader>.*?)</a>',
798 webpage, u'uploader', fatal=False, flags=re.DOTALL)
800 thumbnail = self._html_search_regex(r'<link rel="image_src" href="(?P<thumb>.*?)"',
801 webpage, u'thumbnail', fatal=False)
807 'title': video_title,
808 'uploader': uploader,
809 'thumbnail': thumbnail,
813 class WorldStarHipHopIE(InfoExtractor):
814 _VALID_URL = r'https?://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
815 IE_NAME = u'WorldStarHipHop'
817 def _real_extract(self, url):
818 m = re.match(self._VALID_URL, url)
819 video_id = m.group('id')
821 webpage_src = self._download_webpage(url, video_id)
823 video_url = self._search_regex(r'so\.addVariable\("file","(.*?)"\)',
824 webpage_src, u'video URL')
826 if 'mp4' in video_url:
831 video_title = self._html_search_regex(r"<title>(.*)</title>",
832 webpage_src, u'title')
834 # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video.
835 thumbnail = self._html_search_regex(r'rel="image_src" href="(.*)" />',
836 webpage_src, u'thumbnail', fatal=False)
839 _title = r"""candytitles.*>(.*)</span>"""
840 mobj = re.search(_title, webpage_src)
842 video_title = mobj.group(1)
847 'title' : video_title,
848 'thumbnail' : thumbnail,
853 class RBMARadioIE(InfoExtractor):
854 _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'
856 def _real_extract(self, url):
857 m = re.match(self._VALID_URL, url)
858 video_id = m.group('videoID')
860 webpage = self._download_webpage(url, video_id)
862 json_data = self._search_regex(r'window\.gon.*?gon\.show=(.+?);$',
863 webpage, u'json data', flags=re.MULTILINE)
866 data = json.loads(json_data)
867 except ValueError as e:
868 raise ExtractorError(u'Invalid JSON: ' + str(e))
870 video_url = data['akamai_url'] + '&cbr=256'
871 url_parts = compat_urllib_parse_urlparse(video_url)
872 video_ext = url_parts.path.rpartition('.')[2]
877 'title': data['title'],
878 'description': data.get('teaser_text'),
879 'location': data.get('country_of_origin'),
880 'uploader': data.get('host', {}).get('name'),
881 'uploader_id': data.get('host', {}).get('slug'),
882 'thumbnail': data.get('image', {}).get('large_url_2x'),
883 'duration': data.get('duration'),
888 class YouPornIE(InfoExtractor):
889 """Information extractor for youporn.com."""
890 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'
892 def _print_formats(self, formats):
893 """Print all available formats"""
894 print(u'Available formats:')
895 print(u'ext\t\tformat')
896 print(u'---------------------------------')
897 for format in formats:
898 print(u'%s\t\t%s' % (format['ext'], format['format']))
900 def _specific(self, req_format, formats):
902 if(x["format"]==req_format):
906 def _real_extract(self, url):
907 mobj = re.match(self._VALID_URL, url)
909 raise ExtractorError(u'Invalid URL: %s' % url)
910 video_id = mobj.group('videoid')
912 req = compat_urllib_request.Request(url)
913 req.add_header('Cookie', 'age_verified=1')
914 webpage = self._download_webpage(req, video_id)
916 # Get JSON parameters
917 json_params = self._search_regex(r'var currentVideo = new Video\((.*)\);', webpage, u'JSON parameters')
919 params = json.loads(json_params)
921 raise ExtractorError(u'Invalid JSON')
923 self.report_extraction(video_id)
925 video_title = params['title']
926 upload_date = unified_strdate(params['release_date_f'])
927 video_description = params['description']
928 video_uploader = params['submitted_by']
929 thumbnail = params['thumbnails'][0]['image']
931 raise ExtractorError('Missing JSON parameter: ' + sys.exc_info()[1])
933 # Get all of the formats available
934 DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
935 download_list_html = self._search_regex(DOWNLOAD_LIST_RE,
936 webpage, u'download list').strip()
938 # Get all of the links from the page
939 LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
940 links = re.findall(LINK_RE, download_list_html)
942 raise ExtractorError(u'ERROR: no known formats available for video')
944 self.to_screen(u'Links found: %d' % len(links))
949 # A link looks like this:
950 # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
951 # A path looks like this:
952 # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
953 video_url = unescapeHTML( link )
954 path = compat_urllib_parse_urlparse( video_url ).path
955 extension = os.path.splitext( path )[1][1:]
956 format = path.split('/')[4].split('_')[:2]
959 format = "-".join( format )
960 # title = u'%s-%s-%s' % (video_title, size, bitrate)
965 'uploader': video_uploader,
966 'upload_date': upload_date,
967 'title': video_title,
970 'thumbnail': thumbnail,
971 'description': video_description
974 if self._downloader.params.get('listformats', None):
975 self._print_formats(formats)
978 req_format = self._downloader.params.get('format', None)
979 self.to_screen(u'Format: %s' % req_format)
981 if req_format is None or req_format == 'best':
983 elif req_format == 'worst':
985 elif req_format in ('-1', 'all'):
988 format = self._specific( req_format, formats )
990 raise ExtractorError(u'Requested format not available')
995 class PornotubeIE(InfoExtractor):
996 """Information extractor for pornotube.com."""
997 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'
999 def _real_extract(self, url):
1000 mobj = re.match(self._VALID_URL, url)
1002 raise ExtractorError(u'Invalid URL: %s' % url)
1004 video_id = mobj.group('videoid')
1005 video_title = mobj.group('title')
1007 # Get webpage content
1008 webpage = self._download_webpage(url, video_id)
1011 VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
1012 video_url = self._search_regex(VIDEO_URL_RE, webpage, u'video url')
1013 video_url = compat_urllib_parse.unquote(video_url)
1015 #Get the uploaded date
1016 VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
1017 upload_date = self._html_search_regex(VIDEO_UPLOADED_RE, webpage, u'upload date', fatal=False)
1018 if upload_date: upload_date = unified_strdate(upload_date)
1020 info = {'id': video_id,
1023 'upload_date': upload_date,
1024 'title': video_title,
1030 class YouJizzIE(InfoExtractor):
1031 """Information extractor for youjizz.com."""
1032 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'
1034 def _real_extract(self, url):
1035 mobj = re.match(self._VALID_URL, url)
1037 raise ExtractorError(u'Invalid URL: %s' % url)
1039 video_id = mobj.group('videoid')
1041 # Get webpage content
1042 webpage = self._download_webpage(url, video_id)
1044 # Get the video title
1045 video_title = self._html_search_regex(r'<title>(?P<title>.*)</title>',
1046 webpage, u'title').strip()
1048 # Get the embed page
1049 result = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
1051 raise ExtractorError(u'ERROR: unable to extract embed page')
1053 embed_page_url = result.group(0).strip()
1054 video_id = result.group('videoid')
1056 webpage = self._download_webpage(embed_page_url, video_id)
1059 video_url = self._search_regex(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);',
1060 webpage, u'video URL')
1062 info = {'id': video_id,
1064 'title': video_title,
1067 'player_url': embed_page_url}
1071 class EightTracksIE(InfoExtractor):
1073 _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'
1075 def _real_extract(self, url):
1076 mobj = re.match(self._VALID_URL, url)
1078 raise ExtractorError(u'Invalid URL: %s' % url)
1079 playlist_id = mobj.group('id')
1081 webpage = self._download_webpage(url, playlist_id)
1083 json_like = self._search_regex(r"PAGE.mix = (.*?);\n", webpage, u'trax information', flags=re.DOTALL)
1084 data = json.loads(json_like)
1086 session = str(random.randint(0, 1000000000))
1088 track_count = data['tracks_count']
1089 first_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
1090 next_url = first_url
1092 for i in itertools.count():
1093 api_json = self._download_webpage(next_url, playlist_id,
1094 note=u'Downloading song information %s/%s' % (str(i+1), track_count),
1095 errnote=u'Failed to download song information')
1096 api_data = json.loads(api_json)
1097 track_data = api_data[u'set']['track']
1099 'id': track_data['id'],
1100 'url': track_data['track_file_stream_url'],
1101 'title': track_data['performer'] + u' - ' + track_data['name'],
1102 'raw_title': track_data['name'],
1103 'uploader_id': data['user']['login'],
1107 if api_data['set']['at_last_track']:
1109 next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
class KeekIE(InfoExtractor):
    """Information extractor for keek.com videos."""

    _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'

    def _real_extract(self, url):
        """Return a one-element list with the info dict for the keek at *url*.

        Raises ExtractorError when *url* does not match _VALID_URL.
        """
        m = re.match(self._VALID_URL, url)
        if m is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = m.group('videoID')

        # Media and thumbnail locations are built directly from the id.
        video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
        thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id
        webpage = self._download_webpage(url, video_id)

        video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
            webpage, u'title')

        # The uploader is optional: a miss is not fatal.
        uploader = self._html_search_regex(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>',
            webpage, u'uploader', fatal=False)

        info = {
            'id': video_id,
            'url': video_url,
            # NOTE(review): the CDN URL carries no extension; mp4 assumed -- confirm.
            'ext': 'mp4',
            'title': video_title,
            'thumbnail': thumbnail,
            'uploader': uploader
        }
        return [info]
class TEDIE(InfoExtractor):
    """Information extractor for ted.com talks and playlists."""

    # Verbose pattern: always match it with re.VERBOSE.
    _VALID_URL=r'''http://www\.ted\.com/
                   (
                        ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
                        |
                        ((?P<type_talk>talks)) # We have a simple talk
                   )
                   (/lang/(.*?))? # The url may contain the language
                   /(?P<name>\w+) # Here goes the name and then ".html"
                   '''

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        """Dispatch *url* to the talk or the playlist handler.

        Returns a one-element list holding either a talk info dict or a
        playlist result. Raises ExtractorError when *url* does not match.
        """
        m = re.match(self._VALID_URL, url, re.VERBOSE)
        if m is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        if m.group('type_talk'):
            return [self._talk_info(url)]
        playlist_id = m.group('playlist_id')
        name = m.group('name')
        self.to_screen(u'Getting info of playlist %s: "%s"' % (playlist_id,name))
        return [self._playlist_videos_info(url,name,playlist_id)]

    def _playlist_videos_info(self,url,name,playlist_id=0):
        '''Returns the videos of the playlist'''
        video_RE=r'''
                     <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
                     ([.\s]*?)data-playlist_item_id="(\d+)"
                     ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
                     '''
        video_name_RE=r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
        webpage=self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
        m_videos=re.finditer(video_RE,webpage,re.VERBOSE)
        m_names=re.finditer(video_name_RE,webpage)

        playlist_title = self._html_search_regex(r'div class="headline">\s*?<h1>\s*?<span>(.*?)</span>',
                                                 webpage, 'playlist title')

        playlist_entries = []
        # The item matches are only used to pair with (and bound the number
        # of) the title links; the talk URL itself comes from the link.
        for _, m_name in zip(m_videos, m_names):
            talk_url='http://www.ted.com%s' % m_name.group('talk_url')
            playlist_entries.append(self.url_result(talk_url, 'TED'))
        return self.playlist_result(playlist_entries, playlist_id = playlist_id, playlist_title = playlist_title)

    def _talk_info(self, url, video_id=0):
        """Return the video for the talk in the url"""
        m = re.match(self._VALID_URL, url,re.VERBOSE)
        if m is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_name = m.group('name')
        webpage = self._download_webpage(url, video_id, 'Downloading \"%s\" page' % video_name)
        self.report_extraction(video_name)
        # If the url includes the language we get the title translated
        title = self._html_search_regex(r'<span id="altHeadline" >(?P<title>.*)</span>',
                                        webpage, 'title')
        json_data = self._search_regex(r'<script.*?>var talkDetails = ({.*?})</script>',
                                    webpage, 'json data')
        talk_details = json.loads(json_data)
        desc = self._html_search_regex(r'<div class="talk-intro">.*?<p.*?>(.*?)</p>',
                                       webpage, 'description', flags = re.DOTALL)

        thumbnail = self._search_regex(r'</span>[\s.]*</div>[\s.]*<img src="(.*?)"',
                                       webpage, 'thumbnail')
        return {
            'id': talk_details['id'],
            # The last entry of 'htmlStreams' is the one that gets used.
            'url': talk_details['htmlStreams'][-1]['file'],
            # NOTE(review): container assumed to be mp4 -- confirm.
            'ext': 'mp4',
            'title': title,
            'thumbnail': thumbnail,
            'description': desc,
        }
class MySpassIE(InfoExtractor):
    """Information extractor for myspass.de, driven by its metadata XML."""

    _VALID_URL = r'http://www\.myspass\.de/.*'

    def _real_extract(self, url):
        """Return a one-element list with the info dict for *url*.

        Raises ExtractorError when the metadata has no download URL or no title.
        """
        META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'

        # video id is the last path element of the URL
        # usually there is a trailing slash, so also try the second but last
        url_path = compat_urllib_parse_urlparse(url).path
        url_parent_path, video_id = os.path.split(url_path)
        if not video_id:
            _, video_id = os.path.split(url_parent_path)

        # get metadata
        metadata_url = META_DATA_URL_TEMPLATE % video_id
        metadata_text = self._download_webpage(metadata_url, video_id)
        metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))

        # extract values from metadata
        url_flv_el = metadata.find('url_flv')
        if url_flv_el is None:
            raise ExtractorError(u'Unable to extract download url')
        video_url = url_flv_el.text
        extension = os.path.splitext(video_url)[1][1:]

        title_el = metadata.find('title')
        if title_el is None:
            raise ExtractorError(u'Unable to extract title')
        title = title_el.text

        # NOTE(review): when no <format_id> is present the file extension is
        # used as the format -- confirm this is the intended fallback.
        format_id_el = metadata.find('format_id')
        format_id = extension if format_id_el is None else format_id_el.text

        # Description and preview image are optional.
        description_el = metadata.find('description')
        description = description_el.text if description_el is not None else None
        imagePreview_el = metadata.find('imagePreview')
        thumbnail = imagePreview_el.text if imagePreview_el is not None else None

        info = {
            'id': video_id,
            'url': video_url,
            'title': title,
            'ext': extension,
            'format': format_id,
            'thumbnail': thumbnail,
            'description': description
        }
        return [info]
class SpiegelIE(InfoExtractor):
    """Information extractor for spiegel.de videos."""

    _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'

    def _real_extract(self, url):
        """Return a one-element list with the info dict for *url*.

        The title is read from the HTML page; file name and duration come
        from a per-video XML document. Raises ExtractorError when *url* does
        not match _VALID_URL.
        """
        m = re.match(self._VALID_URL, url)
        if m is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = m.group('videoID')

        webpage = self._download_webpage(url, video_id)

        video_title = self._html_search_regex(r'<div class="module-title">(.*?)</div>',
            webpage, u'title')

        xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
        xml_code = self._download_webpage(xml_url, video_id,
                    note=u'Downloading XML', errnote=u'Failed to download XML')

        # Hand the parser bytes, not text: parsing a unicode string that holds
        # non-ASCII characters fails on Python 2.
        idoc = xml.etree.ElementTree.fromstring(xml_code.encode('utf-8'))
        # The last child of the root element is the variant that gets used.
        last_type = idoc[-1]
        filename = last_type.findall('./filename')[0].text
        duration = float(last_type.findall('./duration')[0].text)

        video_url = 'http://video2.spiegel.de/flash/' + filename
        video_ext = filename.rpartition('.')[2]
        info = {
            'id': video_id,
            'url': video_url,
            'ext': video_ext,
            'title': video_title,
            'duration': duration,
        }
        return [info]
class LiveLeakIE(InfoExtractor):
    """Information extractor for liveleak.com videos."""

    # 'https?' (not 'http?'): the optional character must be the trailing
    # 's', otherwise https:// URLs never match and 'htt://' does.
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
    IE_NAME = u'liveleak'

    def _real_extract(self, url):
        """Return a one-element list with the info dict for *url*.

        Raises ExtractorError when *url* does not match _VALID_URL.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('video_id')

        webpage = self._download_webpage(url, video_id)

        video_url = self._search_regex(r'file: "(.*?)",',
            webpage, u'video URL')

        # The og:title carries a site prefix that is stripped here.
        video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
            webpage, u'title').replace('LiveLeak.com -', '').strip()

        # Description and uploader are optional: a miss is not fatal.
        video_description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
            webpage, u'description', fatal=False)

        video_uploader = self._html_search_regex(r'By:.*?(\w+)</a>',
            webpage, u'uploader', fatal=False)

        info = {
            'id': video_id,
            'url': video_url,
            # NOTE(review): container assumed to be mp4 -- confirm.
            'ext': 'mp4',
            'title': video_title,
            'description': video_description,
            'uploader': video_uploader
        }
        return [info]
class TumblrIE(InfoExtractor):
    """Information extractor for videos embedded in tumblr.com posts."""

    _VALID_URL = r'http://(?P<blog_name>.*?)\.tumblr\.com/((post)|(video))/(?P<id>\d*)/(.*?)'

    def _real_extract(self, url):
        """Return a one-element list with the info dict for the post at *url*.

        Raises ExtractorError when *url* does not match _VALID_URL or the
        post page contains no video source.
        """
        m_url = re.match(self._VALID_URL, url)
        if m_url is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = m_url.group('id')
        blog = m_url.group('blog_name')

        # Always fetch the canonical /post/ page, even for /video/ URLs.
        url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id)
        webpage = self._download_webpage(url, video_id)

        # The blog name and id are spliced into a pattern, so they are escaped
        # to be matched literally.
        re_video = r'src=\\x22(?P<video_url>http://%s\.tumblr\.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P<ext>.*?)\\x22' % (re.escape(blog), re.escape(video_id))
        video = re.search(re_video, webpage)
        if video is None:
            raise ExtractorError(u'Unable to extract video')
        video_url = video.group('video_url')
        ext = video.group('ext')

        video_thumbnail = self._search_regex(r'posters(.*?)\[\\x22(?P<thumb>.*?)\\x22',
            webpage, u'thumbnail', fatal=False)  # We pick the first poster
        if video_thumbnail:
            video_thumbnail = video_thumbnail.replace('\\', '')

        # The only place where you can get a title, it's not complete,
        # but searching in other places doesn't work for all videos
        video_title = self._html_search_regex(r'<title>(?P<title>.*?)</title>',
            webpage, u'title', flags=re.DOTALL)

        return [{'id': video_id,
                 'url': video_url,
                 'title': video_title,
                 'thumbnail': video_thumbnail,
                 'ext': ext
                 }]
class BandcampIE(InfoExtractor):
    """Information extractor for free tracks on bandcamp.com."""

    _VALID_URL = r'http://.*?\.bandcamp\.com/track/(?P<title>.*)'

    def _real_extract(self, url):
        """Return a one-element list with the info dict for the track at *url*.

        Raises ExtractorError when *url* does not match, when the track has no
        free download page, or when any intermediate page lacks the expected
        data.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        title = mobj.group('title')
        webpage = self._download_webpage(url, title)
        # We get the link to the free download page
        m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage)
        if m_download is None:
            raise ExtractorError(u'No free songs found')

        download_link = m_download.group(1)
        m_id = re.search(r'var TralbumData = {(.*?)id: (?P<id>\d*?)$',
                         webpage, re.MULTILINE|re.DOTALL)
        if m_id is None:
            raise ExtractorError(u'Unable to extract track id')
        track_id = m_id.group('id')

        download_webpage = self._download_webpage(download_link, track_id,
                                                  'Downloading free downloads page')
        # We get the dictionary of the track from some javascript code
        m_items = re.search(r'items: (.*?),$', download_webpage, re.MULTILINE)
        if m_items is None:
            raise ExtractorError(u'Unable to extract track information')
        info = json.loads(m_items.group(1))[0]
        # We pick mp3-320 for now, until format selection can be easily implemented.
        mp3_info = info[u'downloads'][u'mp3-320']
        # If we try to use this url it says the link has expired
        initial_url = mp3_info[u'url']
        re_url = r'(?P<server>http://(.*?)\.bandcamp\.com)/download/track\?enc=mp3-320&fsig=(?P<fsig>.*?)&id=(?P<id>.*?)&ts=(?P<ts>.*)$'
        m_url = re.match(re_url, initial_url)
        if m_url is None:
            raise ExtractorError(u'Unable to parse the download url')
        # We build the url we will use to get the final track url
        # This url is built in Bandcamp in the script download_bundle_*.js
        request_url = '%s/statdownload/track?enc=mp3-320&fsig=%s&id=%s&ts=%s&.rand=665028774616&.vrs=1' % (m_url.group('server'), m_url.group('fsig'), track_id, m_url.group('ts'))
        final_url_webpage = self._download_webpage(request_url, track_id, 'Requesting download url')
        # If we could correctly generate the .rand field the url would be
        # in the "download_url" key
        m_final = re.search(r'"retry_url":"(.*?)"', final_url_webpage)
        if m_final is None:
            raise ExtractorError(u'Unable to extract the final download url')
        final_url = m_final.group(1)

        track_info = {'id': track_id,
                      'title': info[u'title'],
                      # The format requested above is always mp3-320.
                      'ext': 'mp3',
                      'url': final_url,
                      'thumbnail': info[u'thumb_url'],
                      'uploader': info[u'artist']
                      }

        return [track_info]
class RedTubeIE(InfoExtractor):
    """Information Extractor for redtube"""
    _VALID_URL = r'(?:http://)?(?:www\.)?redtube\.com/(?P<id>[0-9]+)'

    def _real_extract(self,url):
        """Return a one-element list with the info dict for *url*.

        Raises ExtractorError when *url* does not match _VALID_URL.
        """
        match = re.match(self._VALID_URL, url)
        if match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = match.group('id')
        webpage = self._download_webpage(url, video_id)

        self.report_extraction(video_id)

        # Both values are mandatory: a missing pattern is a fatal error.
        media_url = self._html_search_regex(
            r'<source src="(.+?)" type="video/mp4">', webpage, u'video URL')
        title = self._html_search_regex(
            '<h1 class="videoTitle slidePanelMovable">(.+?)</h1>', webpage, u'title')

        entry = {
            'id': video_id,
            'url': media_url,
            # The page is searched for a video/mp4 <source>, hence the fixed extension.
            'ext': 'mp4',
            'title': title,
        }
        return [entry]
class InaIE(InfoExtractor):
    """Information Extractor for Ina.fr"""
    _VALID_URL = r'(?:http://)?(?:www\.)?ina\.fr/video/(?P<id>I[0-9]+)/.*'

    def _real_extract(self,url):
        """Return a one-element list with the info dict for *url*.

        All data is read from the player's MRSS feed for the video id, not
        from the page at *url*. Raises ExtractorError when *url* does not
        match _VALID_URL.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')
        mrss_url='http://player.ina.fr/notices/%s.mrss' % video_id
        video_extension = 'mp4'
        webpage = self._download_webpage(mrss_url, video_id)

        self.report_extraction(video_id)

        video_url = self._html_search_regex(r'<media:player url="(?P<mp4url>http://mp4.ina.fr/[^"]+\.mp4)',
            webpage, u'video URL')

        # The title sits in a CDATA section, so it is taken verbatim.
        video_title = self._search_regex(r'<title><!\[CDATA\[(?P<titre>.*?)]]></title>',
            webpage, u'title')

        return [{
            'id':       video_id,
            'url':      video_url,
            'ext':      video_extension,
            'title':    video_title,
        }]
class HowcastIE(InfoExtractor):
    """Information Extractor for Howcast.com"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?howcast\.com/videos/(?P<id>\d+)'

    def _real_extract(self, url):
        """Return a one-element list with the info dict for *url*.

        Raises ExtractorError when *url* does not match _VALID_URL.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')
        # Rebuild the page URL from the id alone.
        webpage_url = 'http://www.howcast.com/videos/' + video_id
        webpage = self._download_webpage(webpage_url, video_id)

        self.report_extraction(video_id)

        video_url = self._search_regex(r'\'?file\'?: "(http://mobile-media\.howcast\.com/[0-9]+\.mp4)',
            webpage, u'video URL')

        # The content attribute may be quoted with either quote character.
        video_title = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') property=\'og:title\'',
            webpage, u'title')

        video_description = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') name=\'description\'',
            webpage, u'description', fatal=False)

        thumbnail = self._html_search_regex(r'<meta content=\'(.+?)\' property=\'og:image\'',
            webpage, u'thumbnail', fatal=False)

        return [{
            'id':       video_id,
            'url':      video_url,
            # The media URL pattern above only accepts .mp4 files.
            'ext':      'mp4',
            'title':    video_title,
            'description': video_description,
            'thumbnail': thumbnail,
        }]
class VineIE(InfoExtractor):
    """Information Extractor for Vine.co"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?vine\.co/v/(?P<id>\w+)'

    def _real_extract(self, url):
        """Return a one-element list with the info dict for *url*.

        Raises ExtractorError when *url* does not match _VALID_URL.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')
        # Rebuild the page URL from the id alone.
        webpage_url = 'https://vine.co/v/' + video_id
        webpage = self._download_webpage(webpage_url, video_id)

        self.report_extraction(video_id)

        video_url = self._html_search_regex(r'<meta property="twitter:player:stream" content="(.+?)"',
            webpage, u'video URL')

        video_title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',
            webpage, u'title')

        # The optional second group leaves a query string out of the capture.
        thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)(\?.*?)?"',
            webpage, u'thumbnail', fatal=False)

        uploader = self._html_search_regex(r'<div class="user">.*?<h2>(.+?)</h2>',
            webpage, u'uploader', fatal=False, flags=re.DOTALL)

        return [{
            'id':        video_id,
            'url':       video_url,
            # NOTE(review): container assumed to be mp4 -- confirm.
            'ext':       'mp4',
            'title':     video_title,
            'thumbnail': thumbnail,
            'uploader':  uploader,
        }]
class FlickrIE(InfoExtractor):
    """Information Extractor for Flickr videos"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?flickr\.com/photos/(?P<uploader_id>[\w\-_@]+)/(?P<id>\d+).*'

    def _real_extract(self, url):
        """Return a one-element list with the info dict for *url*.

        The stream location is resolved in two steps: the photo secret from
        the page yields a node id, and the node id yields the playlist that
        names the stream. Raises ExtractorError when *url* does not match
        _VALID_URL or the playlist contains no stream.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')
        video_uploader_id = mobj.group('uploader_id')
        webpage_url = 'http://www.flickr.com/photos/' + video_uploader_id + '/' + video_id
        webpage = self._download_webpage(webpage_url, video_id)

        secret = self._search_regex(r"photo_secret: '(\w+)'", webpage, u'secret')

        first_url = 'https://secure.flickr.com/apps/video/video_mtl_xml.gne?v=x&photo_id=' + video_id + '&secret=' + secret + '&bitrate=700&target=_self'
        first_xml = self._download_webpage(first_url, video_id, 'Downloading first data webpage')

        node_id = self._html_search_regex(r'<Item id="id">(\d+-\d+)</Item>',
            first_xml, u'node_id')

        second_url = 'https://secure.flickr.com/video_playlist.gne?node_id=' + node_id + '&tech=flash&mode=playlist&bitrate=700&secret=' + secret + '&rd=video.yahoo.com&noad=1'
        second_xml = self._download_webpage(second_url, video_id, 'Downloading second data webpage')

        self.report_extraction(video_id)

        stream = re.search(r'<STREAM APP="(.+?)" FULLPATH="(.+?)"', second_xml)
        if stream is None:
            raise ExtractorError(u'Unable to extract video url')
        # Only the path part is HTML-unescaped before being appended.
        video_url = stream.group(1) + unescapeHTML(stream.group(2))

        video_title = self._html_search_regex(r'<meta property="og:title" content=(?:"([^"]+)"|\'([^\']+)\')',
            webpage, u'video title')

        video_description = self._html_search_regex(r'<meta property="og:description" content=(?:"([^"]+)"|\'([^\']+)\')',
            webpage, u'description', fatal=False)

        thumbnail = self._html_search_regex(r'<meta property="og:image" content=(?:"([^"]+)"|\'([^\']+)\')',
            webpage, u'thumbnail', fatal=False)

        return [{
            'id':          video_id,
            'url':         video_url,
            # NOTE(review): container assumed to be mp4 -- confirm.
            'ext':         'mp4',
            'title':       video_title,
            'description': video_description,
            'thumbnail':   thumbnail,
            'uploader_id': video_uploader_id,
        }]
class TeamcocoIE(InfoExtractor):
    """Information extractor for teamcoco.com videos."""

    _VALID_URL = r'http://teamcoco\.com/video/(?P<url_title>.*)'

    def _real_extract(self, url):
        """Return a one-element list with the info dict for *url*.

        Raises ExtractorError when *url* does not match _VALID_URL.
        """
        match = re.match(self._VALID_URL, url)
        if match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        slug = match.group('url_title')
        webpage = self._download_webpage(url, slug)

        # The numeric id is only available from the page markup.
        video_id = self._html_search_regex(
            r'<article class="video" data-id="(\d+?)"', webpage, u'video id')

        self.report_extraction(video_id)

        title = self._html_search_regex(
            r'<meta property="og:title" content="(.+?)"', webpage, u'title')

        # Thumbnail and description are optional: a miss is not fatal.
        thumb = self._html_search_regex(
            r'<meta property="og:image" content="(.+?)"',
            webpage, u'thumbnail', fatal=False)
        description = self._html_search_regex(
            r'<meta property="og:description" content="(.*?)"',
            webpage, u'description', fatal=False)

        # The media location comes from a separate per-video XML document.
        data_url = 'http://teamcoco.com/cvp/2.0/%s.xml' % video_id
        data = self._download_webpage(data_url, video_id, 'Downloading data webpage')
        media_url = self._html_search_regex(
            r'<file type="high".*?>(.*?)</file>', data, u'video URL')

        entry = {
            'id': video_id,
            'url': media_url,
            # NOTE(review): container assumed to be mp4 -- confirm.
            'ext': 'mp4',
            'title': title,
            'thumbnail': thumb,
            'description': description,
        }
        return [entry]
class XHamsterIE(InfoExtractor):
    """Information Extractor for xHamster"""
    # The dot after 'www' is escaped so that it only matches a literal '.'.
    _VALID_URL = r'(?:http://)?(?:www\.)?xhamster\.com/movies/(?P<id>[0-9]+)/.*\.html'

    def _real_extract(self,url):
        """Return a one-element list with the info dict for *url*.

        Raises ExtractorError when *url* does not match _VALID_URL or the page
        contains no media description. A missing upload date only produces a
        warning.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')
        # The page is requested by id alone.
        mrss_url = 'http://xhamster.com/movies/%s/.html' % video_id
        webpage = self._download_webpage(mrss_url, video_id)

        mobj = re.search(r'\'srv\': \'(?P<server>[^\']*)\',\s*\'file\': \'(?P<file>[^\']+)\',', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract media URL')
        if len(mobj.group('server')) == 0:
            # Without a server, 'file' holds the complete, percent-encoded URL.
            video_url = compat_urllib_parse.unquote(mobj.group('file'))
        else:
            video_url = mobj.group('server')+'/key='+mobj.group('file')
        video_extension = video_url.split('.')[-1]

        video_title = self._html_search_regex(r'<title>(?P<title>.+?) - xHamster\.com</title>',
            webpage, u'title')

        # No description is extracted: it can't be seen anywhere in the UI.

        mobj = re.search(r'hint=\'(?P<upload_date_Y>[0-9]{4})-(?P<upload_date_m>[0-9]{2})-(?P<upload_date_d>[0-9]{2}) [0-9]{2}:[0-9]{2}:[0-9]{2} [A-Z]{3,4}\'', webpage)
        if mobj:
            # Year, month and day are concatenated into YYYYMMDD.
            video_upload_date = mobj.group('upload_date_Y')+mobj.group('upload_date_m')+mobj.group('upload_date_d')
        else:
            video_upload_date = None
            self._downloader.report_warning(u'Unable to extract upload date')

        video_uploader_id = self._html_search_regex(r'<a href=\'/user/[^>]+>(?P<uploader_id>[^<]+)',
            webpage, u'uploader id', default=u'anonymous')

        video_thumbnail = self._search_regex(r'\'image\':\'(?P<thumbnail>[^\']+)\'',
            webpage, u'thumbnail', fatal=False)

        return [{
            'id':       video_id,
            'url':      video_url,
            'ext':      video_extension,
            'title':    video_title,
            'upload_date': video_upload_date,
            'uploader_id': video_uploader_id,
            'thumbnail': video_thumbnail
        }]
class HypemIE(InfoExtractor):
    """Information Extractor for hypem"""
    _VALID_URL = r'(?:http://)?(?:www\.)?hypem\.com/track/([^/]+)/([^/]+)'

    def _real_extract(self, url):
        """Return a one-element list with the info dict for the track at *url*.

        Raises ExtractorError when *url* does not match _VALID_URL, when either
        JSON document cannot be parsed, or when the page lists no track.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        track_id = mobj.group(1)

        data = { 'ax': 1, 'ts': time.time() }
        data_encoded = compat_urllib_parse.urlencode(data)
        complete_url = url + "?" + data_encoded
        request = compat_urllib_request.Request(complete_url)
        response, urlh = self._download_webpage_handle(request, track_id, u'Downloading webpage with the url')
        # The cookie of this response is replayed on the metadata request below.
        cookie = urlh.headers.get('Set-Cookie', '')

        self.report_extraction(track_id)

        html_tracks = self._html_search_regex(r'<script type="application/json" id="displayList-data">(.*?)</script>',
            response, u'tracks', flags=re.MULTILINE|re.DOTALL).strip()
        try:
            track_list = json.loads(html_tracks)
        except ValueError:
            raise ExtractorError(u'Hypemachine contained invalid JSON.')
        try:
            track = track_list[u'tracks'][0]
        except (KeyError, IndexError):
            raise ExtractorError(u'Hypemachine contained invalid JSON.')

        # NOTE(review): the serve key is assumed to live under the 'key' entry
        # of the track -- confirm against a live page.
        key = track[u"key"]
        track_id = track[u"id"]
        artist = track[u"artist"]
        title = track[u"song"]

        serve_url = "http://hypem.com/serve/source/%s/%s" % (compat_str(track_id), compat_str(key))
        request = compat_urllib_request.Request(serve_url, "" , {'Content-Type': 'application/json'})
        request.add_header('cookie', cookie)
        song_data_json = self._download_webpage(request, track_id, u'Downloading metadata')
        try:
            song_data = json.loads(song_data_json)
        except ValueError:
            raise ExtractorError(u'Hypemachine contained invalid JSON.')
        final_url = song_data[u"url"]

        return [{
            'id':       track_id,
            'url':      final_url,
            # NOTE(review): container assumed to be mp3 -- confirm.
            'ext':      "mp3",
            'title':    title,
            'artist':   artist,
        }]
class Vbox7IE(InfoExtractor):
    """Information Extractor for Vbox7"""
    _VALID_URL = r'(?:http://)?(?:www\.)?vbox7\.com/play:([^/]+)'

    def _real_extract(self,url):
        """Return a one-element list with the info dict for *url*.

        Raises ExtractorError when *url* does not match _VALID_URL or the info
        endpoint does not answer with exactly a media and a thumbnail field.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group(1)

        # The first page only sets window.location; follow that by hand.
        redirect_page, urlh = self._download_webpage_handle(url, video_id)
        new_location = self._search_regex(r'window\.location = \'(.*)\';', redirect_page, u'redirect location')
        redirect_url = urlh.geturl() + new_location
        webpage = self._download_webpage(redirect_url, video_id, u'Downloading redirect page')

        # Only the part of the <title> before the first '/' is kept.
        title = self._html_search_regex(r'<title>(.*)</title>',
            webpage, u'title').split('/')[0].strip()

        # NOTE(review): container assumed to be flv -- confirm.
        ext = "flv"
        info_url = "http://vbox7.com/play/magare.do"
        # A POST body has to be bytes on Python 3; urlencode only emits ASCII.
        data = compat_urllib_parse.urlencode({'as3':'1','vid':video_id}).encode('utf-8')
        info_request = compat_urllib_request.Request(info_url, data)
        info_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
        info_response = self._download_webpage(info_request, video_id, u'Downloading info webpage')
        if info_response is None:
            raise ExtractorError(u'Unable to extract the media url')
        # The response is '<name>=<media url>&<name>=<thumbnail url>'. Split on
        # the first '=' only, so a value containing '=' is not truncated.
        fields = [pair.split('=', 1) for pair in info_response.split('&')]
        if len(fields) != 2 or any(len(field) != 2 for field in fields):
            raise ExtractorError(u'Unable to extract the media url')
        final_url, thumbnail_url = fields[0][1], fields[1][1]

        return [{
            'id':        video_id,
            'url':       final_url,
            'ext':       ext,
            'title':     title,
            'thumbnail': thumbnail_url,
        }]
1770 def gen_extractors():
1771 """ Return a list of an instance of every supported extractor.
1772 The order does matter; the first extractor matched is the one handling the URL.
1775 YoutubePlaylistIE(),
1800 StanfordOpenClassroomIE(),
1810 WorldStarHipHopIE(),
def get_info_extractor(ie_name):
    """Look up and return the extractor class named ``ie_name + 'IE'``.

    The lookup is made in this module's global namespace; an unknown name
    raises KeyError.
    """
    class_name = ie_name + 'IE'
    module_namespace = globals()
    return module_namespace[class_name]