import xml.etree.ElementTree
from .extractor.common import InfoExtractor, SearchInfoExtractor
from .extractor.ard import ARDIE
from .extractor.arte import ArteTvIE
from .extractor.bliptv import BlipTVIE, BlipTVUserIE
from .extractor.comedycentral import ComedyCentralIE
from .extractor.dailymotion import DailymotionIE
from .extractor.gametrailers import GametrailersIE
from .extractor.generic import GenericIE
from .extractor.googleplus import GooglePlusIE
from .extractor.googlesearch import GoogleSearchIE
from .extractor.metacafe import MetacafeIE
from .extractor.myvideo import MyVideoIE
from .extractor.statigram import StatigramIE
from .extractor.photobucket import PhotobucketIE
from .extractor.soundcloud import SoundcloudIE, SoundcloudSetIE
from .extractor.vimeo import VimeoIE
from .extractor.yahoo import YahooIE, YahooSearchIE
from .extractor.youtube import YoutubeIE, YoutubePlaylistIE, YoutubeSearchIE, YoutubeUserIE, YoutubeChannelIE
from .extractor.zdf import ZDFIE

class DepositFilesIE(InfoExtractor):
    """Information extractor for depositfiles.com"""

    _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'

    def _real_extract(self, url):
        file_id = url.split('/')[-1]
        # Rebuild url in English locale
        url = 'http://depositfiles.com/en/files/' + file_id

        # Retrieve file webpage with 'Free download' button pressed
        free_download_indication = {'gateway_result': '1'}
        request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
            self.report_download_webpage(file_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to retrieve file webpage: %s' % compat_str(err))

        # Search for the real file URL
        mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
        if (mobj is None) or (mobj.group(1) is None):
            # Try to figure out the reason for the error.
            mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
            if (mobj is not None) and (mobj.group(1) is not None):
                restriction_message = re.sub(r'\s+', ' ', mobj.group(1)).strip()
                raise ExtractorError(u'%s' % restriction_message)
                raise ExtractorError(u'Unable to extract download URL from: %s' % url)

        file_url = mobj.group(1)
        file_extension = os.path.splitext(file_url)[1][1:]

        # Search for file title
        file_title = self._search_regex(r'<b title="(.*?)">', webpage, u'title')

            'id': file_id.decode('utf-8'),
            'url': file_url.decode('utf-8'),
            'ext': file_extension.decode('utf-8'),

class FacebookIE(InfoExtractor):
    """Information Extractor for Facebook"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
    _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
    _NETRC_MACHINE = 'facebook'
    IE_NAME = u'facebook'

    def report_login(self):
        """Report attempt to log in."""
        self.to_screen(u'Logging in')

    def _real_initialize(self):
        if self._downloader is None:
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            useremail = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))

        if useremail is None:
        request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
            login_results = compat_urllib_request.urlopen(request).read()
            if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
                self._downloader.report_warning(u'unable to log in: bad username/password, or exceeded login rate limit (~3/min). Check credentials or wait.')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('ID')

        url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
        webpage = self._download_webpage(url, video_id)

        BEFORE = '{swf.addParam(param[0], param[1]);});\n'
        AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
        m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
            raise ExtractorError(u'Cannot parse data')
        data = dict(json.loads(m.group(1)))
        params_raw = compat_urllib_parse.unquote(data['params'])
        params = json.loads(params_raw)
        video_data = params['video_data'][0]
        video_url = video_data.get('hd_src')
            video_url = video_data['sd_src']
            raise ExtractorError(u'Cannot find video URL')
        video_duration = int(video_data['video_duration'])
        thumbnail = video_data['thumbnail_src']

        video_title = self._html_search_regex('<h2 class="uiHeaderTitle">([^<]+)</h2>',
            'title': video_title,
            'duration': video_duration,
            'thumbnail': thumbnail,

class EscapistIE(InfoExtractor):
    """Information extractor for The Escapist"""

    _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
    IE_NAME = u'escapist'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
            raise ExtractorError(u'Invalid URL: %s' % url)
        showName = mobj.group('showname')
        videoId = mobj.group('episode')

        self.report_extraction(videoId)
        webpage = self._download_webpage(url, videoId)

        videoDesc = self._html_search_regex('<meta name="description" content="([^"]*)"',
            webpage, u'description', fatal=False)

        imgUrl = self._html_search_regex('<meta property="og:image" content="([^"]*)"',
            webpage, u'thumbnail', fatal=False)

        playerUrl = self._html_search_regex('<meta property="og:video" content="([^"]*)"',
            webpage, u'player url')

        title = self._html_search_regex('<meta name="title" content="([^"]*)"',
            webpage, u'title').split(' : ')[-1]

        configUrl = self._search_regex('config=(.*)$', playerUrl, u'config url')
        configUrl = compat_urllib_parse.unquote(configUrl)

        configJSON = self._download_webpage(configUrl, videoId,
            u'Downloading configuration',
            u'unable to download configuration')

        # Technically, it's JavaScript, not JSON
        configJSON = configJSON.replace("'", '"')

            config = json.loads(configJSON)
        except (ValueError,) as err:
            raise ExtractorError(u'Invalid JSON in configuration file: ' + compat_str(err))

        playlist = config['playlist']
        videoUrl = playlist[1]['url']

            'uploader': showName,
            'description': videoDesc,
            'player_url': playerUrl,

class CollegeHumorIE(InfoExtractor):
    """Information extractor for collegehumor.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
    IE_NAME = u'collegehumor'

    def report_manifest(self, video_id):
        """Report manifest download."""
        self.to_screen(u'%s: Downloading XML manifest' % video_id)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('videoid')

        self.report_extraction(video_id)
        xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
            metaXml = compat_urllib_request.urlopen(xmlUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))

        mdoc = xml.etree.ElementTree.fromstring(metaXml)
            videoNode = mdoc.findall('./video')[0]
            info['description'] = videoNode.findall('./description')[0].text
            info['title'] = videoNode.findall('./caption')[0].text
            info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
            manifest_url = videoNode.findall('./file')[0].text
            raise ExtractorError(u'Invalid metadata XML file')

        manifest_url += '?hdcore=2.10.3'
        self.report_manifest(video_id)
            manifestXml = compat_urllib_request.urlopen(manifest_url).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to download video manifest XML: %s' % compat_str(err))

        adoc = xml.etree.ElementTree.fromstring(manifestXml)
            media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
            node_id = media_node.attrib['url']
            video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
        except IndexError as err:
            raise ExtractorError(u'Invalid manifest file')

        url_pr = compat_urllib_parse_urlparse(manifest_url)
        url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'

class XVideosIE(InfoExtractor):
    """Information extractor for xvideos.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group(1)

        webpage = self._download_webpage(url, video_id)

        self.report_extraction(video_id)

        video_url = compat_urllib_parse.unquote(self._search_regex(r'flv_url=(.+?)&',
            webpage, u'video URL'))

        video_title = self._html_search_regex(r'<title>(.*?)\s+-\s+XVID',

        # Extract video thumbnail
        video_thumbnail = self._search_regex(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)',
            webpage, u'thumbnail', fatal=False)

            'title': video_title,
            'thumbnail': video_thumbnail,

class InfoQIE(InfoExtractor):
    """Information extractor for infoq.com"""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
            raise ExtractorError(u'Invalid URL: %s' % url)

        webpage = self._download_webpage(url, video_id=url)
        self.report_extraction(url)

        mobj = re.search(r"jsclassref ?= ?'([^']*)'", webpage)
            raise ExtractorError(u'Unable to extract video url')
        real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
        video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id

        video_title = self._search_regex(r'contentTitle = "(.*?)";',

        # Extract description
        video_description = self._html_search_regex(r'<meta name="description" content="(.*)"(?:\s*/)?>',
            webpage, u'description', fatal=False)

        video_filename = video_url.split('/')[-1]
        video_id, extension = video_filename.split('.')

            'title': video_title,
            'ext': extension,  # Extension is always(?) mp4, but the stream seems to be flv
            'description': video_description,

class MixcloudIE(InfoExtractor):
    """Information extractor for www.mixcloud.com"""

    _WORKING = False  # Disabled for now; the site has a new API that looks usable: http://www.mixcloud.com/developers/documentation/
    _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'mixcloud'

    def report_download_json(self, file_id):
        """Report JSON download."""
        self.to_screen(u'Downloading json')

    def get_urls(self, jsonData, fmt, bitrate='best'):
        """Get URLs from the 'audio_formats' section of the JSON."""
            bitrate_list = jsonData[fmt]
            if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
                bitrate = max(bitrate_list)  # select highest

            url_list = jsonData[fmt][bitrate]
        except TypeError:  # we have no bitrate info.
            url_list = jsonData[fmt]

    def check_urls(self, url_list):
        """Returns the first active URL from the list."""
                compat_urllib_request.urlopen(url)
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:

    def _print_formats(self, formats):
        print('Available formats:')
        for fmt in formats.keys():
            for b in formats[fmt]:
                    ext = formats[fmt][b][0]
                    print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
                except TypeError:  # we have no bitrate info
                    ext = formats[fmt][0]
                    print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
            raise ExtractorError(u'Invalid URL: %s' % url)
        # extract uploader & filename from url
        uploader = mobj.group(1).decode('utf-8')
        file_id = uploader + "-" + mobj.group(2).decode('utf-8')

        # construct API request
        file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
        # retrieve .json file with links to files
        request = compat_urllib_request.Request(file_url)
            self.report_download_json(file_url)
            jsonData = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to retrieve file: %s' % compat_str(err))

        json_data = json.loads(jsonData)
        player_url = json_data['player_swf_url']
        formats = dict(json_data['audio_formats'])

        req_format = self._downloader.params.get('format', None)

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)

        if req_format is None or req_format == 'best':
            for format_param in formats.keys():
                url_list = self.get_urls(formats, format_param)
                file_url = self.check_urls(url_list)
                if file_url is not None:
            if req_format not in formats:
                raise ExtractorError(u'Format is not available')

            url_list = self.get_urls(formats, req_format)
            file_url = self.check_urls(url_list)
            format_param = req_format

            'id': file_id.decode('utf-8'),
            'url': file_url.decode('utf-8'),
            'uploader': uploader.decode('utf-8'),
            'title': json_data['name'],
            'ext': file_url.split('.')[-1].decode('utf-8'),
            'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
            'thumbnail': json_data['thumbnail_url'],
            'description': json_data['description'],
            'player_url': player_url.decode('utf-8'),

class StanfordOpenClassroomIE(InfoExtractor):
    """Information extractor for Stanford's Open ClassRoom"""

    _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
    IE_NAME = u'stanfordoc'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
            raise ExtractorError(u'Invalid URL: %s' % url)

        if mobj.group('course') and mobj.group('video'):  # A specific video
            course = mobj.group('course')
            video = mobj.group('video')
                'id': course + '_' + video,
            self.report_extraction(info['id'])
            baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
            xmlUrl = baseUrl + video + '.xml'
                metaXml = compat_urllib_request.urlopen(xmlUrl).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
            mdoc = xml.etree.ElementTree.fromstring(metaXml)
                info['title'] = mdoc.findall('./title')[0].text
                info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
                raise ExtractorError(u'Invalid metadata XML file')
            info['ext'] = info['url'].rpartition('.')[2]
        elif mobj.group('course'):  # A course page
            course = mobj.group('course')
            coursepage = self._download_webpage(url, info['id'],
                note='Downloading course info page',
                errnote='Unable to download course info page')

            info['title'] = self._html_search_regex('<h1>([^<]+)</h1>', coursepage, 'title', default=info['id'])

            info['description'] = self._html_search_regex('<description>([^<]+)</description>',
                coursepage, u'description', fatal=False)

            links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),

            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
                'id': 'Stanford OpenClassroom',
            self.report_download_webpage(info['id'])
            rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
                rootpage = compat_urllib_request.urlopen(rootURL).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to download course info page: ' + compat_str(err))

            info['title'] = info['id']

            links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),

            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])

class MTVIE(InfoExtractor):
    """Information extractor for MTV.com"""

    _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
            raise ExtractorError(u'Invalid URL: %s' % url)
        if not mobj.group('proto'):
            url = 'http://' + url
        video_id = mobj.group('videoid')

        webpage = self._download_webpage(url, video_id)

        song_name = self._html_search_regex(r'<meta name="mtv_vt" content="([^"]+)"/>',
            webpage, u'song name', fatal=False)

        video_title = self._html_search_regex(r'<meta name="mtv_an" content="([^"]+)"/>',

        mtvn_uri = self._html_search_regex(r'<meta name="mtvn_uri" content="([^"]+)"/>',
            webpage, u'mtvn_uri', fatal=False)

        content_id = self._search_regex(r'MTVN.Player.defaultPlaylistId = ([0-9]+);',
            webpage, u'content id', fatal=False)

        videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
        self.report_extraction(video_id)
        request = compat_urllib_request.Request(videogen_url)
            metadataXml = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to download video metadata: %s' % compat_str(err))

        mdoc = xml.etree.ElementTree.fromstring(metadataXml)
        renditions = mdoc.findall('.//rendition')

        # For now, always pick the highest quality.
        rendition = renditions[-1]

            _, _, ext = rendition.attrib['type'].partition('/')
            format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
            video_url = rendition.find('./src').text
            raise ExtractorError('Invalid rendition field.')

            'uploader': performer,
            'title': video_title,

class YoukuIE(InfoExtractor):
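    """Information extractor for v.youku.com"""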
    _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'

        nowTime = int(time.time() * 1000)
        random1 = random.randint(1000, 1998)
        random2 = random.randint(1000, 9999)

        return "%d%d%d" % (nowTime, random1, random2)

    def _get_file_ID_mix_string(self, seed):
        source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
        for i in range(len(source)):
            seed = (seed * 211 + 30031) % 65536
            index = math.floor(seed / 65536 * len(source))
            mixed.append(source[int(index)])
            source.remove(source[int(index)])
        # return ''.join(mixed)

    def _get_file_id(self, fileId, seed):
        mixed = self._get_file_ID_mix_string(seed)
        ids = fileId.split('*')
                realId.append(mixed[int(ch)])
        return ''.join(realId)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('ID')

        info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id

        jsondata = self._download_webpage(info_url, video_id)

        self.report_extraction(video_id)
            config = json.loads(jsondata)

            video_title = config['data'][0]['title']
            seed = config['data'][0]['seed']

            format = self._downloader.params.get('format', None)
            supported_format = list(config['data'][0]['streamfileids'].keys())

            if format is None or format == 'best':
                if 'hd2' in supported_format:
            elif format == 'worst':

            fileid = config['data'][0]['streamfileids'][format]
            keys = [s['k'] for s in config['data'][0]['segs'][format]]
        except (UnicodeDecodeError, ValueError, KeyError):
            raise ExtractorError(u'Unable to extract info section')

        sid = self._gen_sid()
        fileid = self._get_file_id(fileid, seed)

        # Columns 8 and 9 of fileid represent the segment number
        # fileid[7:9] should be changed
        for index, key in enumerate(keys):
            temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
            download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)
                'id': '%s_part%02d' % (video_id, index),
                'title': video_title,
            files_info.append(info)

class XNXXIE(InfoExtractor):
    """Information extractor for xnxx.com"""

    _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'

    VIDEO_URL_RE = r'flv_url=(.*?)&'
    VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
    VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group(1)

        # Get webpage content
        webpage = self._download_webpage(url, video_id)

        video_url = self._search_regex(self.VIDEO_URL_RE,
            webpage, u'video URL')
        video_url = compat_urllib_parse.unquote(video_url)

        video_title = self._html_search_regex(self.VIDEO_TITLE_RE,

        video_thumbnail = self._search_regex(self.VIDEO_THUMB_RE,
            webpage, u'thumbnail', fatal=False)

            'title': video_title,
            'thumbnail': video_thumbnail,

class NBAIE(InfoExtractor):
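    """Information extractor for nba.com videos"""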
    _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*?)(?:/index\.html)?(?:\?.*)?$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group(1)

        webpage = self._download_webpage(url, video_id)

        video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'

        shortened_video_id = video_id.rpartition('/')[2]
        title = self._html_search_regex(r'<meta property="og:title" content="(.*?)"',
            webpage, 'title', default=shortened_video_id).replace('NBA.com: ', '')

        # The upload date is not in the HTML the site returns to us
        # uploader_date = self._html_search_regex(r'<b>Date:</b> (.*?)</div>', webpage, 'upload_date', fatal=False)

        description = self._html_search_regex(r'<meta name="description" (?:content|value)="(.*?)" />', webpage, 'description', fatal=False)

            'id': shortened_video_id,
            # 'uploader_date': uploader_date,
            'description': description,

class JustinTVIE(InfoExtractor):
    """Information extractor for justin.tv and twitch.tv"""
    # TODO: One broadcast may be split into multiple videos. The key
    # 'broadcast_id' is the same for all parts, and 'broadcast_part'
    # starts at 1 and increases. Can we treat all parts as one video?

    _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
            (?P<channelid>[^/]+)|
            (?:(?:[^/]+)/b/(?P<videoid>[^/]+))|
            (?:(?:[^/]+)/c/(?P<chapterid>[^/]+))

    _JUSTIN_PAGE_LIMIT = 100
    IE_NAME = u'justin.tv'

    def report_download_page(self, channel, offset):
        """Report attempt to download a single page of videos."""
        self.to_screen(u'%s: Downloading video information from %d to %d' %
                (channel, offset, offset + self._JUSTIN_PAGE_LIMIT))

    # Return count of items, list of *valid* items
    def _parse_page(self, url, video_id):
        webpage = self._download_webpage(url, video_id,
            u'Downloading video info JSON',
            u'unable to download video info JSON')

        response = json.loads(webpage)
        if type(response) != list:
            error_text = response.get('error', 'unknown error')
            raise ExtractorError(u'Justin.tv API: %s' % error_text)
        for clip in response:
            video_url = clip['video_file_url']
                video_extension = os.path.splitext(video_url)[1][1:]
                video_date = re.sub('-', '', clip['start_time'][:10])
                video_uploader_id = clip.get('user_id', clip.get('channel_id'))
                video_id = clip['id']
                video_title = clip.get('title', video_id)
                    'title': video_title,
                    'uploader': clip.get('channel_name', video_uploader_id),
                    'uploader_id': video_uploader_id,
                    'upload_date': video_date,
                    'ext': video_extension,
        return (len(response), info)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
            raise ExtractorError(u'Invalid URL: %s' % url)

        api_base = 'http://api.justin.tv'
        if mobj.group('channelid'):
            video_id = mobj.group('channelid')
            api = api_base + '/channel/archives/%s.json' % video_id
        elif mobj.group('chapterid'):
            chapter_id = mobj.group('chapterid')

            webpage = self._download_webpage(url, chapter_id)
            m = re.search(r'PP\.archive_id = "([0-9]+)";', webpage)
                raise ExtractorError(u'Cannot find archive of a chapter')
            archive_id = m.group(1)

            api = api_base + '/broadcast/by_chapter/%s.xml' % chapter_id
            chapter_info_xml = self._download_webpage(api, chapter_id,
                note=u'Downloading chapter information',
                errnote=u'Chapter information download failed')
            doc = xml.etree.ElementTree.fromstring(chapter_info_xml)
            for a in doc.findall('.//archive'):
                if archive_id == a.find('./id').text:
                raise ExtractorError(u'Could not find chapter in chapter information')

            video_url = a.find('./video_file_url').text
            video_ext = video_url.rpartition('.')[2] or u'flv'

            chapter_api_url = u'https://api.twitch.tv/kraken/videos/c' + chapter_id
            chapter_info_json = self._download_webpage(chapter_api_url, u'c' + chapter_id,
                note='Downloading chapter metadata',
                errnote='Download of chapter metadata failed')
            chapter_info = json.loads(chapter_info_json)

            bracket_start = int(doc.find('.//bracket_start').text)
            bracket_end = int(doc.find('.//bracket_end').text)

            # TODO determine start (and probably fix up file)
            # youtube-dl -v http://www.twitch.tv/firmbelief/c/1757457
            # video_url += u'?start=' + TODO:start_timestamp
            # bracket_start is 13290, but we want 51670615
            self._downloader.report_warning(u'Chapter detected, but we can just download the whole file. '
                u'Chapter starts at %s and ends at %s' % (formatSeconds(bracket_start), formatSeconds(bracket_end)))

                'id': u'c' + chapter_id,
                'title': chapter_info['title'],
                'thumbnail': chapter_info['preview'],
                'description': chapter_info['description'],
                'uploader': chapter_info['channel']['display_name'],
                'uploader_id': chapter_info['channel']['name'],
            video_id = mobj.group('videoid')
            api = api_base + '/broadcast/by_archive/%s.json' % video_id

        self.report_extraction(video_id)

        limit = self._JUSTIN_PAGE_LIMIT
                self.report_download_page(video_id, offset)
            page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
            page_count, page_info = self._parse_page(page_url, video_id)
            info.extend(page_info)
            if not paged or page_count != limit:

class FunnyOrDieIE(InfoExtractor):
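    """Information extractor for funnyordie.com"""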
    _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)

        video_url = self._html_search_regex(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"',
            webpage, u'video URL', flags=re.DOTALL)

        title = self._html_search_regex((r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>",
            r'<title>(?P<title>[^<]+?)</title>'), webpage, 'title', flags=re.DOTALL)

        video_description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
            webpage, u'description', fatal=False, flags=re.DOTALL)

            'description': video_description,

class SteamIE(InfoExtractor):
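    """Information extractor for videos on the Steam store (store.steampowered.com)"""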
    _VALID_URL = r"""http://store\.steampowered\.com/
                (?P<urltype>video|app)/ #If the page is only for videos or for a game
                (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
    _VIDEO_PAGE_TEMPLATE = 'http://store.steampowered.com/video/%s/'
    _AGECHECK_TEMPLATE = 'http://store.steampowered.com/agecheck/video/%s/?snr=1_agecheck_agecheck__age-gate&ageDay=1&ageMonth=January&ageYear=1970'

    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url, re.VERBOSE)
        gameID = m.group('gameID')

        videourl = self._VIDEO_PAGE_TEMPLATE % gameID
        webpage = self._download_webpage(videourl, gameID)

        if re.search('<h2>Please enter your birth date to continue:</h2>', webpage) is not None:
            videourl = self._AGECHECK_TEMPLATE % gameID
            self.report_age_confirmation()
            webpage = self._download_webpage(videourl, gameID)

        self.report_extraction(gameID)
        game_title = self._html_search_regex(r'<h2 class="pageheader">(.*?)</h2>',
            webpage, 'game title')

        urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
        mweb = re.finditer(urlRE, webpage)
        namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
        titles = re.finditer(namesRE, webpage)
        thumbsRE = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'
        thumbs = re.finditer(thumbsRE, webpage)
        for vid, vtitle, thumb in zip(mweb, titles, thumbs):
            video_id = vid.group('videoID')
            title = vtitle.group('videoName')
            video_url = vid.group('videoURL')
            video_thumb = thumb.group('thumbnail')
                raise ExtractorError(u'Cannot find video url for %s' % video_id)
                'title': unescapeHTML(title),
                'thumbnail': video_thumb
        return [self.playlist_result(videos, gameID, game_title)]

class UstreamIE(InfoExtractor):
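    """Information extractor for recorded ustream.tv videos"""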
    _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
    IE_NAME = u'ustream'

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url)
        video_id = m.group('videoID')

        video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
        webpage = self._download_webpage(url, video_id)

        self.report_extraction(video_id)

        video_title = self._html_search_regex(r'data-title="(?P<title>.+)"',

        uploader = self._html_search_regex(r'data-content-type="channel".*?>(?P<uploader>.*?)</a>',
            webpage, u'uploader', fatal=False, flags=re.DOTALL)

        thumbnail = self._html_search_regex(r'<link rel="image_src" href="(?P<thumb>.*?)"',
            webpage, u'thumbnail', fatal=False)

            'title': video_title,
            'uploader': uploader,
            'thumbnail': thumbnail,

class WorldStarHipHopIE(InfoExtractor):
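    """Information extractor for worldstarhiphop.com"""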
    _VALID_URL = r'https?://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
    IE_NAME = u'WorldStarHipHop'

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url)
        video_id = m.group('id')

        webpage_src = self._download_webpage(url, video_id)

        video_url = self._search_regex(r'so\.addVariable\("file","(.*?)"\)',
            webpage_src, u'video URL')

        if 'mp4' in video_url:

        video_title = self._html_search_regex(r"<title>(.*)</title>",
            webpage_src, u'title')

        # Get the thumbnail; if it is missing, this is a WSHH candy video and the correct title is extracted below.
        thumbnail = self._html_search_regex(r'rel="image_src" href="(.*)" />',
            webpage_src, u'thumbnail', fatal=False)
            _title = r"""candytitles.*>(.*)</span>"""
            mobj = re.search(_title, webpage_src)
            if mobj is not None:
                video_title = mobj.group(1)

            'title': video_title,
            'thumbnail': thumbnail,

class RBMARadioIE(InfoExtractor):
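    """Information extractor for rbmaradio.com shows"""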
    _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url)
        video_id = m.group('videoID')

        webpage = self._download_webpage(url, video_id)

        json_data = self._search_regex(r'window\.gon.*?gon\.show=(.+?);$',
            webpage, u'json data', flags=re.MULTILINE)

            data = json.loads(json_data)
        except ValueError as e:
            raise ExtractorError(u'Invalid JSON: ' + str(e))

        video_url = data['akamai_url'] + '&cbr=256'
        url_parts = compat_urllib_parse_urlparse(video_url)
        video_ext = url_parts.path.rpartition('.')[2]
            'title': data['title'],
            'description': data.get('teaser_text'),
            'location': data.get('country_of_origin'),
            'uploader': data.get('host', {}).get('name'),
            'uploader_id': data.get('host', {}).get('slug'),
            'thumbnail': data.get('image', {}).get('large_url_2x'),
            'duration': data.get('duration'),

class YouPornIE(InfoExtractor):
    """Information extractor for youporn.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'

    def _print_formats(self, formats):
        """Print all available formats"""
        print(u'Available formats:')
        print(u'ext\t\tformat')
        print(u'---------------------------------')
        for format in formats:
            print(u'%s\t\t%s' % (format['ext'], format['format']))

    def _specific(self, req_format, formats):
            if x["format"] == req_format:

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('videoid')

        req = compat_urllib_request.Request(url)
        req.add_header('Cookie', 'age_verified=1')
        webpage = self._download_webpage(req, video_id)

        # Get JSON parameters
        json_params = self._search_regex(r'var currentVideo = new Video\((.*)\);', webpage, u'JSON parameters')
            params = json.loads(json_params)
            raise ExtractorError(u'Invalid JSON')

        self.report_extraction(video_id)
            video_title = params['title']
            upload_date = unified_strdate(params['release_date_f'])
            video_description = params['description']
            video_uploader = params['submitted_by']
            thumbnail = params['thumbnails'][0]['image']
            raise ExtractorError('Missing JSON parameter: ' + sys.exc_info()[1])

        # Get all of the formats available
        DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
        download_list_html = self._search_regex(DOWNLOAD_LIST_RE,
            webpage, u'download list').strip()

        # Get all of the links from the page
        LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
        links = re.findall(LINK_RE, download_list_html)
        if len(links) == 0:
            raise ExtractorError(u'No known formats available for video')

        self.to_screen(u'Links found: %d' % len(links))

            # A link looks like this:
            # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
            # A path looks like this:
            # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
            video_url = unescapeHTML(link)
            path = compat_urllib_parse_urlparse(video_url).path
            extension = os.path.splitext(path)[1][1:]
            format = path.split('/')[4].split('_')[:2]
            format = "-".join(format)
            # title = u'%s-%s-%s' % (video_title, size, bitrate)

                'uploader': video_uploader,
                'upload_date': upload_date,
                'title': video_title,
                'thumbnail': thumbnail,
                'description': video_description

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)

        req_format = self._downloader.params.get('format', None)
        self.to_screen(u'Format: %s' % req_format)

        if req_format is None or req_format == 'best':
        elif req_format == 'worst':
            return [formats[-1]]
        elif req_format in ('-1', 'all'):
            format = self._specific(req_format, formats)
                raise ExtractorError(u'Requested format not available')

class PornotubeIE(InfoExtractor):
    """Information extractor for pornotube.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('videoid')
        video_title = mobj.group('title')

        # Get webpage content
        webpage = self._download_webpage(url, video_id)

        VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
        video_url = self._search_regex(VIDEO_URL_RE, webpage, u'video url')
        video_url = compat_urllib_parse.unquote(video_url)

        # Get the upload date
        VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
        upload_date = self._html_search_regex(VIDEO_UPLOADED_RE, webpage, u'upload date', fatal=False)
        if upload_date:
            upload_date = unified_strdate(upload_date)

        info = {'id': video_id,
                'upload_date': upload_date,
                'title': video_title,

class YouJizzIE(InfoExtractor):
    """Information extractor for youjizz.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('videoid')

        # Get webpage content
        webpage = self._download_webpage(url, video_id)

        # Get the video title
        video_title = self._html_search_regex(r'<title>(?P<title>.*)</title>',
            webpage, u'title').strip()

        # Get the embed page
        result = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
            raise ExtractorError(u'Unable to extract embed page')

        embed_page_url = result.group(0).strip()
        video_id = result.group('videoid')

        webpage = self._download_webpage(embed_page_url, video_id)

        video_url = self._search_regex(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);',
            webpage, u'video URL')

        info = {'id': video_id,
                'title': video_title,
                'player_url': embed_page_url}

class EightTracksIE(InfoExtractor):
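    """Information extractor for 8tracks.com mixes"""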
    _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
            raise ExtractorError(u'Invalid URL: %s' % url)
        playlist_id = mobj.group('id')

        webpage = self._download_webpage(url, playlist_id)

        json_like = self._search_regex(r"PAGE.mix = (.*?);\n", webpage, u'trax information', flags=re.DOTALL)
        data = json.loads(json_like)

        session = str(random.randint(0, 1000000000))
        track_count = data['tracks_count']
        first_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
        next_url = first_url
        for i in itertools.count():
            api_json = self._download_webpage(next_url, playlist_id,
                note=u'Downloading song information %s/%s' % (str(i+1), track_count),
                errnote=u'Failed to download song information')
            api_data = json.loads(api_json)
            track_data = api_data[u'set']['track']
                'id': track_data['id'],
                'url': track_data['track_file_stream_url'],
                'title': track_data['performer'] + u' - ' + track_data['name'],
                'raw_title': track_data['name'],
                'uploader_id': data['user']['login'],
            if api_data['set']['at_last_track']:
            next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])

class KeekIE(InfoExtractor):
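    """Information extractor for keek.com"""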
    _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url)
        video_id = m.group('videoID')

        video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
        thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id
        webpage = self._download_webpage(url, video_id)

        video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',

        uploader = self._html_search_regex(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>',
            webpage, u'uploader', fatal=False)

            'title': video_title,
            'thumbnail': thumbnail,
            'uploader': uploader

class TEDIE(InfoExtractor):
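    """Information extractor for ted.com talks and playlists"""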
    _VALID_URL = r'''http://www\.ted\.com/
                   ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
                   ((?P<type_talk>talks)) # We have a simple talk
                   (/lang/(.*?))? # The url may contain the language
                   /(?P<name>\w+) # Here goes the name and then ".html"

    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url, re.VERBOSE)
        if m.group('type_talk'):
            return [self._talk_info(url)]
            playlist_id = m.group('playlist_id')
            name = m.group('name')
            self.to_screen(u'Getting info of playlist %s: "%s"' % (playlist_id, name))
            return [self._playlist_videos_info(url, name, playlist_id)]

    def _playlist_videos_info(self, url, name, playlist_id=0):
        '''Returns the videos of the playlist'''
                     <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
                     ([.\s]*?)data-playlist_item_id="(\d+)"
                     ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
        video_name_RE = r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
        webpage = self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
        m_videos = re.finditer(video_RE, webpage, re.VERBOSE)
        m_names = re.finditer(video_name_RE, webpage)

        playlist_title = self._html_search_regex(r'div class="headline">\s*?<h1>\s*?<span>(.*?)</span>',
            webpage, 'playlist title')

        playlist_entries = []
        for m_video, m_name in zip(m_videos, m_names):
            video_id = m_video.group('video_id')
            talk_url = 'http://www.ted.com%s' % m_name.group('talk_url')
            playlist_entries.append(self.url_result(talk_url, 'TED'))
        return self.playlist_result(playlist_entries, playlist_id=playlist_id, playlist_title=playlist_title)

    def _talk_info(self, url, video_id=0):
        """Return the video for the talk in the url"""
        m = re.match(self._VALID_URL, url, re.VERBOSE)
        video_name = m.group('name')
        webpage = self._download_webpage(url, video_id, 'Downloading "%s" page' % video_name)
        self.report_extraction(video_name)
        # If the url includes the language we get the title translated
        title = self._html_search_regex(r'<span id="altHeadline" >(?P<title>.*)</span>',
        json_data = self._search_regex(r'<script.*?>var talkDetails = ({.*?})</script>',
            webpage, 'json data')
        info = json.loads(json_data)
        desc = self._html_search_regex(r'<div class="talk-intro">.*?<p.*?>(.*?)</p>',
            webpage, 'description', flags=re.DOTALL)

        thumbnail = self._search_regex(r'</span>[\s.]*</div>[\s.]*<img src="(.*?)"',
            webpage, 'thumbnail')
            'url': info['htmlStreams'][-1]['file'],
            'thumbnail': thumbnail,
            'description': desc,

class MySpassIE(InfoExtractor):
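    """Information extractor for myspass.de"""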
    _VALID_URL = r'http://www.myspass.de/.*'

    def _real_extract(self, url):
        META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'

        # video id is the last path element of the URL
        # usually there is a trailing slash, so also try the second-to-last path element
        url_path = compat_urllib_parse_urlparse(url).path
        url_parent_path, video_id = os.path.split(url_path)
            _, video_id = os.path.split(url_parent_path)

        metadata_url = META_DATA_URL_TEMPLATE % video_id
        metadata_text = self._download_webpage(metadata_url, video_id)
        metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))

        # extract values from metadata
        url_flv_el = metadata.find('url_flv')
        if url_flv_el is None:
            raise ExtractorError(u'Unable to extract download url')
        video_url = url_flv_el.text
        extension = os.path.splitext(video_url)[1][1:]
        title_el = metadata.find('title')
        if title_el is None:
            raise ExtractorError(u'Unable to extract title')
        title = title_el.text
        format_id_el = metadata.find('format_id')
        if format_id_el is None:
            format = format_id_el.text
        description_el = metadata.find('description')
        if description_el is not None:
            description = description_el.text
        imagePreview_el = metadata.find('imagePreview')
        if imagePreview_el is not None:
            thumbnail = imagePreview_el.text
            'thumbnail': thumbnail,
            'description': description

class SpiegelIE(InfoExtractor):
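    """Information extractor for spiegel.de videos"""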
    _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url)
        video_id = m.group('videoID')

        webpage = self._download_webpage(url, video_id)

        video_title = self._html_search_regex(r'<div class="module-title">(.*?)</div>',

        xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
        xml_code = self._download_webpage(xml_url, video_id,
            note=u'Downloading XML', errnote=u'Failed to download XML')

        idoc = xml.etree.ElementTree.fromstring(xml_code)
        last_type = idoc[-1]
        filename = last_type.findall('./filename')[0].text
        duration = float(last_type.findall('./duration')[0].text)

        video_url = 'http://video2.spiegel.de/flash/' + filename
        video_ext = filename.rpartition('.')[2]
            'title': video_title,
            'duration': duration,

class LiveLeakIE(InfoExtractor):
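    """Information extractor for liveleak.com"""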
    _VALID_URL = r'^(?:http?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
    IE_NAME = u'liveleak'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('video_id')

        webpage = self._download_webpage(url, video_id)

        video_url = self._search_regex(r'file: "(.*?)",',
            webpage, u'video URL')

        video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
            webpage, u'title').replace('LiveLeak.com -', '').strip()

        video_description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
            webpage, u'description', fatal=False)

        video_uploader = self._html_search_regex(r'By:.*?(\w+)</a>',
            webpage, u'uploader', fatal=False)

            'title': video_title,
            'description': video_description,
            'uploader': video_uploader

class TumblrIE(InfoExtractor):
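    """Information extractor for videos posted on tumblr.com blogs"""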
    _VALID_URL = r'http://(?P<blog_name>.*?)\.tumblr\.com/((post)|(video))/(?P<id>\d*)/(.*?)'

    def _real_extract(self, url):
        m_url = re.match(self._VALID_URL, url)
        video_id = m_url.group('id')
        blog = m_url.group('blog_name')

        url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id)
        webpage = self._download_webpage(url, video_id)

        re_video = r'src=\\x22(?P<video_url>http://%s\.tumblr\.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P<ext>.*?)\\x22' % (blog, video_id)
        video = re.search(re_video, webpage)
            raise ExtractorError(u'Unable to extract video')
        video_url = video.group('video_url')
        ext = video.group('ext')

        video_thumbnail = self._search_regex(r'posters(.*?)\[\\x22(?P<thumb>.*?)\\x22',
            webpage, u'thumbnail', fatal=False)  # We pick the first poster
        if video_thumbnail:
            video_thumbnail = video_thumbnail.replace('\\', '')

        # The only place where a title can be found; it is not complete,
        # but searching in other places doesn't work for all videos.
        video_title = self._html_search_regex(r'<title>(?P<title>.*?)</title>',
            webpage, u'title', flags=re.DOTALL)

        return [{'id': video_id,
                 'title': video_title,
                 'thumbnail': video_thumbnail,

class BandcampIE(InfoExtractor):
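    """Information extractor for freely downloadable Bandcamp tracks"""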
    _VALID_URL = r'http://.*?\.bandcamp\.com/track/(?P<title>.*)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        title = mobj.group('title')
        webpage = self._download_webpage(url, title)
        # We get the link to the free download page
        m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage)
        if m_download is None:
            raise ExtractorError(u'No free songs found')

        download_link = m_download.group(1)
        id = re.search(r'var TralbumData = {(.*?)id: (?P<id>\d*?)$',
            webpage, re.MULTILINE|re.DOTALL).group('id')

        download_webpage = self._download_webpage(download_link, id,
            'Downloading free downloads page')
        # We get the dictionary of the track from some javascript code
        info = re.search(r'items: (.*?),$',
            download_webpage, re.MULTILINE).group(1)
        info = json.loads(info)[0]
        # We pick mp3-320 for now, until format selection can be easily implemented.
        mp3_info = info[u'downloads'][u'mp3-320']
        # If we try to use this url it says the link has expired
        initial_url = mp3_info[u'url']
        re_url = r'(?P<server>http://(.*?)\.bandcamp\.com)/download/track\?enc=mp3-320&fsig=(?P<fsig>.*?)&id=(?P<id>.*?)&ts=(?P<ts>.*)$'
        m_url = re.match(re_url, initial_url)
        # We build the url we will use to get the final track url
        # This url is built by Bandcamp in the script download_bunde_*.js
        request_url = '%s/statdownload/track?enc=mp3-320&fsig=%s&id=%s&ts=%s&.rand=665028774616&.vrs=1' % (m_url.group('server'), m_url.group('fsig'), id, m_url.group('ts'))
        final_url_webpage = self._download_webpage(request_url, id, 'Requesting download url')
        # If we could correctly generate the .rand field the url would be
        # in the "download_url" key
        final_url = re.search(r'"retry_url":"(.*?)"', final_url_webpage).group(1)

        track_info = {'id': id,
                      'title': info[u'title'],
                      'thumbnail': info[u'thumb_url'],
                      'uploader': info[u'artist']

class RedTubeIE(InfoExtractor):
    """Information Extractor for redtube"""
    _VALID_URL = r'(?:http://)?(?:www\.)?redtube\.com/(?P<id>[0-9]+)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')
        video_extension = 'mp4'
        webpage = self._download_webpage(url, video_id)

        self.report_extraction(video_id)

        video_url = self._html_search_regex(r'<source src="(.+?)" type="video/mp4">',
            webpage, u'video URL')

        video_title = self._html_search_regex('<h1 class="videoTitle slidePanelMovable">(.+?)</h1>',
            'ext': video_extension,
            'title': video_title,

class InaIE(InfoExtractor):
    """Information Extractor for Ina.fr"""
    _VALID_URL = r'(?:http://)?(?:www\.)?ina\.fr/video/(?P<id>I[0-9]+)/.*'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        video_id = mobj.group('id')
        mrss_url = 'http://player.ina.fr/notices/%s.mrss' % video_id
        video_extension = 'mp4'
        webpage = self._download_webpage(mrss_url, video_id)

        self.report_extraction(video_id)

        video_url = self._html_search_regex(r'<media:player url="(?P<mp4url>http://mp4.ina.fr/[^"]+\.mp4)',
            webpage, u'video URL')

        video_title = self._search_regex(r'<title><!\[CDATA\[(?P<titre>.*?)]]></title>',
            'ext': video_extension,
            'title': video_title,

class HowcastIE(InfoExtractor):
    """Information Extractor for Howcast.com"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?howcast\.com/videos/(?P<id>\d+)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)

        video_id = mobj.group('id')
        webpage_url = 'http://www.howcast.com/videos/' + video_id
        webpage = self._download_webpage(webpage_url, video_id)

        self.report_extraction(video_id)

        video_url = self._search_regex(r'\'?file\'?: "(http://mobile-media\.howcast\.com/[0-9]+\.mp4)',
            webpage, u'video URL')

        video_title = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') property=\'og:title\'',

        video_description = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') name=\'description\'',
            webpage, u'description', fatal=False)

        thumbnail = self._html_search_regex(r'<meta content=\'(.+?)\' property=\'og:image\'',
            webpage, u'thumbnail', fatal=False)

            'title': video_title,
            'description': video_description,
            'thumbnail': thumbnail,

class VineIE(InfoExtractor):
    """Information Extractor for Vine.co"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?vine\.co/v/(?P<id>\w+)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)

        video_id = mobj.group('id')
        webpage_url = 'https://vine.co/v/' + video_id
        webpage = self._download_webpage(webpage_url, video_id)

        self.report_extraction(video_id)

        video_url = self._html_search_regex(r'<meta property="twitter:player:stream" content="(.+?)"',
            webpage, u'video URL')

        video_title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',

        thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)(\?.*?)?"',
            webpage, u'thumbnail', fatal=False)

        uploader = self._html_search_regex(r'<div class="user">.*?<h2>(.+?)</h2>',
            webpage, u'uploader', fatal=False, flags=re.DOTALL)

            'title': video_title,
            'thumbnail': thumbnail,
            'uploader': uploader,

class FlickrIE(InfoExtractor):
    """Information Extractor for Flickr videos"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?flickr\.com/photos/(?P<uploader_id>[\w\-_@]+)/(?P<id>\d+).*'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)

        video_id = mobj.group('id')
        video_uploader_id = mobj.group('uploader_id')
        webpage_url = 'http://www.flickr.com/photos/' + video_uploader_id + '/' + video_id
        webpage = self._download_webpage(webpage_url, video_id)

        secret = self._search_regex(r"photo_secret: '(\w+)'", webpage, u'secret')

        first_url = 'https://secure.flickr.com/apps/video/video_mtl_xml.gne?v=x&photo_id=' + video_id + '&secret=' + secret + '&bitrate=700&target=_self'
        first_xml = self._download_webpage(first_url, video_id, 'Downloading first data webpage')

        node_id = self._html_search_regex(r'<Item id="id">(\d+-\d+)</Item>',
            first_xml, u'node_id')

        second_url = 'https://secure.flickr.com/video_playlist.gne?node_id=' + node_id + '&tech=flash&mode=playlist&bitrate=700&secret=' + secret + '&rd=video.yahoo.com&noad=1'
        second_xml = self._download_webpage(second_url, video_id, 'Downloading second data webpage')

        self.report_extraction(video_id)

        mobj = re.search(r'<STREAM APP="(.+?)" FULLPATH="(.+?)"', second_xml)
            raise ExtractorError(u'Unable to extract video url')
        video_url = mobj.group(1) + unescapeHTML(mobj.group(2))

        video_title = self._html_search_regex(r'<meta property="og:title" content=(?:"([^"]+)"|\'([^\']+)\')',
            webpage, u'video title')

        video_description = self._html_search_regex(r'<meta property="og:description" content=(?:"([^"]+)"|\'([^\']+)\')',
            webpage, u'description', fatal=False)

        thumbnail = self._html_search_regex(r'<meta property="og:image" content=(?:"([^"]+)"|\'([^\']+)\')',
            webpage, u'thumbnail', fatal=False)

            'title': video_title,
            'description': video_description,
            'thumbnail': thumbnail,
            'uploader_id': video_uploader_id,

class TeamcocoIE(InfoExtractor):
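    """Information extractor for teamcoco.com"""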
    _VALID_URL = r'http://teamcoco\.com/video/(?P<url_title>.*)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
            raise ExtractorError(u'Invalid URL: %s' % url)
        url_title = mobj.group('url_title')
        webpage = self._download_webpage(url, url_title)

        video_id = self._html_search_regex(r'<article class="video" data-id="(\d+?)"',
            webpage, u'video id')

        self.report_extraction(video_id)

        video_title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',

        thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)"',
            webpage, u'thumbnail', fatal=False)

        video_description = self._html_search_regex(r'<meta property="og:description" content="(.*?)"',
            webpage, u'description', fatal=False)

        data_url = 'http://teamcoco.com/cvp/2.0/%s.xml' % video_id
        data = self._download_webpage(data_url, video_id, 'Downloading data webpage')

        video_url = self._html_search_regex(r'<file type="high".*?>(.*?)</file>',
            'title': video_title,
            'thumbnail': thumbnail,
            'description': video_description,

class XHamsterIE(InfoExtractor):
    """Information Extractor for xHamster"""
    _VALID_URL = r'(?:http://)?(?:www.)?xhamster\.com/movies/(?P<id>[0-9]+)/.*\.html'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)

        video_id = mobj.group('id')
        mrss_url = 'http://xhamster.com/movies/%s/.html' % video_id
        webpage = self._download_webpage(mrss_url, video_id)

        mobj = re.search(r'\'srv\': \'(?P<server>[^\']*)\',\s*\'file\': \'(?P<file>[^\']+)\',', webpage)
            raise ExtractorError(u'Unable to extract media URL')
        if len(mobj.group('server')) == 0:
            video_url = compat_urllib_parse.unquote(mobj.group('file'))
            video_url = mobj.group('server') + '/key=' + mobj.group('file')
        video_extension = video_url.split('.')[-1]

        video_title = self._html_search_regex(r'<title>(?P<title>.+?) - xHamster\.com</title>',

        # Can't see the description anywhere in the UI
        # video_description = self._html_search_regex(r'<span>Description: </span>(?P<description>[^<]+)',
        # webpage, u'description', fatal=False)
        # if video_description: video_description = unescapeHTML(video_description)

        mobj = re.search(r'hint=\'(?P<upload_date_Y>[0-9]{4})-(?P<upload_date_m>[0-9]{2})-(?P<upload_date_d>[0-9]{2}) [0-9]{2}:[0-9]{2}:[0-9]{2} [A-Z]{3,4}\'', webpage)
            video_upload_date = mobj.group('upload_date_Y') + mobj.group('upload_date_m') + mobj.group('upload_date_d')
            video_upload_date = None
            self._downloader.report_warning(u'Unable to extract upload date')

        video_uploader_id = self._html_search_regex(r'<a href=\'/user/[^>]+>(?P<uploader_id>[^<]+)',
            webpage, u'uploader id', default=u'anonymous')

        video_thumbnail = self._search_regex(r'\'image\':\'(?P<thumbnail>[^\']+)\'',
            webpage, u'thumbnail', fatal=False)

            'ext': video_extension,
            'title': video_title,
            # 'description': video_description,
            'upload_date': video_upload_date,
            'uploader_id': video_uploader_id,
            'thumbnail': video_thumbnail

class HypemIE(InfoExtractor):
    """Information Extractor for hypem"""
    _VALID_URL = r'(?:http://)?(?:www\.)?hypem\.com/track/([^/]+)/([^/]+)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
            raise ExtractorError(u'Invalid URL: %s' % url)
        track_id = mobj.group(1)

        data = {'ax': 1, 'ts': time.time()}
        data_encoded = compat_urllib_parse.urlencode(data)
        complete_url = url + "?" + data_encoded
        request = compat_urllib_request.Request(complete_url)
        response, urlh = self._download_webpage_handle(request, track_id, u'Downloading webpage with the url')
        cookie = urlh.headers.get('Set-Cookie', '')

        self.report_extraction(track_id)

        html_tracks = self._html_search_regex(r'<script type="application/json" id="displayList-data">(.*?)</script>',
            response, u'tracks', flags=re.MULTILINE|re.DOTALL).strip()
            track_list = json.loads(html_tracks)
            track = track_list[u'tracks'][0]
            raise ExtractorError(u'Hypemachine contained invalid JSON.')

        track_id = track[u"id"]
        artist = track[u"artist"]
        title = track[u"song"]

        serve_url = "http://hypem.com/serve/source/%s/%s" % (compat_str(track_id), compat_str(key))
        request = compat_urllib_request.Request(serve_url, "", {'Content-Type': 'application/json'})
        request.add_header('cookie', cookie)
        song_data_json = self._download_webpage(request, track_id, u'Downloading metadata')
            song_data = json.loads(song_data_json)
            raise ExtractorError(u'Hypemachine contained invalid JSON.')
        final_url = song_data[u"url"]

class Vbox7IE(InfoExtractor):
    """Information Extractor for Vbox7"""
    _VALID_URL = r'(?:http://)?(?:www\.)?vbox7\.com/play:([^/]+)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group(1)

        redirect_page, urlh = self._download_webpage_handle(url, video_id)
        new_location = self._search_regex(r'window\.location = \'(.*)\';', redirect_page, u'redirect location')
        redirect_url = urlh.geturl() + new_location
        webpage = self._download_webpage(redirect_url, video_id, u'Downloading redirect page')

        title = self._html_search_regex(r'<title>(.*)</title>',
            webpage, u'title').split('/')[0].strip()

        info_url = "http://vbox7.com/play/magare.do"
        data = compat_urllib_parse.urlencode({'as3': '1', 'vid': video_id})
        info_request = compat_urllib_request.Request(info_url, data)
        info_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
        info_response = self._download_webpage(info_request, video_id, u'Downloading info webpage')
        if info_response is None:
            raise ExtractorError(u'Unable to extract the media url')
        (final_url, thumbnail_url) = map(lambda x: x.split('=')[1], info_response.split('&'))

            'thumbnail': thumbnail_url,

def gen_extractors():
    """ Return a list with an instance of every supported extractor.
        The order does matter; the first extractor matched is the one handling the URL.
        YoutubePlaylistIE(),
        StanfordOpenClassroomIE(),
        WorldStarHipHopIE(),

def get_info_extractor(ie_name):
    """Returns the info extractor class with the given ie_name"""
    return globals()[ie_name + 'IE']
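
# Illustrative sketch, not part of the original module: it only relies on
# gen_extractors() defined above and on the InfoExtractor.suitable() check
# from .extractor.common. The helper name _example_pick_extractor is made up
# for this example.
def _example_pick_extractor(url):
    """Return the first extractor instance whose suitable() matches url, or None."""
    for ie in gen_extractors():
        # Order matters: the first matching extractor is the one that handles the URL.
        if ie.suitable(url):
            return ie
    return None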