10 import xml.etree.ElementTree
19 from .extractor.common import InfoExtractor, SearchInfoExtractor
21 from .extractor.ard import ARDIE
22 from .extractor.arte import ArteTvIE
23 from .extractor.bliptv import BlipTVIE, BlipTVUserIE
24 from .extractor.comedycentral import ComedyCentralIE
25 from .extractor.dailymotion import DailymotionIE
26 from .extractor.depositfiles import DepositFilesIE
27 from .extractor.facebook import FacebookIE
28 from .extractor.gametrailers import GametrailersIE
29 from .extractor.generic import GenericIE
30 from .extractor.googleplus import GooglePlusIE
31 from .extractor.googlesearch import GoogleSearchIE
32 from .extractor.metacafe import MetacafeIE
33 from .extractor.myvideo import MyVideoIE
34 from .extractor.statigram import StatigramIE
35 from .extractor.photobucket import PhotobucketIE
36 from .extractor.soundcloud import SoundcloudIE, SoundcloudSetIE
37 from .extractor.vimeo import VimeoIE
38 from .extractor.yahoo import YahooIE, YahooSearchIE
39 from .extractor.youtube import YoutubeIE, YoutubePlaylistIE, YoutubeSearchIE, YoutubeUserIE, YoutubeChannelIE
40 from .extractor.zdf import ZDFIE
69 class EscapistIE(InfoExtractor):
70 """Information extractor for The Escapist """
72 _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
75 def _real_extract(self, url):
76 mobj = re.match(self._VALID_URL, url)
78 raise ExtractorError(u'Invalid URL: %s' % url)
79 showName = mobj.group('showname')
80 videoId = mobj.group('episode')
82 self.report_extraction(videoId)
83 webpage = self._download_webpage(url, videoId)
85 videoDesc = self._html_search_regex('<meta name="description" content="([^"]*)"',
86 webpage, u'description', fatal=False)
88 imgUrl = self._html_search_regex('<meta property="og:image" content="([^"]*)"',
89 webpage, u'thumbnail', fatal=False)
91 playerUrl = self._html_search_regex('<meta property="og:video" content="([^"]*)"',
92 webpage, u'player url')
94 title = self._html_search_regex('<meta name="title" content="([^"]*)"',
95 webpage, u'player url').split(' : ')[-1]
97 configUrl = self._search_regex('config=(.*)$', playerUrl, u'config url')
98 configUrl = compat_urllib_parse.unquote(configUrl)
100 configJSON = self._download_webpage(configUrl, videoId,
101 u'Downloading configuration',
102 u'unable to download configuration')
104 # Technically, it's JavaScript, not JSON
105 configJSON = configJSON.replace("'", '"')
108 config = json.loads(configJSON)
109 except (ValueError,) as err:
110 raise ExtractorError(u'Invalid JSON in configuration file: ' + compat_str(err))
112 playlist = config['playlist']
113 videoUrl = playlist[1]['url']
118 'uploader': showName,
123 'description': videoDesc,
124 'player_url': playerUrl,
129 class CollegeHumorIE(InfoExtractor):
130 """Information extractor for collegehumor.com"""
133 _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
134 IE_NAME = u'collegehumor'
def report_manifest(self, video_id):
    """Announce that the XML manifest for ``video_id`` is being fetched.

    Only emits a progress message through ``self.to_screen``; nothing is
    downloaded here.
    """
    message = u'%s: Downloading XML manifest' % video_id
    self.to_screen(message)
140 def _real_extract(self, url):
141 mobj = re.match(self._VALID_URL, url)
143 raise ExtractorError(u'Invalid URL: %s' % url)
144 video_id = mobj.group('videoid')
152 self.report_extraction(video_id)
153 xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
155 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
156 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
157 raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
159 mdoc = xml.etree.ElementTree.fromstring(metaXml)
161 videoNode = mdoc.findall('./video')[0]
162 info['description'] = videoNode.findall('./description')[0].text
163 info['title'] = videoNode.findall('./caption')[0].text
164 info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
165 manifest_url = videoNode.findall('./file')[0].text
167 raise ExtractorError(u'Invalid metadata XML file')
169 manifest_url += '?hdcore=2.10.3'
170 self.report_manifest(video_id)
172 manifestXml = compat_urllib_request.urlopen(manifest_url).read()
173 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
174 raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
176 adoc = xml.etree.ElementTree.fromstring(manifestXml)
178 media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
179 node_id = media_node.attrib['url']
180 video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
181 except IndexError as err:
182 raise ExtractorError(u'Invalid manifest file')
184 url_pr = compat_urllib_parse_urlparse(manifest_url)
185 url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'
192 class XVideosIE(InfoExtractor):
193 """Information extractor for xvideos.com"""
195 _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
198 def _real_extract(self, url):
199 mobj = re.match(self._VALID_URL, url)
201 raise ExtractorError(u'Invalid URL: %s' % url)
202 video_id = mobj.group(1)
204 webpage = self._download_webpage(url, video_id)
206 self.report_extraction(video_id)
209 video_url = compat_urllib_parse.unquote(self._search_regex(r'flv_url=(.+?)&',
210 webpage, u'video URL'))
213 video_title = self._html_search_regex(r'<title>(.*?)\s+-\s+XVID',
216 # Extract video thumbnail
217 video_thumbnail = self._search_regex(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)',
218 webpage, u'thumbnail', fatal=False)
225 'title': video_title,
227 'thumbnail': video_thumbnail,
236 class InfoQIE(InfoExtractor):
237 """Information extractor for infoq.com"""
238 _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
240 def _real_extract(self, url):
241 mobj = re.match(self._VALID_URL, url)
243 raise ExtractorError(u'Invalid URL: %s' % url)
245 webpage = self._download_webpage(url, video_id=url)
246 self.report_extraction(url)
249 mobj = re.search(r"jsclassref ?= ?'([^']*)'", webpage)
251 raise ExtractorError(u'Unable to extract video url')
252 real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
253 video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id
256 video_title = self._search_regex(r'contentTitle = "(.*?)";',
259 # Extract description
260 video_description = self._html_search_regex(r'<meta name="description" content="(.*)"(?:\s*/)?>',
261 webpage, u'description', fatal=False)
263 video_filename = video_url.split('/')[-1]
264 video_id, extension = video_filename.split('.')
271 'title': video_title,
272 'ext': extension, # Extension is always(?) mp4, but seems to be flv
274 'description': video_description,
279 class MixcloudIE(InfoExtractor):
280 """Information extractor for www.mixcloud.com"""
282 _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
283 _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
284 IE_NAME = u'mixcloud'
def report_download_json(self, file_id):
    """Tell the user that the JSON metadata is being downloaded.

    ``file_id`` is accepted for interface compatibility but is not part of
    the printed message.
    """
    notice = u'Downloading json'
    self.to_screen(notice)
290 def get_urls(self, jsonData, fmt, bitrate='best'):
291 """Get urls from 'audio_formats' section in json"""
294 bitrate_list = jsonData[fmt]
295 if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
296 bitrate = max(bitrate_list) # select highest
298 url_list = jsonData[fmt][bitrate]
299 except TypeError: # we have no bitrate info.
300 url_list = jsonData[fmt]
303 def check_urls(self, url_list):
304 """Returns 1st active url from list"""
307 compat_urllib_request.urlopen(url)
309 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
314 def _print_formats(self, formats):
315 print('Available formats:')
316 for fmt in formats.keys():
317 for b in formats[fmt]:
319 ext = formats[fmt][b][0]
320 print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
321 except TypeError: # we have no bitrate info
322 ext = formats[fmt][0]
323 print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
326 def _real_extract(self, url):
327 mobj = re.match(self._VALID_URL, url)
329 raise ExtractorError(u'Invalid URL: %s' % url)
330 # extract uploader & filename from url
331 uploader = mobj.group(1).decode('utf-8')
332 file_id = uploader + "-" + mobj.group(2).decode('utf-8')
334 # construct API request
335 file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
336 # retrieve .json file with links to files
337 request = compat_urllib_request.Request(file_url)
339 self.report_download_json(file_url)
340 jsonData = compat_urllib_request.urlopen(request).read()
341 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
342 raise ExtractorError(u'Unable to retrieve file: %s' % compat_str(err))
345 json_data = json.loads(jsonData)
346 player_url = json_data['player_swf_url']
347 formats = dict(json_data['audio_formats'])
349 req_format = self._downloader.params.get('format', None)
352 if self._downloader.params.get('listformats', None):
353 self._print_formats(formats)
356 if req_format is None or req_format == 'best':
357 for format_param in formats.keys():
358 url_list = self.get_urls(formats, format_param)
360 file_url = self.check_urls(url_list)
361 if file_url is not None:
364 if req_format not in formats:
365 raise ExtractorError(u'Format is not available')
367 url_list = self.get_urls(formats, req_format)
368 file_url = self.check_urls(url_list)
369 format_param = req_format
372 'id': file_id.decode('utf-8'),
373 'url': file_url.decode('utf-8'),
374 'uploader': uploader.decode('utf-8'),
376 'title': json_data['name'],
377 'ext': file_url.split('.')[-1].decode('utf-8'),
378 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
379 'thumbnail': json_data['thumbnail_url'],
380 'description': json_data['description'],
381 'player_url': player_url.decode('utf-8'),
384 class StanfordOpenClassroomIE(InfoExtractor):
385 """Information extractor for Stanford's Open ClassRoom"""
387 _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
388 IE_NAME = u'stanfordoc'
390 def _real_extract(self, url):
391 mobj = re.match(self._VALID_URL, url)
393 raise ExtractorError(u'Invalid URL: %s' % url)
395 if mobj.group('course') and mobj.group('video'): # A specific video
396 course = mobj.group('course')
397 video = mobj.group('video')
399 'id': course + '_' + video,
404 self.report_extraction(info['id'])
405 baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
406 xmlUrl = baseUrl + video + '.xml'
408 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
409 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
410 raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
411 mdoc = xml.etree.ElementTree.fromstring(metaXml)
413 info['title'] = mdoc.findall('./title')[0].text
414 info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
416 raise ExtractorError(u'Invalid metadata XML file')
417 info['ext'] = info['url'].rpartition('.')[2]
419 elif mobj.group('course'): # A course page
420 course = mobj.group('course')
428 coursepage = self._download_webpage(url, info['id'],
429 note='Downloading course info page',
430 errnote='Unable to download course info page')
432 info['title'] = self._html_search_regex('<h1>([^<]+)</h1>', coursepage, 'title', default=info['id'])
434 info['description'] = self._html_search_regex('<description>([^<]+)</description>',
435 coursepage, u'description', fatal=False)
437 links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
441 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
445 for entry in info['list']:
446 assert entry['type'] == 'reference'
447 results += self.extract(entry['url'])
451 'id': 'Stanford OpenClassroom',
457 self.report_download_webpage(info['id'])
458 rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
460 rootpage = compat_urllib_request.urlopen(rootURL).read()
461 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
462 raise ExtractorError(u'Unable to download course info page: ' + compat_str(err))
464 info['title'] = info['id']
466 links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
470 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
475 for entry in info['list']:
476 assert entry['type'] == 'reference'
477 results += self.extract(entry['url'])
480 class MTVIE(InfoExtractor):
481 """Information extractor for MTV.com"""
483 _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
486 def _real_extract(self, url):
487 mobj = re.match(self._VALID_URL, url)
489 raise ExtractorError(u'Invalid URL: %s' % url)
490 if not mobj.group('proto'):
491 url = 'http://' + url
492 video_id = mobj.group('videoid')
494 webpage = self._download_webpage(url, video_id)
496 song_name = self._html_search_regex(r'<meta name="mtv_vt" content="([^"]+)"/>',
497 webpage, u'song name', fatal=False)
499 video_title = self._html_search_regex(r'<meta name="mtv_an" content="([^"]+)"/>',
502 mtvn_uri = self._html_search_regex(r'<meta name="mtvn_uri" content="([^"]+)"/>',
503 webpage, u'mtvn_uri', fatal=False)
505 content_id = self._search_regex(r'MTVN.Player.defaultPlaylistId = ([0-9]+);',
506 webpage, u'content id', fatal=False)
508 videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
509 self.report_extraction(video_id)
510 request = compat_urllib_request.Request(videogen_url)
512 metadataXml = compat_urllib_request.urlopen(request).read()
513 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
514 raise ExtractorError(u'Unable to download video metadata: %s' % compat_str(err))
516 mdoc = xml.etree.ElementTree.fromstring(metadataXml)
517 renditions = mdoc.findall('.//rendition')
519 # For now, always pick the highest quality.
520 rendition = renditions[-1]
523 _,_,ext = rendition.attrib['type'].partition('/')
524 format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
525 video_url = rendition.find('./src').text
527 raise ExtractorError('Invalid rendition field.')
532 'uploader': performer,
534 'title': video_title,
542 class YoukuIE(InfoExtractor):
543 _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'
546 nowTime = int(time.time() * 1000)
547 random1 = random.randint(1000,1998)
548 random2 = random.randint(1000,9999)
550 return "%d%d%d" %(nowTime,random1,random2)
552 def _get_file_ID_mix_string(self, seed):
554 source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
556 for i in range(len(source)):
557 seed = (seed * 211 + 30031 ) % 65536
558 index = math.floor(seed / 65536 * len(source) )
559 mixed.append(source[int(index)])
560 source.remove(source[int(index)])
561 #return ''.join(mixed)
564 def _get_file_id(self, fileId, seed):
565 mixed = self._get_file_ID_mix_string(seed)
566 ids = fileId.split('*')
570 realId.append(mixed[int(ch)])
571 return ''.join(realId)
573 def _real_extract(self, url):
574 mobj = re.match(self._VALID_URL, url)
576 raise ExtractorError(u'Invalid URL: %s' % url)
577 video_id = mobj.group('ID')
579 info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id
581 jsondata = self._download_webpage(info_url, video_id)
583 self.report_extraction(video_id)
585 config = json.loads(jsondata)
587 video_title = config['data'][0]['title']
588 seed = config['data'][0]['seed']
590 format = self._downloader.params.get('format', None)
591 supported_format = list(config['data'][0]['streamfileids'].keys())
593 if format is None or format == 'best':
594 if 'hd2' in supported_format:
599 elif format == 'worst':
607 fileid = config['data'][0]['streamfileids'][format]
608 keys = [s['k'] for s in config['data'][0]['segs'][format]]
609 except (UnicodeDecodeError, ValueError, KeyError):
610 raise ExtractorError(u'Unable to extract info section')
613 sid = self._gen_sid()
614 fileid = self._get_file_id(fileid, seed)
616 #column 8,9 of fileid represent the segment number
617 #fileid[7:9] should be changed
618 for index, key in enumerate(keys):
620 temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
621 download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)
624 'id': '%s_part%02d' % (video_id, index),
628 'title': video_title,
631 files_info.append(info)
636 class XNXXIE(InfoExtractor):
637 """Information extractor for xnxx.com"""
639 _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
641 VIDEO_URL_RE = r'flv_url=(.*?)&'
642 VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
643 VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&'
645 def _real_extract(self, url):
646 mobj = re.match(self._VALID_URL, url)
648 raise ExtractorError(u'Invalid URL: %s' % url)
649 video_id = mobj.group(1)
651 # Get webpage content
652 webpage = self._download_webpage(url, video_id)
654 video_url = self._search_regex(self.VIDEO_URL_RE,
655 webpage, u'video URL')
656 video_url = compat_urllib_parse.unquote(video_url)
658 video_title = self._html_search_regex(self.VIDEO_TITLE_RE,
661 video_thumbnail = self._search_regex(self.VIDEO_THUMB_RE,
662 webpage, u'thumbnail', fatal=False)
669 'title': video_title,
671 'thumbnail': video_thumbnail,
677 class NBAIE(InfoExtractor):
678 _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*?)(?:/index\.html)?(?:\?.*)?$'
681 def _real_extract(self, url):
682 mobj = re.match(self._VALID_URL, url)
684 raise ExtractorError(u'Invalid URL: %s' % url)
686 video_id = mobj.group(1)
688 webpage = self._download_webpage(url, video_id)
690 video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'
692 shortened_video_id = video_id.rpartition('/')[2]
693 title = self._html_search_regex(r'<meta property="og:title" content="(.*?)"',
694 webpage, 'title', default=shortened_video_id).replace('NBA.com: ', '')
696 # It isn't there in the HTML it returns to us
697 # uploader_date = self._html_search_regex(r'<b>Date:</b> (.*?)</div>', webpage, 'upload_date', fatal=False)
699 description = self._html_search_regex(r'<meta name="description" (?:content|value)="(.*?)" />', webpage, 'description', fatal=False)
702 'id': shortened_video_id,
706 # 'uploader_date': uploader_date,
707 'description': description,
711 class JustinTVIE(InfoExtractor):
712 """Information extractor for justin.tv and twitch.tv"""
713 # TODO: One broadcast may be split into multiple videos. The key
714 # 'broadcast_id' is the same for all parts, and 'broadcast_part'
715 # starts at 1 and increases. Can we treat all parts as one video?
717 _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
719 (?P<channelid>[^/]+)|
720 (?:(?:[^/]+)/b/(?P<videoid>[^/]+))|
721 (?:(?:[^/]+)/c/(?P<chapterid>[^/]+))
725 _JUSTIN_PAGE_LIMIT = 100
726 IE_NAME = u'justin.tv'
def report_download_page(self, channel, offset):
    """Announce the download of one page of video information.

    The message names ``channel`` and the range running from ``offset`` to
    ``offset + self._JUSTIN_PAGE_LIMIT``.
    """
    range_end = offset + self._JUSTIN_PAGE_LIMIT
    self.to_screen(
        u'%s: Downloading video information from %d to %d'
        % (channel, offset, range_end))
733 # Return count of items, list of *valid* items
734 def _parse_page(self, url, video_id):
735 webpage = self._download_webpage(url, video_id,
736 u'Downloading video info JSON',
737 u'unable to download video info JSON')
739 response = json.loads(webpage)
740 if type(response) != list:
741 error_text = response.get('error', 'unknown error')
742 raise ExtractorError(u'Justin.tv API: %s' % error_text)
744 for clip in response:
745 video_url = clip['video_file_url']
747 video_extension = os.path.splitext(video_url)[1][1:]
748 video_date = re.sub('-', '', clip['start_time'][:10])
749 video_uploader_id = clip.get('user_id', clip.get('channel_id'))
750 video_id = clip['id']
751 video_title = clip.get('title', video_id)
755 'title': video_title,
756 'uploader': clip.get('channel_name', video_uploader_id),
757 'uploader_id': video_uploader_id,
758 'upload_date': video_date,
759 'ext': video_extension,
761 return (len(response), info)
763 def _real_extract(self, url):
764 mobj = re.match(self._VALID_URL, url)
766 raise ExtractorError(u'invalid URL: %s' % url)
768 api_base = 'http://api.justin.tv'
770 if mobj.group('channelid'):
772 video_id = mobj.group('channelid')
773 api = api_base + '/channel/archives/%s.json' % video_id
774 elif mobj.group('chapterid'):
775 chapter_id = mobj.group('chapterid')
777 webpage = self._download_webpage(url, chapter_id)
778 m = re.search(r'PP\.archive_id = "([0-9]+)";', webpage)
780 raise ExtractorError(u'Cannot find archive of a chapter')
781 archive_id = m.group(1)
783 api = api_base + '/broadcast/by_chapter/%s.xml' % chapter_id
784 chapter_info_xml = self._download_webpage(api, chapter_id,
785 note=u'Downloading chapter information',
786 errnote=u'Chapter information download failed')
787 doc = xml.etree.ElementTree.fromstring(chapter_info_xml)
788 for a in doc.findall('.//archive'):
789 if archive_id == a.find('./id').text:
792 raise ExtractorError(u'Could not find chapter in chapter information')
794 video_url = a.find('./video_file_url').text
795 video_ext = video_url.rpartition('.')[2] or u'flv'
797 chapter_api_url = u'https://api.twitch.tv/kraken/videos/c' + chapter_id
798 chapter_info_json = self._download_webpage(chapter_api_url, u'c' + chapter_id,
799 note='Downloading chapter metadata',
800 errnote='Download of chapter metadata failed')
801 chapter_info = json.loads(chapter_info_json)
803 bracket_start = int(doc.find('.//bracket_start').text)
804 bracket_end = int(doc.find('.//bracket_end').text)
806 # TODO determine start (and probably fix up file)
807 # youtube-dl -v http://www.twitch.tv/firmbelief/c/1757457
808 #video_url += u'?start=' + TODO:start_timestamp
809 # bracket_start is 13290, but we want 51670615
810 self._downloader.report_warning(u'Chapter detected, but we can just download the whole file. '
811 u'Chapter starts at %s and ends at %s' % (formatSeconds(bracket_start), formatSeconds(bracket_end)))
814 'id': u'c' + chapter_id,
817 'title': chapter_info['title'],
818 'thumbnail': chapter_info['preview'],
819 'description': chapter_info['description'],
820 'uploader': chapter_info['channel']['display_name'],
821 'uploader_id': chapter_info['channel']['name'],
825 video_id = mobj.group('videoid')
826 api = api_base + '/broadcast/by_archive/%s.json' % video_id
828 self.report_extraction(video_id)
832 limit = self._JUSTIN_PAGE_LIMIT
835 self.report_download_page(video_id, offset)
836 page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
837 page_count, page_info = self._parse_page(page_url, video_id)
838 info.extend(page_info)
839 if not paged or page_count != limit:
844 class FunnyOrDieIE(InfoExtractor):
845 _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'
847 def _real_extract(self, url):
848 mobj = re.match(self._VALID_URL, url)
850 raise ExtractorError(u'invalid URL: %s' % url)
852 video_id = mobj.group('id')
853 webpage = self._download_webpage(url, video_id)
855 video_url = self._html_search_regex(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"',
856 webpage, u'video URL', flags=re.DOTALL)
858 title = self._html_search_regex((r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>",
859 r'<title>(?P<title>[^<]+?)</title>'), webpage, 'title', flags=re.DOTALL)
861 video_description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
862 webpage, u'description', fatal=False, flags=re.DOTALL)
869 'description': video_description,
873 class SteamIE(InfoExtractor):
874 _VALID_URL = r"""http://store\.steampowered\.com/
876 (?P<urltype>video|app)/ #If the page is only for videos or for a game
878 (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
880 _VIDEO_PAGE_TEMPLATE = 'http://store.steampowered.com/video/%s/'
881 _AGECHECK_TEMPLATE = 'http://store.steampowered.com/agecheck/video/%s/?snr=1_agecheck_agecheck__age-gate&ageDay=1&ageMonth=January&ageYear=1970'
def suitable(cls, url):
    """Return True when ``url`` matches ``cls._VALID_URL``, otherwise False.

    The pattern is applied with ``re.VERBOSE`` and anchored at the start of
    ``url`` only (``re.match``), so trailing characters are accepted.
    """
    matched = re.match(cls._VALID_URL, url, re.VERBOSE)
    if matched is None:
        return False
    return True
888 def _real_extract(self, url):
889 m = re.match(self._VALID_URL, url, re.VERBOSE)
890 gameID = m.group('gameID')
892 videourl = self._VIDEO_PAGE_TEMPLATE % gameID
893 webpage = self._download_webpage(videourl, gameID)
895 if re.search('<h2>Please enter your birth date to continue:</h2>', webpage) is not None:
896 videourl = self._AGECHECK_TEMPLATE % gameID
897 self.report_age_confirmation()
898 webpage = self._download_webpage(videourl, gameID)
900 self.report_extraction(gameID)
901 game_title = self._html_search_regex(r'<h2 class="pageheader">(.*?)</h2>',
902 webpage, 'game title')
904 urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
905 mweb = re.finditer(urlRE, webpage)
906 namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
907 titles = re.finditer(namesRE, webpage)
908 thumbsRE = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'
909 thumbs = re.finditer(thumbsRE, webpage)
911 for vid,vtitle,thumb in zip(mweb,titles,thumbs):
912 video_id = vid.group('videoID')
913 title = vtitle.group('videoName')
914 video_url = vid.group('videoURL')
915 video_thumb = thumb.group('thumbnail')
917 raise ExtractorError(u'Cannot find video url for %s' % video_id)
922 'title': unescapeHTML(title),
923 'thumbnail': video_thumb
926 return [self.playlist_result(videos, gameID, game_title)]
928 class UstreamIE(InfoExtractor):
929 _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
932 def _real_extract(self, url):
933 m = re.match(self._VALID_URL, url)
934 video_id = m.group('videoID')
936 video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
937 webpage = self._download_webpage(url, video_id)
939 self.report_extraction(video_id)
941 video_title = self._html_search_regex(r'data-title="(?P<title>.+)"',
944 uploader = self._html_search_regex(r'data-content-type="channel".*?>(?P<uploader>.*?)</a>',
945 webpage, u'uploader', fatal=False, flags=re.DOTALL)
947 thumbnail = self._html_search_regex(r'<link rel="image_src" href="(?P<thumb>.*?)"',
948 webpage, u'thumbnail', fatal=False)
954 'title': video_title,
955 'uploader': uploader,
956 'thumbnail': thumbnail,
960 class WorldStarHipHopIE(InfoExtractor):
961 _VALID_URL = r'https?://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
962 IE_NAME = u'WorldStarHipHop'
964 def _real_extract(self, url):
965 m = re.match(self._VALID_URL, url)
966 video_id = m.group('id')
968 webpage_src = self._download_webpage(url, video_id)
970 video_url = self._search_regex(r'so\.addVariable\("file","(.*?)"\)',
971 webpage_src, u'video URL')
973 if 'mp4' in video_url:
978 video_title = self._html_search_regex(r"<title>(.*)</title>",
979 webpage_src, u'title')
981 # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video.
982 thumbnail = self._html_search_regex(r'rel="image_src" href="(.*)" />',
983 webpage_src, u'thumbnail', fatal=False)
986 _title = r"""candytitles.*>(.*)</span>"""
987 mobj = re.search(_title, webpage_src)
989 video_title = mobj.group(1)
994 'title' : video_title,
995 'thumbnail' : thumbnail,
1000 class RBMARadioIE(InfoExtractor):
1001 _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'
1003 def _real_extract(self, url):
1004 m = re.match(self._VALID_URL, url)
1005 video_id = m.group('videoID')
1007 webpage = self._download_webpage(url, video_id)
1009 json_data = self._search_regex(r'window\.gon.*?gon\.show=(.+?);$',
1010 webpage, u'json data', flags=re.MULTILINE)
1013 data = json.loads(json_data)
1014 except ValueError as e:
1015 raise ExtractorError(u'Invalid JSON: ' + str(e))
1017 video_url = data['akamai_url'] + '&cbr=256'
1018 url_parts = compat_urllib_parse_urlparse(video_url)
1019 video_ext = url_parts.path.rpartition('.')[2]
1024 'title': data['title'],
1025 'description': data.get('teaser_text'),
1026 'location': data.get('country_of_origin'),
1027 'uploader': data.get('host', {}).get('name'),
1028 'uploader_id': data.get('host', {}).get('slug'),
1029 'thumbnail': data.get('image', {}).get('large_url_2x'),
1030 'duration': data.get('duration'),
1035 class YouPornIE(InfoExtractor):
1036 """Information extractor for youporn.com."""
1037 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'
1039 def _print_formats(self, formats):
1040 """Print all available formats"""
1041 print(u'Available formats:')
1042 print(u'ext\t\tformat')
1043 print(u'---------------------------------')
1044 for format in formats:
1045 print(u'%s\t\t%s' % (format['ext'], format['format']))
1047 def _specific(self, req_format, formats):
1049 if(x["format"]==req_format):
1053 def _real_extract(self, url):
1054 mobj = re.match(self._VALID_URL, url)
1056 raise ExtractorError(u'Invalid URL: %s' % url)
1057 video_id = mobj.group('videoid')
1059 req = compat_urllib_request.Request(url)
1060 req.add_header('Cookie', 'age_verified=1')
1061 webpage = self._download_webpage(req, video_id)
1063 # Get JSON parameters
1064 json_params = self._search_regex(r'var currentVideo = new Video\((.*)\);', webpage, u'JSON parameters')
1066 params = json.loads(json_params)
1068 raise ExtractorError(u'Invalid JSON')
1070 self.report_extraction(video_id)
1072 video_title = params['title']
1073 upload_date = unified_strdate(params['release_date_f'])
1074 video_description = params['description']
1075 video_uploader = params['submitted_by']
1076 thumbnail = params['thumbnails'][0]['image']
1078 raise ExtractorError('Missing JSON parameter: ' + sys.exc_info()[1])
1080 # Get all of the formats available
1081 DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
1082 download_list_html = self._search_regex(DOWNLOAD_LIST_RE,
1083 webpage, u'download list').strip()
1085 # Get all of the links from the page
1086 LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
1087 links = re.findall(LINK_RE, download_list_html)
1088 if(len(links) == 0):
1089 raise ExtractorError(u'ERROR: no known formats available for video')
1091 self.to_screen(u'Links found: %d' % len(links))
1096 # A link looks like this:
1097 # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
1098 # A path looks like this:
1099 # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
1100 video_url = unescapeHTML( link )
1101 path = compat_urllib_parse_urlparse( video_url ).path
1102 extension = os.path.splitext( path )[1][1:]
1103 format = path.split('/')[4].split('_')[:2]
1106 format = "-".join( format )
1107 # title = u'%s-%s-%s' % (video_title, size, bitrate)
1112 'uploader': video_uploader,
1113 'upload_date': upload_date,
1114 'title': video_title,
1117 'thumbnail': thumbnail,
1118 'description': video_description
1121 if self._downloader.params.get('listformats', None):
1122 self._print_formats(formats)
1125 req_format = self._downloader.params.get('format', None)
1126 self.to_screen(u'Format: %s' % req_format)
1128 if req_format is None or req_format == 'best':
1130 elif req_format == 'worst':
1131 return [formats[-1]]
1132 elif req_format in ('-1', 'all'):
1135 format = self._specific( req_format, formats )
1137 raise ExtractorError(u'Requested format not available')
# PornotubeIE: extractor for pornotube.com video pages.
# The video id and title are taken from the URL itself; the media URL and the
# upload date are scraped from the downloaded page with regexes.
# NOTE(review): this listing elides lines (embedded numbering jumps, e.g.
# 1147 -> 1149 and 1167 -> 1170), so the condition guarding the raise and part
# of the info dict are not visible here.
1142 class PornotubeIE(InfoExtractor):
1143 """Information extractor for pornotube.com."""
# Optional "/c/<channel>" segment, then mandatory "/m/<videoid>" and "/<title>".
1144 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'
# _real_extract(url): builds the info dict for a single video URL.
# Raises ExtractorError with an 'Invalid URL' message (guard line not shown).
1146 def _real_extract(self, url):
1147 mobj = re.match(self._VALID_URL, url)
1149 raise ExtractorError(u'Invalid URL: %s' % url)
1151 video_id = mobj.group('videoid')
# The title is the trailing URL path component, not text from the page.
1152 video_title = mobj.group('title')
1154 # Get webpage content
1155 webpage = self._download_webpage(url, video_id)
# NOTE(review): the dots in "video[0-9].pornotube.com" are unescaped, so they
# match any character - confirm whether that is intended.
1158 VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
1159 video_url = self._search_regex(VIDEO_URL_RE, webpage, u'video url')
# The URL found in the page is percent-decoded before use.
1160 video_url = compat_urllib_parse.unquote(video_url)
1162 #Get the uploaded date
1163 VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
# fatal=False is passed, and the result is only normalised when it is truthy.
1164 upload_date = self._html_search_regex(VIDEO_UPLOADED_RE, webpage, u'upload date', fatal=False)
1165 if upload_date: upload_date = unified_strdate(upload_date)
1167 info = {'id': video_id,
1170 'upload_date': upload_date,
1171 'title': video_title,
# YouJizzIE: extractor for youjizz.com video pages.
# Two pages are downloaded: the video page (for the title and the embed-page
# URL) and then the embed page (for the actual media URL).
# NOTE(review): lines are elided in this listing (e.g. 1182 -> 1184,
# 1196 -> 1198), so the guards before both raises are not visible.
1177 class YouJizzIE(InfoExtractor):
1178 """Information extractor for youjizz.com."""
# NOTE(review): the dot before "html" is unescaped and matches any character.
1179 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'
# _real_extract(url): builds the info dict for a single video URL.
1181 def _real_extract(self, url):
1182 mobj = re.match(self._VALID_URL, url)
1184 raise ExtractorError(u'Invalid URL: %s' % url)
1186 video_id = mobj.group('videoid')
1188 # Get webpage content
1189 webpage = self._download_webpage(url, video_id)
1191 # Get the video title
# NOTE(review): ".*" is greedy, so with several <title> elements on one line
# the match would run to the last closing tag.
1192 video_title = self._html_search_regex(r'<title>(?P<title>.*)</title>',
1193 webpage, u'title').strip()
1195 # Get the embed page
1196 result = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
1198 raise ExtractorError(u'ERROR: unable to extract embed page')
1200 embed_page_url = result.group(0).strip()
# video_id is replaced by the numeric id found in the embed URL.
1201 video_id = result.group('videoid')
# webpage is rebound to the embed page; the original page is no longer used.
1203 webpage = self._download_webpage(embed_page_url, video_id)
# The media URL is the argument of the so.addVariable("file", ...) call.
1206 video_url = self._search_regex(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);',
1207 webpage, u'video URL')
1209 info = {'id': video_id,
1211 'title': video_title,
1214 'player_url': embed_page_url}
# EightTracksIE: extractor for 8tracks.com mixes.
# Reads the "PAGE.mix" JSON from the mix page, then requests tracks one at a
# time from the /sets/<session>/play and /sets/<session>/next endpoints.
# NOTE(review): lines are elided in this listing. In particular `mix_id` is
# used below but its assignment (around original line 1234) is not visible,
# and the statement under the `at_last_track` test is missing.
1218 class EightTracksIE(InfoExtractor):
1220 _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'
# _real_extract(url): collects one entry per track of the mix.
1222 def _real_extract(self, url):
1223 mobj = re.match(self._VALID_URL, url)
1225 raise ExtractorError(u'Invalid URL: %s' % url)
1226 playlist_id = mobj.group('id')
1228 webpage = self._download_webpage(url, playlist_id)
# The mix description is everything between "PAGE.mix = " and the first
# ";\n"; re.DOTALL lets it span several lines.
1230 json_like = self._search_regex(r"PAGE.mix = (.*?);\n", webpage, u'trax information', flags=re.DOTALL)
1231 data = json.loads(json_like)
# A random number is used as the session id in the play/next URLs.
1233 session = str(random.randint(0, 1000000000))
1235 track_count = data['tracks_count']
1236 first_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
1237 next_url = first_url
# Unbounded counter; i is only used for the progress note.
1239 for i in itertools.count():
1240 api_json = self._download_webpage(next_url, playlist_id,
1241 note=u'Downloading song information %s/%s' % (str(i+1), track_count),
1242 errnote=u'Failed to download song information')
1243 api_data = json.loads(api_json)
1244 track_data = api_data[u'set']['track']
1246 'id': track_data['id'],
1247 'url': track_data['track_file_stream_url'],
1248 'title': track_data['performer'] + u' - ' + track_data['name'],
1249 'raw_title': track_data['name'],
1250 'uploader_id': data['user']['login'],
1254 if api_data['set']['at_last_track']:
# The next request names the track just processed via track_id.
1256 next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
# KeekIE: extractor for keek.com.
# The media and thumbnail URLs are built directly from the id in the URL; the
# page is downloaded only for the title and the uploader name.
# NOTE(review): lines are elided in this listing (e.g. 1271 -> 1274), so some
# call continuations and info-dict entries are not visible.
1259 class KeekIE(InfoExtractor):
# Accepts both "keek.com/!<id>" and "keek.com/<user>/keeks/<id>".
1260 _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
# _real_extract(url): builds the info dict for a single keek.
# NOTE(review): no None check on the match is visible before m.group().
1263 def _real_extract(self, url):
1264 m = re.match(self._VALID_URL, url)
1265 video_id = m.group('videoID')
1267 video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
1268 thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id
1269 webpage = self._download_webpage(url, video_id)
1271 video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
# The uploader is the first <h2> after the "user-name-and-bio" div; a missing
# uploader is not fatal.
1274 uploader = self._html_search_regex(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>',
1275 webpage, u'uploader', fatal=False)
1281 'title': video_title,
1282 'thumbnail': thumbnail,
1283 'uploader': uploader
# TEDIE: extractor for ted.com talks and playlists.
#
# _VALID_URL is written as a verbose regex (it carries inline "#" remarks), so
# every match in this class passes re.VERBOSE; suitable() is overridden for
# that reason.
#
# _real_extract(url): when the 'type_talk' group matched, returns a
#   one-element list holding _talk_info(url); otherwise it reads the
#   'playlist_id' and 'name' groups and returns a one-element list holding
#   _playlist_videos_info(...).
# _playlist_videos_info(url, name, playlist_id=0): downloads the playlist
#   page, walks two regex iterators in lockstep with zip() (one over the talk
#   <li> items, one over the "talk-title" links) and wraps each talk URL with
#   url_result(..., 'TED'); the collection is returned via playlist_result().
#   `video_id` is assigned inside the loop but not used in the visible lines.
# _talk_info(url, video_id=0): downloads the talk page, takes the title from
#   the "altHeadline" span, parses the "talkDetails" JSON object, and uses the
#   'file' of the LAST entry of info['htmlStreams'] as the media URL.
#
# NOTE(review): this listing elides lines (embedded numbering jumps, e.g.
# 1288 -> 1290, 1295 -> 1299, 1314 -> 1316), including the lines that open or
# close the triple-quoted regex literals; no comments are added inside the
# body for that reason.
# NOTE(review): inside a character class "." is literal, so "[.\s]*?" and
# "[\s.]*" match only dots and whitespace - confirm that is intended.
1287 class TEDIE(InfoExtractor):
1288 _VALID_URL=r'''http://www\.ted\.com/
1290 ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
1292 ((?P<type_talk>talks)) # We have a simple talk
1294 (/lang/(.*?))? # The url may contain the language
1295 /(?P<name>\w+) # Here goes the name and then ".html"
1299 def suitable(cls, url):
1300 """Receives a URL and returns True if suitable for this IE."""
1301 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
1303 def _real_extract(self, url):
1304 m=re.match(self._VALID_URL, url, re.VERBOSE)
1305 if m.group('type_talk'):
1306 return [self._talk_info(url)]
1308 playlist_id=m.group('playlist_id')
1309 name=m.group('name')
1310 self.to_screen(u'Getting info of playlist %s: "%s"' % (playlist_id,name))
1311 return [self._playlist_videos_info(url,name,playlist_id)]
1313 def _playlist_videos_info(self,url,name,playlist_id=0):
1314 '''Returns the videos of the playlist'''
1316 <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
1317 ([.\s]*?)data-playlist_item_id="(\d+)"
1318 ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
1320 video_name_RE=r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
1321 webpage=self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
1322 m_videos=re.finditer(video_RE,webpage,re.VERBOSE)
1323 m_names=re.finditer(video_name_RE,webpage)
1325 playlist_title = self._html_search_regex(r'div class="headline">\s*?<h1>\s*?<span>(.*?)</span>',
1326 webpage, 'playlist title')
1328 playlist_entries = []
1329 for m_video, m_name in zip(m_videos,m_names):
1330 video_id=m_video.group('video_id')
1331 talk_url='http://www.ted.com%s' % m_name.group('talk_url')
1332 playlist_entries.append(self.url_result(talk_url, 'TED'))
1333 return self.playlist_result(playlist_entries, playlist_id = playlist_id, playlist_title = playlist_title)
1335 def _talk_info(self, url, video_id=0):
1336 """Return the video for the talk in the url"""
1337 m = re.match(self._VALID_URL, url,re.VERBOSE)
1338 video_name = m.group('name')
1339 webpage = self._download_webpage(url, video_id, 'Downloading \"%s\" page' % video_name)
1340 self.report_extraction(video_name)
1341 # If the url includes the language we get the title translated
1342 title = self._html_search_regex(r'<span id="altHeadline" >(?P<title>.*)</span>',
1344 json_data = self._search_regex(r'<script.*?>var talkDetails = ({.*?})</script>',
1345 webpage, 'json data')
1346 info = json.loads(json_data)
1347 desc = self._html_search_regex(r'<div class="talk-intro">.*?<p.*?>(.*?)</p>',
1348 webpage, 'description', flags = re.DOTALL)
1350 thumbnail = self._search_regex(r'</span>[\s.]*</div>[\s.]*<img src="(.*?)"',
1351 webpage, 'thumbnail')
1354 'url': info['htmlStreams'][-1]['file'],
1357 'thumbnail': thumbnail,
1358 'description': desc,
# MySpassIE: extractor for myspass.de.
# The page itself is never downloaded: the video id is taken from the URL path
# and all fields come from an XML metadata document fetched by id.
# NOTE(review): lines are elided in this listing (e.g. 1371 -> 1373,
# 1391 -> 1394, 1397 -> 1400), so the conditions and fallback branches around
# video_id, format, description and thumbnail are not all visible.
1362 class MySpassIE(InfoExtractor):
# NOTE(review): the dots in "www.myspass.de" are unescaped (match any char).
1363 _VALID_URL = r'http://www.myspass.de/.*'
# _real_extract(url): builds the info dict from the metadata XML.
# Raises ExtractorError when <url_flv> or <title> is absent.
1365 def _real_extract(self, url):
1366 META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'
1368 # video id is the last path element of the URL
1369 # usually there is a trailing slash, so also try the second but last
1370 url_path = compat_urllib_parse_urlparse(url).path
1371 url_parent_path, video_id = os.path.split(url_path)
# Fallback to the parent path's last element; its condition is not visible.
1373 _, video_id = os.path.split(url_parent_path)
1376 metadata_url = META_DATA_URL_TEMPLATE % video_id
1377 metadata_text = self._download_webpage(metadata_url, video_id)
# The downloaded text is re-encoded to UTF-8 bytes before XML parsing.
1378 metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))
1380 # extract values from metadata
1381 url_flv_el = metadata.find('url_flv')
1382 if url_flv_el is None:
1383 raise ExtractorError(u'Unable to extract download url')
1384 video_url = url_flv_el.text
# Extension = text after the last dot of the media URL, without the dot.
1385 extension = os.path.splitext(video_url)[1][1:]
1386 title_el = metadata.find('title')
1387 if title_el is None:
1388 raise ExtractorError(u'Unable to extract title')
1389 title = title_el.text
1390 format_id_el = metadata.find('format_id')
1391 if format_id_el is None:
1394 format = format_id_el.text
1395 description_el = metadata.find('description')
1396 if description_el is not None:
1397 description = description_el.text
1400 imagePreview_el = metadata.find('imagePreview')
1401 if imagePreview_el is not None:
1402 thumbnail = imagePreview_el.text
1411 'thumbnail': thumbnail,
1412 'description': description
# SpiegelIE: extractor for spiegel.de videos.
# The title is scraped from the HTML page; file name and duration come from a
# per-video XML document on video2.spiegel.de.
# NOTE(review): lines are elided in this listing (e.g. 1425 -> 1428), so some
# call continuations and info-dict entries are not visible.
1416 class SpiegelIE(InfoExtractor):
# The numeric id is the digits after the last "-" of the final path segment.
1417 _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'
# _real_extract(url): builds the info dict for a single video.
# NOTE(review): no None check on the match is visible before m.group().
1419 def _real_extract(self, url):
1420 m = re.match(self._VALID_URL, url)
1421 video_id = m.group('videoID')
1423 webpage = self._download_webpage(url, video_id)
1425 video_title = self._html_search_regex(r'<div class="module-title">(.*?)</div>',
1428 xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
1429 xml_code = self._download_webpage(xml_url, video_id,
1430 note=u'Downloading XML', errnote=u'Failed to download XML')
1432 idoc = xml.etree.ElementTree.fromstring(xml_code)
# Only the LAST child element of the XML root is used.
1433 last_type = idoc[-1]
1434 filename = last_type.findall('./filename')[0].text
1435 duration = float(last_type.findall('./duration')[0].text)
1437 video_url = 'http://video2.spiegel.de/flash/' + filename
# Extension = text after the last "." of the file name.
1438 video_ext = filename.rpartition('.')[2]
1443 'title': video_title,
1444 'duration': duration,
# LiveLeakIE: extractor for liveleak.com "view?i=<id>" pages.
# Media URL, title, description and uploader are all scraped from the page.
# NOTE(review): lines are elided in this listing (e.g. 1454 -> 1456), so the
# guard before the raise and part of the info dict are not visible.
1448 class LiveLeakIE(InfoExtractor):
# NOTE(review): "http?" makes only the final "p" optional, so this matches
# "htt://" and "http://" but NOT "https://" - "https?" was probably intended.
1450 _VALID_URL = r'^(?:http?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
1451 IE_NAME = u'liveleak'
# _real_extract(url): builds the info dict for a single video.
1453 def _real_extract(self, url):
1454 mobj = re.match(self._VALID_URL, url)
1456 raise ExtractorError(u'Invalid URL: %s' % url)
1458 video_id = mobj.group('video_id')
1460 webpage = self._download_webpage(url, video_id)
1462 video_url = self._search_regex(r'file: "(.*?)",',
1463 webpage, u'video URL')
# The site prefix 'LiveLeak.com -' is removed from the og:title value.
1465 video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
1466 webpage, u'title').replace('LiveLeak.com -', '').strip()
# Description and uploader are optional (fatal=False).
1468 video_description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
1469 webpage, u'description', fatal=False)
1471 video_uploader = self._html_search_regex(r'By:.*?(\w+)</a>',
1472 webpage, u'uploader', fatal=False)
1478 'title': video_title,
1479 'description': video_description,
1480 'uploader': video_uploader
# TumblrIE: extractor for videos embedded in tumblr.com posts.
# The URL is normalised to the canonical /post/<id>/ form, then the media URL,
# extension and first poster image are scraped from \x22-escaped markup in
# the page.
# NOTE(review): lines are elided in this listing (e.g. 1499 -> 1501), so the
# guard before the raise and part of the returned dict are not visible.
1487 class TumblrIE(InfoExtractor):
1488 _VALID_URL = r'http://(?P<blog_name>.*?)\.tumblr\.com/((post)|(video))/(?P<id>\d*)/(.*?)'
# _real_extract(url): returns a one-element list with the info dict.
# NOTE(review): no None check on the URL match is visible before .group().
1490 def _real_extract(self, url):
1491 m_url = re.match(self._VALID_URL, url)
1492 video_id = m_url.group('id')
1493 blog = m_url.group('blog_name')
# The incoming URL is replaced by the canonical post URL before downloading.
1495 url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id)
1496 webpage = self._download_webpage(url, video_id)
# NOTE(review): blog and video_id are interpolated into the pattern without
# re.escape - confirm blog names cannot contain regex metacharacters.
1498 re_video = r'src=\\x22(?P<video_url>http://%s\.tumblr\.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P<ext>.*?)\\x22' % (blog, video_id)
1499 video = re.search(re_video, webpage)
1501 raise ExtractorError(u'Unable to extract video')
1502 video_url = video.group('video_url')
1503 ext = video.group('ext')
1505 video_thumbnail = self._search_regex(r'posters(.*?)\[\\x22(?P<thumb>.*?)\\x22',
1506 webpage, u'thumbnail', fatal=False) # We pick the first poster
# Backslashes are stripped from the thumbnail URL when one was found.
1507 if video_thumbnail: video_thumbnail = video_thumbnail.replace('\\', '')
1509 # The only place where you can get a title, it's not complete,
1510 # but searching in other places doesn't work for all videos
1511 video_title = self._html_search_regex(r'<title>(?P<title>.*?)</title>',
1512 webpage, u'title', flags=re.DOTALL)
1514 return [{'id': video_id,
1516 'title': video_title,
1517 'thumbnail': video_thumbnail,
# BandcampIE: extractor for freely downloadable bandcamp.com tracks.
# Flow: track page -> "freeDownloadPage" link -> download page ("items" JSON)
# -> rewritten /statdownload/ request -> "retry_url" from the response.
# NOTE(review): lines are elided in this listing (e.g. 1558 -> 1561), so part
# of the track_info dict and the return are not visible.
# NOTE(review): several re.search(...).group(...) / re.match(...).group(...)
# calls below have no None check; a page that does not match raises
# AttributeError rather than ExtractorError.
1521 class BandcampIE(InfoExtractor):
1522 _VALID_URL = r'http://.*?\.bandcamp\.com/track/(?P<title>.*)'
# _real_extract(url): builds the info dict for a single track.
# Raises ExtractorError('No free songs found') when the page has no
# freeDownloadPage entry.
1524 def _real_extract(self, url):
1525 mobj = re.match(self._VALID_URL, url)
1526 title = mobj.group('title')
1527 webpage = self._download_webpage(url, title)
1528 # We get the link to the free download page
1529 m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage)
1530 if m_download is None:
1531 raise ExtractorError(u'No free songs found')
1533 download_link = m_download.group(1)
# NOTE(review): the local name `id` shadows the builtin for the rest of the
# method.
1534 id = re.search(r'var TralbumData = {(.*?)id: (?P<id>\d*?)$',
1535 webpage, re.MULTILINE|re.DOTALL).group('id')
1537 download_webpage = self._download_webpage(download_link, id,
1538 'Downloading free downloads page')
1539 # We get the dictionary of the track from some javascrip code
1540 info = re.search(r'items: (.*?),$',
1541 download_webpage, re.MULTILINE).group(1)
# Only the first element of the "items" array is used.
1542 info = json.loads(info)[0]
1543 # We pick mp3-320 for now, until format selection can be easily implemented.
1544 mp3_info = info[u'downloads'][u'mp3-320']
1545 # If we try to use this url it says the link has expired
1546 initial_url = mp3_info[u'url']
1547 re_url = r'(?P<server>http://(.*?)\.bandcamp\.com)/download/track\?enc=mp3-320&fsig=(?P<fsig>.*?)&id=(?P<id>.*?)&ts=(?P<ts>.*)$'
1548 m_url = re.match(re_url, initial_url)
1549 #We build the url we will use to get the final track url
1550 # This url is build in Bandcamp in the script download_bunde_*.js
# server, fsig and ts are reused from initial_url; the id comes from the
# TralbumData match above and ".rand" is a fixed constant.
1551 request_url = '%s/statdownload/track?enc=mp3-320&fsig=%s&id=%s&ts=%s&.rand=665028774616&.vrs=1' % (m_url.group('server'), m_url.group('fsig'), id, m_url.group('ts'))
1552 final_url_webpage = self._download_webpage(request_url, id, 'Requesting download url')
1553 # If we could correctly generate the .rand field the url would be
1554 #in the "download_url" key
1555 final_url = re.search(r'"retry_url":"(.*?)"', final_url_webpage).group(1)
1557 track_info = {'id':id,
1558 'title' : info[u'title'],
1561 'thumbnail' : info[u'thumb_url'],
1562 'uploader' : info[u'artist']
# RedTubeIE: extractor for redtube.com; the media URL is the mp4 <source> of
# the page and the extension is fixed to 'mp4'.
# NOTE(review): lines are elided in this listing (e.g. 1572 -> 1574,
# 1585 -> 1591), so the guard before the raise, the tail of the title lookup
# and part of the info dict are not visible.
1567 class RedTubeIE(InfoExtractor):
1568 """Information Extractor for redtube"""
1569 _VALID_URL = r'(?:http://)?(?:www\.)?redtube\.com/(?P<id>[0-9]+)'
# _real_extract(url): builds the info dict for a single video.
1571 def _real_extract(self,url):
1572 mobj = re.match(self._VALID_URL, url)
1574 raise ExtractorError(u'Invalid URL: %s' % url)
1576 video_id = mobj.group('id')
1577 video_extension = 'mp4'
1578 webpage = self._download_webpage(url, video_id)
1580 self.report_extraction(video_id)
1582 video_url = self._html_search_regex(r'<source src="(.+?)" type="video/mp4">',
1583 webpage, u'video URL')
1585 video_title = self._html_search_regex('<h1 class="videoTitle slidePanelMovable">(.+?)</h1>',
1591 'ext': video_extension,
1592 'title': video_title,
# InaIE: extractor for ina.fr videos.
# Instead of the HTML page, the MRSS feed for the id on player.ina.fr is
# downloaded; the mp4 URL and the CDATA title are read from it.
# NOTE(review): lines are elided in this listing (e.g. 1612 -> 1618), so the
# tail of the title lookup and part of the info dict are not visible.
1595 class InaIE(InfoExtractor):
1596 """Information Extractor for Ina.fr"""
# Ids have the form "I" followed by digits.
1597 _VALID_URL = r'(?:http://)?(?:www\.)?ina\.fr/video/(?P<id>I[0-9]+)/.*'
# _real_extract(url): builds the info dict for a single video.
# NOTE(review): no None check on the match is visible before mobj.group().
1599 def _real_extract(self,url):
1600 mobj = re.match(self._VALID_URL, url)
1602 video_id = mobj.group('id')
1603 mrss_url='http://player.ina.fr/notices/%s.mrss' % video_id
1604 video_extension = 'mp4'
# `webpage` holds the MRSS document, not an HTML page.
1605 webpage = self._download_webpage(mrss_url, video_id)
1607 self.report_extraction(video_id)
1609 video_url = self._html_search_regex(r'<media:player url="(?P<mp4url>http://mp4.ina.fr/[^"]+\.mp4)',
1610 webpage, u'video URL')
1612 video_title = self._search_regex(r'<title><!\[CDATA\[(?P<titre>.*?)]]></title>',
1618 'ext': video_extension,
1619 'title': video_title,
# HowcastIE: extractor for howcast.com videos.
# The page is re-requested under a canonical URL built from the numeric id;
# media URL, title, description and thumbnail are scraped from it.
# NOTE(review): lines are elided in this listing (e.g. 1638 -> 1641), so the
# tail of the title lookup and part of the info dict are not visible.
1622 class HowcastIE(InfoExtractor):
1623 """Information Extractor for Howcast.com"""
1624 _VALID_URL = r'(?:https?://)?(?:www\.)?howcast\.com/videos/(?P<id>\d+)'
# _real_extract(url): builds the info dict for a single video.
# NOTE(review): no None check on the match is visible before mobj.group().
1626 def _real_extract(self, url):
1627 mobj = re.match(self._VALID_URL, url)
1629 video_id = mobj.group('id')
1630 webpage_url = 'http://www.howcast.com/videos/' + video_id
1631 webpage = self._download_webpage(webpage_url, video_id)
1633 self.report_extraction(video_id)
# Only mobile-media .mp4 URLs with a purely numeric file name are accepted.
1635 video_url = self._search_regex(r'\'?file\'?: "(http://mobile-media\.howcast\.com/[0-9]+\.mp4)',
1636 webpage, u'video URL')
# NOTE(review): the title and description patterns have two alternative
# capture groups (double- vs single-quoted); which one the helper returns is
# not visible here - verify against _html_search_regex.
1638 video_title = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') property=\'og:title\'',
1641 video_description = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') name=\'description\'',
1642 webpage, u'description', fatal=False)
1644 thumbnail = self._html_search_regex(r'<meta content=\'(.+?)\' property=\'og:image\'',
1645 webpage, u'thumbnail', fatal=False)
1651 'title': video_title,
1652 'description': video_description,
1653 'thumbnail': thumbnail,
# VineIE: extractor for vine.co videos.
# The page is re-requested over https under a canonical URL built from the
# id; media URL, title, thumbnail and uploader come from its markup.
# NOTE(review): lines are elided in this listing (e.g. 1672 -> 1675), so the
# tail of the title lookup and part of the info dict are not visible.
1656 class VineIE(InfoExtractor):
1657 """Information Extractor for Vine.co"""
1658 _VALID_URL = r'(?:https?://)?(?:www\.)?vine\.co/v/(?P<id>\w+)'
# _real_extract(url): builds the info dict for a single video.
# NOTE(review): no None check on the match is visible before mobj.group().
1660 def _real_extract(self, url):
1661 mobj = re.match(self._VALID_URL, url)
1663 video_id = mobj.group('id')
1664 webpage_url = 'https://vine.co/v/' + video_id
1665 webpage = self._download_webpage(webpage_url, video_id)
1667 self.report_extraction(video_id)
# The media URL is the twitter:player:stream meta value.
1669 video_url = self._html_search_regex(r'<meta property="twitter:player:stream" content="(.+?)"',
1670 webpage, u'video URL')
1672 video_title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',
# The first group stops before an optional "?..." query string, which the
# second group consumes.
1675 thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)(\?.*?)?"',
1676 webpage, u'thumbnail', fatal=False)
# re.DOTALL lets the uploader lookup span line breaks between the div and h2.
1678 uploader = self._html_search_regex(r'<div class="user">.*?<h2>(.+?)</h2>',
1679 webpage, u'uploader', fatal=False, flags=re.DOTALL)
1685 'title': video_title,
1686 'thumbnail': thumbnail,
1687 'uploader': uploader,
# FlickrIE: extractor for Flickr videos.
# Three requests are chained: the photo page (for 'photo_secret' and the og:
# metadata), a first XML document (for the node id) and a second XML playlist
# (for the stream APP and FULLPATH, concatenated into the media URL).
# NOTE(review): lines are elided in this listing (e.g. 1715 -> 1717), so the
# guard before the raise and part of the info dict are not visible.
1690 class FlickrIE(InfoExtractor):
1691 """Information Extractor for Flickr videos"""
1692 _VALID_URL = r'(?:https?://)?(?:www\.)?flickr\.com/photos/(?P<uploader_id>[\w\-_@]+)/(?P<id>\d+).*'
# _real_extract(url): builds the info dict for a single video.
# NOTE(review): no None check on the URL match is visible before .group().
1694 def _real_extract(self, url):
1695 mobj = re.match(self._VALID_URL, url)
1697 video_id = mobj.group('id')
1698 video_uploader_id = mobj.group('uploader_id')
# The page is re-requested under a canonical URL built from uploader and id.
1699 webpage_url = 'http://www.flickr.com/photos/' + video_uploader_id + '/' + video_id
1700 webpage = self._download_webpage(webpage_url, video_id)
1702 secret = self._search_regex(r"photo_secret: '(\w+)'", webpage, u'secret')
1704 first_url = 'https://secure.flickr.com/apps/video/video_mtl_xml.gne?v=x&photo_id=' + video_id + '&secret=' + secret + '&bitrate=700&target=_self'
1705 first_xml = self._download_webpage(first_url, video_id, 'Downloading first data webpage')
1707 node_id = self._html_search_regex(r'<Item id="id">(\d+-\d+)</Item>',
1708 first_xml, u'node_id')
1710 second_url = 'https://secure.flickr.com/video_playlist.gne?node_id=' + node_id + '&tech=flash&mode=playlist&bitrate=700&secret=' + secret + '&rd=video.yahoo.com&noad=1'
1711 second_xml = self._download_webpage(second_url, video_id, 'Downloading second data webpage')
1713 self.report_extraction(video_id)
# mobj is rebound here; the URL match from above is no longer available.
1715 mobj = re.search(r'<STREAM APP="(.+?)" FULLPATH="(.+?)"', second_xml)
1717 raise ExtractorError(u'Unable to extract video url')
# Only FULLPATH is HTML-unescaped; APP is used as found.
1718 video_url = mobj.group(1) + unescapeHTML(mobj.group(2))
1720 video_title = self._html_search_regex(r'<meta property="og:title" content=(?:"([^"]+)"|\'([^\']+)\')',
1721 webpage, u'video title')
1723 video_description = self._html_search_regex(r'<meta property="og:description" content=(?:"([^"]+)"|\'([^\']+)\')',
1724 webpage, u'description', fatal=False)
1726 thumbnail = self._html_search_regex(r'<meta property="og:image" content=(?:"([^"]+)"|\'([^\']+)\')',
1727 webpage, u'thumbnail', fatal=False)
1733 'title': video_title,
1734 'description': video_description,
1735 'thumbnail': thumbnail,
1736 'uploader_id': video_uploader_id,
# TeamcocoIE: extractor for teamcoco.com videos.
# The numeric video id is not in the URL; it is read from the page's
# <article class="video" data-id="..."> and then used to fetch an XML data
# document that holds the media file entry.
# NOTE(review): lines are elided in this listing (e.g. 1743 -> 1745,
# 1766 -> 1773), so the guard before the raise, the tails of two lookups and
# part of the info dict are not visible.
1739 class TeamcocoIE(InfoExtractor):
1740 _VALID_URL = r'http://teamcoco\.com/video/(?P<url_title>.*)'
# _real_extract(url): builds the info dict for a single video.
1742 def _real_extract(self, url):
1743 mobj = re.match(self._VALID_URL, url)
1745 raise ExtractorError(u'Invalid URL: %s' % url)
# Until the numeric id is known, the URL title is passed as the id argument
# of the download call.
1746 url_title = mobj.group('url_title')
1747 webpage = self._download_webpage(url, url_title)
1749 video_id = self._html_search_regex(r'<article class="video" data-id="(\d+?)"',
1750 webpage, u'video id')
1752 self.report_extraction(video_id)
1754 video_title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',
1757 thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)"',
1758 webpage, u'thumbnail', fatal=False)
1760 video_description = self._html_search_regex(r'<meta property="og:description" content="(.*?)"',
1761 webpage, u'description', fatal=False)
1763 data_url = 'http://teamcoco.com/cvp/2.0/%s.xml' % video_id
1764 data = self._download_webpage(data_url, video_id, 'Downloading data webpage')
# The media URL is the content of the <file type="high"> element.
1766 video_url = self._html_search_regex(r'<file type="high".*?>(.*?)</file>',
1773 'title': video_title,
1774 'thumbnail': thumbnail,
1775 'description': video_description,
# XHamsterIE: extractor for xhamster.com movie pages.
# The page is re-requested under "/movies/<id>/.html"; the media URL is built
# from the 'srv' and 'file' values found in the page.
# NOTE(review): lines are elided in this listing (e.g. 1789 -> 1791,
# 1793 -> 1795, 1806 -> 1808), so the guard before the raise and the
# if/else structure around video_url and video_upload_date are not all
# visible.
1778 class XHamsterIE(InfoExtractor):
1779 """Information Extractor for xHamster"""
# NOTE(review): the dot in "(?:www.)?" is unescaped and matches any char.
1780 _VALID_URL = r'(?:http://)?(?:www.)?xhamster\.com/movies/(?P<id>[0-9]+)/.*\.html'
# _real_extract(url): builds the info dict for a single video.
# NOTE(review): no None check on the URL match is visible before .group().
1782 def _real_extract(self,url):
1783 mobj = re.match(self._VALID_URL, url)
1785 video_id = mobj.group('id')
1786 mrss_url = 'http://xhamster.com/movies/%s/.html' % video_id
1787 webpage = self._download_webpage(mrss_url, video_id)
# mobj is rebound to the 'srv'/'file' match from the page.
1789 mobj = re.search(r'\'srv\': \'(?P<server>[^\']*)\',\s*\'file\': \'(?P<file>[^\']+)\',', webpage)
1791 raise ExtractorError(u'Unable to extract media URL')
# Empty server: 'file' is itself a percent-encoded URL. The other assignment
# below joins server and file with '/key='.
1792 if len(mobj.group('server')) == 0:
1793 video_url = compat_urllib_parse.unquote(mobj.group('file'))
1795 video_url = mobj.group('server')+'/key='+mobj.group('file')
# Extension = text after the last "." of the media URL.
1796 video_extension = video_url.split('.')[-1]
1798 video_title = self._html_search_regex(r'<title>(?P<title>.+?) - xHamster\.com</title>',
1801 # Can't see the description anywhere in the UI
1802 # video_description = self._html_search_regex(r'<span>Description: </span>(?P<description>[^<]+)',
1803 # webpage, u'description', fatal=False)
1804 # if video_description: video_description = unescapeHTML(video_description)
# The upload date is assembled as YYYYMMDD from the hint='...' timestamp.
1806 mobj = re.search(r'hint=\'(?P<upload_date_Y>[0-9]{4})-(?P<upload_date_m>[0-9]{2})-(?P<upload_date_d>[0-9]{2}) [0-9]{2}:[0-9]{2}:[0-9]{2} [A-Z]{3,4}\'', webpage)
1808 video_upload_date = mobj.group('upload_date_Y')+mobj.group('upload_date_m')+mobj.group('upload_date_d')
1810 video_upload_date = None
1811 self._downloader.report_warning(u'Unable to extract upload date')
# default=u'anonymous' is passed for pages without a user link.
1813 video_uploader_id = self._html_search_regex(r'<a href=\'/user/[^>]+>(?P<uploader_id>[^<]+)',
1814 webpage, u'uploader id', default=u'anonymous')
1816 video_thumbnail = self._search_regex(r'\'image\':\'(?P<thumbnail>[^\']+)\'',
1817 webpage, u'thumbnail', fatal=False)
1822 'ext': video_extension,
1823 'title': video_title,
1824 # 'description': video_description,
1825 'upload_date': video_upload_date,
1826 'uploader_id': video_uploader_id,
1827 'thumbnail': video_thumbnail
# HypemIE: extractor for hypem.com track pages.
# The track page is fetched with extra 'ax' and 'ts' query parameters and its
# Set-Cookie header is replayed on a second request to /serve/source/, whose
# JSON response carries the final media URL.
# NOTE(review): lines are elided in this listing (e.g. 1835 -> 1837,
# 1850 -> 1852 -> 1853 -> 1855, 1870 -> 1880). The try/except lines around
# both json.loads calls, the assignment of `key` used in serve_url, and the
# returned dict are not visible here.
1830 class HypemIE(InfoExtractor):
1831 """Information Extractor for hypem"""
1832 _VALID_URL = r'(?:http://)?(?:www\.)?hypem\.com/track/([^/]+)/([^/]+)'
# _real_extract(url): resolves the media URL for a single track.
# Raises ExtractorError for an invalid URL or invalid JSON.
1834 def _real_extract(self, url):
1835 mobj = re.match(self._VALID_URL, url)
1837 raise ExtractorError(u'Invalid URL: %s' % url)
1838 track_id = mobj.group(1)
# 'ts' is the current time, so the request URL differs on every call.
1840 data = { 'ax': 1, 'ts': time.time() }
1841 data_encoded = compat_urllib_parse.urlencode(data)
# NOTE(review): "?" is appended unconditionally - a URL that already has a
# query string would end up with two "?".
1842 complete_url = url + "?" + data_encoded
1843 request = compat_urllib_request.Request(complete_url)
1844 response, urlh = self._download_webpage_handle(request, track_id, u'Downloading webpage with the url')
1845 cookie = urlh.headers.get('Set-Cookie', '')
1847 self.report_extraction(track_id)
# The track list is JSON embedded in a <script id="displayList-data"> tag.
1849 html_tracks = self._html_search_regex(r'<script type="application/json" id="displayList-data">(.*?)</script>',
1850 response, u'tracks', flags=re.MULTILINE|re.DOTALL).strip()
1852 track_list = json.loads(html_tracks)
# Only the first track of the list is used.
1853 track = track_list[u'tracks'][0]
1855 raise ExtractorError(u'Hypemachine contained invalid JSON.')
# track_id is replaced by the id reported in the JSON.
1858 track_id = track[u"id"]
1859 artist = track[u"artist"]
1860 title = track[u"song"]
1862 serve_url = "http://hypem.com/serve/source/%s/%s" % (compat_str(track_id), compat_str(key))
1863 request = compat_urllib_request.Request(serve_url, "" , {'Content-Type': 'application/json'})
1864 request.add_header('cookie', cookie)
1865 song_data_json = self._download_webpage(request, track_id, u'Downloading metadata')
1867 song_data = json.loads(song_data_json)
1869 raise ExtractorError(u'Hypemachine contained invalid JSON.')
1870 final_url = song_data[u"url"]
# Vbox7IE: extractor for vbox7.com "play:<id>" pages.
# The first page contains a JavaScript redirect; the target page supplies the
# title, and a form-encoded POST to /play/magare.do supplies the media and
# thumbnail URLs.
# NOTE(review): lines are elided in this listing (e.g. 1885 -> 1887,
# 1906 -> 1913), so the guard before the first raise and most of the
# returned dict are not visible.
1880 class Vbox7IE(InfoExtractor):
1881 """Information Extractor for Vbox7"""
1882 _VALID_URL = r'(?:http://)?(?:www\.)?vbox7\.com/play:([^/]+)'
# _real_extract(url): resolves title, media URL and thumbnail for one video.
1884 def _real_extract(self,url):
1885 mobj = re.match(self._VALID_URL, url)
1887 raise ExtractorError(u'Invalid URL: %s' % url)
1888 video_id = mobj.group(1)
1890 redirect_page, urlh = self._download_webpage_handle(url, video_id)
# The redirect target is appended to the URL that was actually fetched.
1891 new_location = self._search_regex(r'window\.location = \'(.*)\';', redirect_page, u'redirect location')
1892 redirect_url = urlh.geturl() + new_location
1893 webpage = self._download_webpage(redirect_url, video_id, u'Downloading redirect page')
# Only the part of <title> before the first "/" is kept.
1895 title = self._html_search_regex(r'<title>(.*)</title>',
1896 webpage, u'title').split('/')[0].strip()
1899 info_url = "http://vbox7.com/play/magare.do"
1900 data = compat_urllib_parse.urlencode({'as3':'1','vid':video_id})
1901 info_request = compat_urllib_request.Request(info_url, data)
1902 info_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
1903 info_response = self._download_webpage(info_request, video_id, u'Downloading info webpage')
1904 if info_response is None:
1905 raise ExtractorError(u'Unable to extract the media url')
# The response is read as exactly two "name=value" fields joined by "&".
# NOTE(review): split('=')[1] drops everything after a second "=" inside a
# value, and any other field count makes the tuple unpacking raise
# ValueError.
1906 (final_url, thumbnail_url) = map(lambda x: x.split('=')[1], info_response.split('&'))
1913 'thumbnail': thumbnail_url,
# gen_extractors(): builds the ordered list of extractor instances; per its
# docstring the first extractor that matches a URL handles it, so order is
# significant.
# NOTE(review): this listing shows only three entries of the list and omits
# the docstring terminator (embedded numbering jumps 1919 -> 1922 -> 1947 ->
# 1957 -> 1987); the full ordering cannot be reviewed from here.
1917 def gen_extractors():
1918 """ Return a list of an instance of every supported extractor.
1919 The order does matter; the first extractor matched is the one handling the URL.
1922 YoutubePlaylistIE(),
1947 StanfordOpenClassroomIE(),
1957 WorldStarHipHopIE(),
def get_info_extractor(ie_name):
    """Look up an extractor class in this module by its short name.

    The class is found under the module-level name formed by appending
    ``IE`` to *ie_name* (for example ``'Youtube'`` gives ``YoutubeIE``).
    A ``KeyError`` propagates when the module has no such name.
    """
    class_name = ie_name + 'IE'
    module_namespace = globals()
    return module_namespace[class_name]