10 import xml.etree.ElementTree
19 from .extractor.common import InfoExtractor, SearchInfoExtractor
21 from .extractor.ard import ARDIE
22 from .extractor.arte import ArteTvIE
23 from .extractor.bliptv import BlipTVIE, BlipTVUserIE
24 from .extractor.comedycentral import ComedyCentralIE
25 from .extractor.dailymotion import DailymotionIE
26 from .extractor.gametrailers import GametrailersIE
27 from .extractor.generic import GenericIE
28 from .extractor.google import GoogleSearchIE
29 from .extractor.metacafe import MetacafeIE
30 from .extractor.myvideo import MyVideoIE
31 from .extractor.statigram import StatigramIE
32 from .extractor.photobucket import PhotobucketIE
33 from .extractor.vimeo import VimeoIE
34 from .extractor.yahoo import YahooIE, YahooSearchIE
35 from .extractor.youtube import YoutubeIE, YoutubePlaylistIE, YoutubeSearchIE, YoutubeUserIE, YoutubeChannelIE
36 from .extractor.zdf import ZDFIE
# Extractor for depositfiles.com file-hosting pages: POSTs the "Free download"
# form, then scrapes the real file URL and title out of the returned HTML.
# NOTE(review): this listing keeps the original source line number as a prefix
# on each line; gaps in those numbers mean statements (try:, return, ...) were
# elided from this view — do not assume the visible lines are contiguous code.
56 class DepositFilesIE(InfoExtractor):
57     """Information extractor for depositfiles.com"""
59 _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
61 def _real_extract(self, url):
62 file_id = url.split('/')[-1]
63 # Rebuild url in english locale
64 url = 'http://depositfiles.com/en/files/' + file_id
66 # Retrieve file webpage with 'Free download' button pressed
67 free_download_indication = { 'gateway_result' : '1' }
68 request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
# (original line 69, presumably a "try:", is elided here — TODO confirm)
70 self.report_download_webpage(file_id)
71 webpage = compat_urllib_request.urlopen(request).read()
72 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
73 raise ExtractorError(u'Unable to retrieve file webpage: %s' % compat_str(err))
75 # Search for the real file URL
76 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
77 if (mobj is None) or (mobj.group(1) is None):
78 # Try to figure out reason of the error.
79 mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
80 if (mobj is not None) and (mobj.group(1) is not None):
81 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
82 raise ExtractorError(u'%s' % restriction_message)
# Fallback error when no human-readable restriction message was found.
84 raise ExtractorError(u'Unable to extract download URL from: %s' % url)
86 file_url = mobj.group(1)
87 file_extension = os.path.splitext(file_url)[1][1:]
89 # Search for file title
90 file_title = self._search_regex(r'<b title="(.*?)">', webpage, u'title')
# (return-dict opening elided; remaining lines are entries of the info dict)
93 'id': file_id.decode('utf-8'),
94 'url': file_url.decode('utf-8'),
98 'ext': file_extension.decode('utf-8'),
# Extractor for facebook.com video/photo pages. Optionally logs in (via
# --username/--password or .netrc under machine "facebook") before extraction,
# then parses a JSON blob embedded between two known JS fragments in the page.
# NOTE(review): gaps in the embedded line numbers mean several statements
# (try:, else:, return, login_form construction) are elided from this listing.
102 class FacebookIE(InfoExtractor):
103     """Information Extractor for Facebook"""
105 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
106 _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
107 _NETRC_MACHINE = 'facebook'
108 IE_NAME = u'facebook'
110 def report_login(self):
111     """Report attempt to log in."""
112 self.to_screen(u'Logging in')
# Runs once before extraction: resolves credentials and performs the login POST.
114 def _real_initialize(self):
115 if self._downloader is None:
120 downloader_params = self._downloader.params
122 # Attempt to use provided username and password or .netrc data
123 if downloader_params.get('username', None) is not None:
124 useremail = downloader_params['username']
125 password = downloader_params['password']
126 elif downloader_params.get('usenetrc', False):
128 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
133 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
134 except (IOError, netrc.NetrcParseError) as err:
135 self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
# No credentials available at all: skip the login step entirely.
138 if useremail is None:
147 request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
150 login_results = compat_urllib_request.urlopen(request).read()
# The login form being echoed back indicates the login did not succeed.
151 if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
152 self._downloader.report_warning(u'unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
154 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
155 self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
158 def _real_extract(self, url):
159 mobj = re.match(self._VALID_URL, url)
161 raise ExtractorError(u'Invalid URL: %s' % url)
162 video_id = mobj.group('ID')
164 url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
165 webpage = self._download_webpage(url, video_id)
# The video parameters live in a JSON array bracketed by these two exact
# JavaScript fragments in the page source.
167 BEFORE = '{swf.addParam(param[0], param[1]);});\n'
168 AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
169 m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
171 raise ExtractorError(u'Cannot parse data')
172 data = dict(json.loads(m.group(1)))
173 params_raw = compat_urllib_parse.unquote(data['params'])
174 params = json.loads(params_raw)
175 video_data = params['video_data'][0]
# Prefer the HD stream; fall back to SD if HD is absent.
176 video_url = video_data.get('hd_src')
178 video_url = video_data['sd_src']
180 raise ExtractorError(u'Cannot find video URL')
181 video_duration = int(video_data['video_duration'])
182 thumbnail = video_data['thumbnail_src']
184 video_title = self._html_search_regex('<h2 class="uiHeaderTitle">([^<]+)</h2>',
# (return-dict opening elided; remaining lines are entries of the info dict)
189 'title': video_title,
192 'duration': video_duration,
193 'thumbnail': thumbnail,
# Extractor for escapistmagazine.com videos: scrapes meta tags for description,
# thumbnail, player URL and title, then downloads the player's JS "config"
# (almost-JSON) to find the actual playlist entry with the media URL.
# NOTE(review): gaps in the embedded line numbers mean statements (if mobj is
# None:, try:, return dict) are elided from this listing.
203 class EscapistIE(InfoExtractor):
204     """Information extractor for The Escapist """
206 _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
207 IE_NAME = u'escapist'
209 def _real_extract(self, url):
210 mobj = re.match(self._VALID_URL, url)
212 raise ExtractorError(u'Invalid URL: %s' % url)
213 showName = mobj.group('showname')
214 videoId = mobj.group('episode')
216 self.report_extraction(videoId)
217 webpage = self._download_webpage(url, videoId)
219 videoDesc = self._html_search_regex('<meta name="description" content="([^"]*)"',
220 webpage, u'description', fatal=False)
222 imgUrl = self._html_search_regex('<meta property="og:image" content="([^"]*)"',
223 webpage, u'thumbnail', fatal=False)
225 playerUrl = self._html_search_regex('<meta property="og:video" content="([^"]*)"',
226 webpage, u'player url')
# NOTE(review): the error-label string u'player url' here looks copy-pasted
# from the regex above; it describes the title lookup — verify upstream.
228 title = self._html_search_regex('<meta name="title" content="([^"]*)"',
229 webpage, u'player url').split(' : ')[-1]
231 configUrl = self._search_regex('config=(.*)$', playerUrl, u'config url')
232 configUrl = compat_urllib_parse.unquote(configUrl)
234 configJSON = self._download_webpage(configUrl, videoId,
235 u'Downloading configuration',
236 u'unable to download configuration')
238 # Technically, it's JavaScript, not JSON
239 configJSON = configJSON.replace("'", '"')
242 config = json.loads(configJSON)
243 except (ValueError,) as err:
244 raise ExtractorError(u'Invalid JSON in configuration file: ' + compat_str(err))
246 playlist = config['playlist']
247 videoUrl = playlist[1]['url']
# (return-dict opening elided; remaining lines are entries of the info dict)
252 'uploader': showName,
257 'description': videoDesc,
258 'player_url': playerUrl,
# Extractor for collegehumor.com: fetches an XML metadata document for the
# video, then the Adobe HDS (f4m) manifest it points at, and assembles the
# final segment URL from the manifest's media node.
# NOTE(review): gaps in the embedded line numbers mean statements (info dict
# initialisation, try:/except IndexError, return) are elided from this listing.
263 class CollegeHumorIE(InfoExtractor):
264     """Information extractor for collegehumor.com"""
267 _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
268 IE_NAME = u'collegehumor'
270 def report_manifest(self, video_id):
271     """Report information extraction."""
272 self.to_screen(u'%s: Downloading XML manifest' % video_id)
274 def _real_extract(self, url):
275 mobj = re.match(self._VALID_URL, url)
277 raise ExtractorError(u'Invalid URL: %s' % url)
278 video_id = mobj.group('videoid')
286 self.report_extraction(video_id)
287 xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
289 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
290 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
291 raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
293 mdoc = xml.etree.ElementTree.fromstring(metaXml)
# findall(...)[0] raises IndexError on a missing element; the (elided)
# except clause maps that to the "Invalid metadata XML file" error below.
295 videoNode = mdoc.findall('./video')[0]
296 info['description'] = videoNode.findall('./description')[0].text
297 info['title'] = videoNode.findall('./caption')[0].text
298 info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
299 manifest_url = videoNode.findall('./file')[0].text
301 raise ExtractorError(u'Invalid metadata XML file')
# hdcore is a required query parameter for Adobe HDS manifest requests.
303 manifest_url += '?hdcore=2.10.3'
304 self.report_manifest(video_id)
306 manifestXml = compat_urllib_request.urlopen(manifest_url).read()
307 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
308 raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
310 adoc = xml.etree.ElementTree.fromstring(manifestXml)
# The manifest uses the f4m XML namespace; pull the media node and stream id.
312 media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
313 node_id = media_node.attrib['url']
314 video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
315 except IndexError as err:
316 raise ExtractorError(u'Invalid manifest file')
318 url_pr = compat_urllib_parse_urlparse(manifest_url)
# Build the first-fragment URL from the manifest host plus stream identifiers.
319 url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'
# Extractor for xvideos.com: pulls the flv URL, page title and thumbnail
# straight out of the video page with regexes.
# NOTE(review): gaps in the embedded line numbers mean statements (if mobj is
# None:, return-dict opening) are elided from this listing.
326 class XVideosIE(InfoExtractor):
327     """Information extractor for xvideos.com"""
329 _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
332 def _real_extract(self, url):
333 mobj = re.match(self._VALID_URL, url)
335 raise ExtractorError(u'Invalid URL: %s' % url)
336 video_id = mobj.group(1)
338 webpage = self._download_webpage(url, video_id)
340 self.report_extraction(video_id)
# The player page embeds the media URL percent-encoded in a flv_url parameter.
343 video_url = compat_urllib_parse.unquote(self._search_regex(r'flv_url=(.+?)&',
344 webpage, u'video URL'))
347 video_title = self._html_search_regex(r'<title>(.*?)\s+-\s+XVID',
350 # Extract video thumbnail
351 video_thumbnail = self._search_regex(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)',
352 webpage, u'thumbnail', fatal=False)
# (return-dict opening elided; remaining lines are entries of the info dict)
359 'title': video_title,
361 'thumbnail': video_thumbnail,
# Extractor for individual soundcloud.com tracks: resolves the page URL to a
# track id via the public resolve.json API, then fetches the stream
# definitions and picks the 128kbps MP3 HTTP stream.
# NOTE(review): gaps in the embedded line numbers mean statements (if mobj is
# None:, return-dict opening) are elided from this listing.
368 class SoundcloudIE(InfoExtractor):
369     """Information extractor for soundcloud.com
370        To access the media, the uid of the song and a stream token
371        must be extracted from the page source and the script must make
372        a request to media.soundcloud.com/crossdomain.xml. Then
373        the media can be grabbed by requesting from an url composed
374        of the stream token and uid
377 _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
378 IE_NAME = u'soundcloud'
380 def report_resolve(self, video_id):
381     """Report information extraction."""
382 self.to_screen(u'%s: Resolving id' % video_id)
384 def _real_extract(self, url):
385 mobj = re.match(self._VALID_URL, url)
387 raise ExtractorError(u'Invalid URL: %s' % url)
389 # extract uploader (which is in the url)
390 uploader = mobj.group(1)
391 # extract simple title (uploader + slug of song title)
392 slug_title = mobj.group(2)
393 simple_title = uploader + u'-' + slug_title
394 full_title = '%s/%s' % (uploader, slug_title)
396 self.report_resolve(full_title)
# The resolve endpoint maps a public page URL to API metadata (JSON).
398 url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
399 resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
400 info_json = self._download_webpage(resolv_url, full_title, u'Downloading info JSON')
402 info = json.loads(info_json)
403 video_id = info['id']
404 self.report_extraction(full_title)
406 streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
407 stream_json = self._download_webpage(streams_url, full_title,
408 u'Downloading stream definitions',
409 u'unable to download stream definitions')
411 streams = json.loads(stream_json)
412 mediaURL = streams['http_mp3_128_url']
413 upload_date = unified_strdate(info['created_at'])
# (return-dict opening elided; remaining lines are entries of the info dict)
418 'uploader': info['user']['username'],
419 'upload_date': upload_date,
420 'title': info['title'],
422 'description': info['description'],
# Extractor for soundcloud.com sets (playlists): resolves the set URL via the
# public resolve.json API, then extracts a stream URL for every track in it.
# NOTE(review): gaps in the embedded line numbers mean statements (if mobj is
# None:, error branch, per-track dict opening, return) are elided here.
425 class SoundcloudSetIE(InfoExtractor):
426     """Information extractor for soundcloud.com sets
427        To access the media, the uid of the song and a stream token
428        must be extracted from the page source and the script must make
429        a request to media.soundcloud.com/crossdomain.xml. Then
430        the media can be grabbed by requesting from an url composed
431        of the stream token and uid
434 _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/sets/([\w\d-]+)'
435 IE_NAME = u'soundcloud:set'
437 def report_resolve(self, video_id):
438     """Report information extraction."""
439 self.to_screen(u'%s: Resolving id' % video_id)
441 def _real_extract(self, url):
442 mobj = re.match(self._VALID_URL, url)
444 raise ExtractorError(u'Invalid URL: %s' % url)
446 # extract uploader (which is in the url)
447 uploader = mobj.group(1)
448 # extract simple title (uploader + slug of song title)
449 slug_title = mobj.group(2)
450 simple_title = uploader + u'-' + slug_title
451 full_title = '%s/sets/%s' % (uploader, slug_title)
453 self.report_resolve(full_title)
455 url = 'http://soundcloud.com/%s/sets/%s' % (uploader, slug_title)
456 resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
457 info_json = self._download_webpage(resolv_url, full_title)
460 info = json.loads(info_json)
# API-level errors come back as an 'errors' list in the JSON response.
462 for err in info['errors']:
463 self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err['error_message']))
466 self.report_extraction(full_title)
# One stream-definition request per track in the set.
467 for track in info['tracks']:
468 video_id = track['id']
470 streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
471 stream_json = self._download_webpage(streams_url, video_id, u'Downloading track info JSON')
473 self.report_extraction(video_id)
474 streams = json.loads(stream_json)
475 mediaURL = streams['http_mp3_128_url']
# (per-track dict opening elided; remaining lines are entries of that dict)
480 'uploader': track['user']['username'],
481 'upload_date': unified_strdate(track['created_at']),
482 'title': track['title'],
484 'description': track['description'],
# Extractor for infoq.com presentations: decodes a base64 id embedded in the
# page's JavaScript into an RTMPE stream path, and scrapes title/description.
# NOTE(review): gaps in the embedded line numbers mean statements (if mobj is
# None:, return-dict opening) are elided from this listing.
489 class InfoQIE(InfoExtractor):
490     """Information extractor for infoq.com"""
491 _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
493 def _real_extract(self, url):
494 mobj = re.match(self._VALID_URL, url)
496 raise ExtractorError(u'Invalid URL: %s' % url)
498 webpage = self._download_webpage(url, video_id=url)
499 self.report_extraction(url)
502 mobj = re.search(r"jsclassref ?= ?'([^']*)'", webpage)
504 raise ExtractorError(u'Unable to extract video url')
# jsclassref holds the base64-encoded, percent-encoded real media id.
505 real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
506 video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id
509 video_title = self._search_regex(r'contentTitle = "(.*?)";',
512 # Extract description
513 video_description = self._html_search_regex(r'<meta name="description" content="(.*)"(?:\s*/)?>',
514 webpage, u'description', fatal=False)
# The id/extension are derived from the final path component of the stream URL.
516 video_filename = video_url.split('/')[-1]
517 video_id, extension = video_filename.split('.')
# (return-dict opening elided; remaining lines are entries of the info dict)
524 'title': video_title,
525 'ext': extension, # Extension is always(?) mp4, but seems to be flv
527 'description': video_description,
# Extractor for www.mixcloud.com (marked non-working; superseded by a newer
# API). Downloads a JSON document describing available audio formats/bitrates,
# probes candidate URLs for the first live one, and honours --format /
# --list-formats.
# NOTE(review): gaps in the embedded line numbers mean statements (try:,
# return, break, else:) are elided from this listing.
532 class MixcloudIE(InfoExtractor):
533     """Information extractor for www.mixcloud.com"""
535 _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
536 _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
537 IE_NAME = u'mixcloud'
539 def report_download_json(self, file_id):
540     """Report JSON download."""
541 self.to_screen(u'Downloading json')
543 def get_urls(self, jsonData, fmt, bitrate='best'):
544     """Get urls from 'audio_formats' section in json"""
547 bitrate_list = jsonData[fmt]
548 if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
549 bitrate = max(bitrate_list) # select highest
551 url_list = jsonData[fmt][bitrate]
# Some formats carry a flat URL list with no per-bitrate mapping.
552 except TypeError: # we have no bitrate info.
553 url_list = jsonData[fmt]
556 def check_urls(self, url_list):
557     """Returns 1st active url from list"""
# Probe each candidate URL with a real request; first success wins.
560 compat_urllib_request.urlopen(url)
562 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
567 def _print_formats(self, formats):
568 print('Available formats:')
569 for fmt in formats.keys():
570 for b in formats[fmt]:
572 ext = formats[fmt][b][0]
573 print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
574 except TypeError: # we have no bitrate info
575 ext = formats[fmt][0]
576 print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
579 def _real_extract(self, url):
580 mobj = re.match(self._VALID_URL, url)
582 raise ExtractorError(u'Invalid URL: %s' % url)
583 # extract uploader & filename from url
584 uploader = mobj.group(1).decode('utf-8')
585 file_id = uploader + "-" + mobj.group(2).decode('utf-8')
587 # construct API request
588 file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
589 # retrieve .json file with links to files
590 request = compat_urllib_request.Request(file_url)
592 self.report_download_json(file_url)
593 jsonData = compat_urllib_request.urlopen(request).read()
594 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
595 raise ExtractorError(u'Unable to retrieve file: %s' % compat_str(err))
598 json_data = json.loads(jsonData)
599 player_url = json_data['player_swf_url']
600 formats = dict(json_data['audio_formats'])
602 req_format = self._downloader.params.get('format', None)
605 if self._downloader.params.get('listformats', None):
606 self._print_formats(formats)
# Default / 'best' request: scan all formats for the first working URL.
609 if req_format is None or req_format == 'best':
610 for format_param in formats.keys():
611 url_list = self.get_urls(formats, format_param)
613 file_url = self.check_urls(url_list)
614 if file_url is not None:
# Explicitly requested format: validate and probe just that one.
617 if req_format not in formats:
618 raise ExtractorError(u'Format is not available')
620 url_list = self.get_urls(formats, req_format)
621 file_url = self.check_urls(url_list)
622 format_param = req_format
# (return-dict opening elided; remaining lines are entries of the info dict)
625 'id': file_id.decode('utf-8'),
626 'url': file_url.decode('utf-8'),
627 'uploader': uploader.decode('utf-8'),
629 'title': json_data['name'],
630 'ext': file_url.split('.')[-1].decode('utf-8'),
631 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
632 'thumbnail': json_data['thumbnail_url'],
633 'description': json_data['description'],
634 'player_url': player_url.decode('utf-8'),
# Extractor for Stanford Open Classroom. Handles three URL shapes: a specific
# video (course+video), a course page (list of videos), and the root page
# (list of courses); the latter two recurse via self.extract on each link.
# NOTE(review): gaps in the embedded line numbers mean statements (if mobj is
# None:, try:, info dict initialisation, results accumulation/return, loop
# bodies building 'list' entries) are elided from this listing.
637 class StanfordOpenClassroomIE(InfoExtractor):
638     """Information extractor for Stanford's Open ClassRoom"""
640 _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
641 IE_NAME = u'stanfordoc'
643 def _real_extract(self, url):
644 mobj = re.match(self._VALID_URL, url)
646 raise ExtractorError(u'Invalid URL: %s' % url)
648 if mobj.group('course') and mobj.group('video'): # A specific video
649 course = mobj.group('course')
650 video = mobj.group('video')
652 'id': course + '_' + video,
657 self.report_extraction(info['id'])
658 baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
659 xmlUrl = baseUrl + video + '.xml'
661 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
662 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
663 raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
664 mdoc = xml.etree.ElementTree.fromstring(metaXml)
# findall(...)[0] raises IndexError on a missing element; the (elided)
# except clause maps that to the "Invalid metadata XML file" error below.
666 info['title'] = mdoc.findall('./title')[0].text
667 info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
669 raise ExtractorError(u'Invalid metadata XML file')
670 info['ext'] = info['url'].rpartition('.')[2]
672 elif mobj.group('course'): # A course page
673 course = mobj.group('course')
681 coursepage = self._download_webpage(url, info['id'],
682 note='Downloading course info page',
683 errnote='Unable to download course info page')
685 info['title'] = self._html_search_regex('<h1>([^<]+)</h1>', coursepage, 'title', default=info['id'])
687 info['description'] = self._html_search_regex('<description>([^<]+)</description>',
688 coursepage, u'description', fatal=False)
690 links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
694 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
# Recurse: each referenced video page goes back through self.extract.
698 for entry in info['list']:
699 assert entry['type'] == 'reference'
700 results += self.extract(entry['url'])
# Root page branch: enumerate all course pages and recurse into each.
704 'id': 'Stanford OpenClassroom',
710 self.report_download_webpage(info['id'])
711 rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
713 rootpage = compat_urllib_request.urlopen(rootURL).read()
714 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
715 raise ExtractorError(u'Unable to download course info page: ' + compat_str(err))
717 info['title'] = info['id']
719 links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
723 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
728 for entry in info['list']:
729 assert entry['type'] == 'reference'
730 results += self.extract(entry['url'])
# Extractor for MTV.com video pages: scrapes meta tags for song/artist and the
# mtvn_uri, then downloads a mediaGen XML document and picks the last (highest
# quality) rendition.
# NOTE(review): gaps in the embedded line numbers mean statements (if mobj is
# None:, performer extraction, try:/except for rendition parsing, return dict)
# are elided from this listing.
733 class MTVIE(InfoExtractor):
734     """Information extractor for MTV.com"""
736 _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
739 def _real_extract(self, url):
740 mobj = re.match(self._VALID_URL, url)
742 raise ExtractorError(u'Invalid URL: %s' % url)
# Normalise scheme-less URLs before fetching.
743 if not mobj.group('proto'):
744 url = 'http://' + url
745 video_id = mobj.group('videoid')
747 webpage = self._download_webpage(url, video_id)
749 song_name = self._html_search_regex(r'<meta name="mtv_vt" content="([^"]+)"/>',
750 webpage, u'song name', fatal=False)
752 video_title = self._html_search_regex(r'<meta name="mtv_an" content="([^"]+)"/>',
755 mtvn_uri = self._html_search_regex(r'<meta name="mtvn_uri" content="([^"]+)"/>',
756 webpage, u'mtvn_uri', fatal=False)
758 content_id = self._search_regex(r'MTVN.Player.defaultPlaylistId = ([0-9]+);',
759 webpage, u'content id', fatal=False)
# mediaGen returns the XML playlist of renditions for this content id.
761 videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
762 self.report_extraction(video_id)
763 request = compat_urllib_request.Request(videogen_url)
765 metadataXml = compat_urllib_request.urlopen(request).read()
766 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
767 raise ExtractorError(u'Unable to download video metadata: %s' % compat_str(err))
769 mdoc = xml.etree.ElementTree.fromstring(metadataXml)
770 renditions = mdoc.findall('.//rendition')
772 # For now, always pick the highest quality.
773 rendition = renditions[-1]
# e.g. type "video/mp4" -> ext "mp4"; format string encodes size and bitrate.
776 _,_,ext = rendition.attrib['type'].partition('/')
777 format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
778 video_url = rendition.find('./src').text
780 raise ExtractorError('Invalid rendition field.')
# (return-dict opening elided; remaining lines are entries of the info dict)
785 'uploader': performer,
787 'title': video_title,
# Extractor for v.youku.com: fetches the getPlayList JSON, derives the real
# per-segment file ids from a seeded pseudo-random shuffle of a character
# source string, and emits one info dict per flv segment.
# NOTE(review): gaps in the embedded line numbers mean statements (def
# _gen_sid header, format-selection branches, per-segment dict fields, return)
# are elided from this listing.
795 class YoukuIE(InfoExtractor):
796 _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'
# (the enclosing "def _gen_sid(self):" line is elided here — TODO confirm)
# Session id: millisecond timestamp concatenated with two random numbers.
799 nowTime = int(time.time() * 1000)
800 random1 = random.randint(1000,1998)
801 random2 = random.randint(1000,9999)
803 return "%d%d%d" %(nowTime,random1,random2)
805 def _get_file_ID_mix_string(self, seed):
# Deterministic LCG-style shuffle of the source alphabet keyed by `seed`.
807 source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
809 for i in range(len(source)):
810 seed = (seed * 211 + 30031 ) % 65536
811 index = math.floor(seed / 65536 * len(source) )
812 mixed.append(source[int(index)])
813 source.remove(source[int(index)])
814 #return ''.join(mixed)
817 def _get_file_id(self, fileId, seed):
# Translate the '*'-separated index list into characters of the mixed string.
818 mixed = self._get_file_ID_mix_string(seed)
819 ids = fileId.split('*')
823 realId.append(mixed[int(ch)])
824 return ''.join(realId)
826 def _real_extract(self, url):
827 mobj = re.match(self._VALID_URL, url)
829 raise ExtractorError(u'Invalid URL: %s' % url)
830 video_id = mobj.group('ID')
832 info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id
834 jsondata = self._download_webpage(info_url, video_id)
836 self.report_extraction(video_id)
838 config = json.loads(jsondata)
840 video_title = config['data'][0]['title']
841 seed = config['data'][0]['seed']
# Format selection against the formats the server actually offers.
843 format = self._downloader.params.get('format', None)
844 supported_format = list(config['data'][0]['streamfileids'].keys())
846 if format is None or format == 'best':
847 if 'hd2' in supported_format:
852 elif format == 'worst':
860 fileid = config['data'][0]['streamfileids'][format]
861 keys = [s['k'] for s in config['data'][0]['segs'][format]]
862 except (UnicodeDecodeError, ValueError, KeyError):
863 raise ExtractorError(u'Unable to extract info section')
866 sid = self._gen_sid()
867 fileid = self._get_file_id(fileid, seed)
869 #column 8,9 of fileid represent the segment number
870 #fileid[7:9] should be changed
871 for index, key in enumerate(keys):
873 temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
874 download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)
# (per-segment dict opening elided; remaining lines are entries of that dict)
877 'id': '%s_part%02d' % (video_id, index),
881 'title': video_title,
884 files_info.append(info)
# Extractor for video.xnxx.com: the flv URL, title and thumbnail are pulled
# from the page with the class-level regex constants below.
# NOTE(review): gaps in the embedded line numbers mean statements (if mobj is
# None:, regex continuation lines, return-dict opening) are elided here.
889 class XNXXIE(InfoExtractor):
890     """Information extractor for xnxx.com"""
892 _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
# Page-scraping patterns: media URL, page title, and big-thumbnail URL.
894 VIDEO_URL_RE = r'flv_url=(.*?)&'
895 VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
896 VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&'
898 def _real_extract(self, url):
899 mobj = re.match(self._VALID_URL, url)
901 raise ExtractorError(u'Invalid URL: %s' % url)
902 video_id = mobj.group(1)
904 # Get webpage content
905 webpage = self._download_webpage(url, video_id)
907 video_url = self._search_regex(self.VIDEO_URL_RE,
908 webpage, u'video URL')
# The flv_url parameter is percent-encoded in the page source.
909 video_url = compat_urllib_parse.unquote(video_url)
911 video_title = self._html_search_regex(self.VIDEO_TITLE_RE,
914 video_thumbnail = self._search_regex(self.VIDEO_THUMB_RE,
915 webpage, u'thumbnail', fatal=False)
# (return-dict opening elided; remaining lines are entries of the info dict)
922 'title': video_title,
924 'thumbnail': video_thumbnail,
# Extractor for plus.google.com posts with video: scrapes the post page for
# metadata, follows the photo/video page, and picks the highest-resolution
# redirector.googlevideo.com link.
# NOTE(review): gaps in the embedded line numbers mean statements (if mobj is
# None:, links sorting, try:, return-dict opening) are elided from this view.
929 class GooglePlusIE(InfoExtractor):
930     """Information extractor for plus.google.com."""
932 _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
933 IE_NAME = u'plus.google'
935 def _real_extract(self, url):
936 # Extract id from URL
937 mobj = re.match(self._VALID_URL, url)
939 raise ExtractorError(u'Invalid URL: %s' % url)
941 post_url = mobj.group(0)
942 video_id = mobj.group(1)
944 video_extension = 'flv'
946 # Step 1, Retrieve post webpage to extract further information
947 webpage = self._download_webpage(post_url, video_id, u'Downloading entry webpage')
949 self.report_extraction(video_id)
951 # Extract update date
952 upload_date = self._html_search_regex('title="Timestamp">(.*?)</a>',
953 webpage, u'upload date', fatal=False)
955 # Convert timestring to a format suitable for filename
956 upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
957 upload_date = upload_date.strftime('%Y%m%d')
960 uploader = self._html_search_regex(r'rel\="author".*?>(.*?)</a>',
961 webpage, u'uploader', fatal=False)
964 # Get the first line for title
965 video_title = self._html_search_regex(r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]',
966 webpage, 'title', default=u'NA')
968 # Step 2, Stimulate clicking the image box to launch video
969 video_page = self._search_regex('"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]',
970 webpage, u'video page URL')
971 webpage = self._download_webpage(video_page, video_id, u'Downloading video page')
973 # Extract video links on video page
974 """Extract video links of all sizes"""
975 pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
976 mobj = re.findall(pattern, webpage)
978 raise ExtractorError(u'Unable to extract video links')
# (sorting of the (resolution, url) tuples into `links` is elided here)
983 # Choose the lowest of the sort, i.e. highest resolution
984 video_url = links[-1]
985 # Only get the url. The resolution part in the tuple has no use anymore
986 video_url = video_url[-1]
987 # Treat escaped \u0026 style hex
# Python 2 strings have .decode; on Python 3 the AttributeError path is taken.
989 video_url = video_url.decode("unicode_escape")
990 except AttributeError: # Python 3
991 video_url = bytes(video_url, 'ascii').decode('unicode-escape')
# (return-dict opening elided; remaining lines are entries of the info dict)
997 'uploader': uploader,
998 'upload_date': upload_date,
999 'title': video_title,
1000 'ext': video_extension,
# Extractor for nba.com video pages: the media URL is built directly from the
# path captured by _VALID_URL, so only title/description are scraped.
# NOTE(review): gaps in the embedded line numbers mean statements (if mobj is
# None:, return-dict fields such as 'url'/'ext'/'title') are elided here.
1003 class NBAIE(InfoExtractor):
1004 _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*?)(?:/index\.html)?(?:\?.*)?$'
1007 def _real_extract(self, url):
1008 mobj = re.match(self._VALID_URL, url)
1010 raise ExtractorError(u'Invalid URL: %s' % url)
1012 video_id = mobj.group(1)
1014 webpage = self._download_webpage(url, video_id)
# The CDN URL is deterministic given the page path; no scraping needed for it.
1016 video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'
1018 shortened_video_id = video_id.rpartition('/')[2]
1019 title = self._html_search_regex(r'<meta property="og:title" content="(.*?)"',
1020 webpage, 'title', default=shortened_video_id).replace('NBA.com: ', '')
1022 # It isn't there in the HTML it returns to us
1023 # uploader_date = self._html_search_regex(r'<b>Date:</b> (.*?)</div>', webpage, 'upload_date', fatal=False)
1025 description = self._html_search_regex(r'<meta name="description" (?:content|value)="(.*?)" />', webpage, 'description', fatal=False)
# (return-dict opening elided; remaining lines are entries of the info dict)
1028 'id': shortened_video_id,
1032 # 'uploader_date': uploader_date,
1033 'description': description,
1037 class JustinTVIE(InfoExtractor):
1038 """Information extractor for justin.tv and twitch.tv"""
1039 # TODO: One broadcast may be split into multiple videos. The key
1040 # 'broadcast_id' is the same for all parts, and 'broadcast_part'
1041 # starts at 1 and increases. Can we treat all parts as one video?
1043 _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
1045 (?P<channelid>[^/]+)|
1046 (?:(?:[^/]+)/b/(?P<videoid>[^/]+))|
1047 (?:(?:[^/]+)/c/(?P<chapterid>[^/]+))
1051 _JUSTIN_PAGE_LIMIT = 100
1052 IE_NAME = u'justin.tv'
1054 def report_download_page(self, channel, offset):
1055 """Report attempt to download a single page of videos."""
1056 self.to_screen(u'%s: Downloading video information from %d to %d' %
1057 (channel, offset, offset + self._JUSTIN_PAGE_LIMIT))
1059 # Return count of items, list of *valid* items
1060 def _parse_page(self, url, video_id):
1061 webpage = self._download_webpage(url, video_id,
1062 u'Downloading video info JSON',
1063 u'unable to download video info JSON')
1065 response = json.loads(webpage)
1066 if type(response) != list:
1067 error_text = response.get('error', 'unknown error')
1068 raise ExtractorError(u'Justin.tv API: %s' % error_text)
1070 for clip in response:
1071 video_url = clip['video_file_url']
1073 video_extension = os.path.splitext(video_url)[1][1:]
1074 video_date = re.sub('-', '', clip['start_time'][:10])
1075 video_uploader_id = clip.get('user_id', clip.get('channel_id'))
1076 video_id = clip['id']
1077 video_title = clip.get('title', video_id)
1081 'title': video_title,
1082 'uploader': clip.get('channel_name', video_uploader_id),
1083 'uploader_id': video_uploader_id,
1084 'upload_date': video_date,
1085 'ext': video_extension,
1087 return (len(response), info)
# Dispatch on the matched URL kind: channel archive (paged JSON API),
# a single chapter (XML broadcast lookup + Twitch kraken metadata), or a
# single video id; then page through the archive API via _parse_page.
1089 def _real_extract(self, url):
1090 mobj = re.match(self._VALID_URL, url)
# NOTE(review): the `if mobj is None:` guard preceding this raise is not
# visible in this listing — confirm against the full file.
1092 raise ExtractorError(u'invalid URL: %s' % url)
1094 api_base = 'http://api.justin.tv'
1096 if mobj.group('channelid'):
1098 video_id = mobj.group('channelid')
1099 api = api_base + '/channel/archives/%s.json' % video_id
1100 elif mobj.group('chapterid'):
1101 chapter_id = mobj.group('chapterid')
# The chapter page embeds the archive id in a JS assignment; scrape it.
1103 webpage = self._download_webpage(url, chapter_id)
1104 m = re.search(r'PP\.archive_id = "([0-9]+)";', webpage)
1106 raise ExtractorError(u'Cannot find archive of a chapter')
1107 archive_id = m.group(1)
1109 api = api_base + '/broadcast/by_chapter/%s.xml' % chapter_id
1110 chapter_info_xml = self._download_webpage(api, chapter_id,
1111 note=u'Downloading chapter information',
1112 errnote=u'Chapter information download failed')
# Find the <archive> element whose <id> matches the scraped archive_id;
# `a` is reused after the loop (for-else style in the original).
1113 doc = xml.etree.ElementTree.fromstring(chapter_info_xml)
1114 for a in doc.findall('.//archive'):
1115 if archive_id == a.find('./id').text:
1118 raise ExtractorError(u'Could not find chapter in chapter information')
1120 video_url = a.find('./video_file_url').text
1121 video_ext = video_url.rpartition('.')[2] or u'flv'
# Chapter title/description/uploader come from the Twitch kraken API.
1123 chapter_api_url = u'https://api.twitch.tv/kraken/videos/c' + chapter_id
1124 chapter_info_json = self._download_webpage(chapter_api_url, u'c' + chapter_id,
1125 note='Downloading chapter metadata',
1126 errnote='Download of chapter metadata failed')
1127 chapter_info = json.loads(chapter_info_json)
1129 bracket_start = int(doc.find('.//bracket_start').text)
1130 bracket_end = int(doc.find('.//bracket_end').text)
1132 # TODO determine start (and probably fix up file)
1133 # youtube-dl -v http://www.twitch.tv/firmbelief/c/1757457
1134 #video_url += u'?start=' + TODO:start_timestamp
1135 # bracket_start is 13290, but we want 51670615
1136 self._downloader.report_warning(u'Chapter detected, but we can just download the whole file. '
1137 u'Chapter starts at %s and ends at %s' % (formatSeconds(bracket_start), formatSeconds(bracket_end)))
1140 'id': u'c' + chapter_id,
1143 'title': chapter_info['title'],
1144 'thumbnail': chapter_info['preview'],
1145 'description': chapter_info['description'],
1146 'uploader': chapter_info['channel']['display_name'],
1147 'uploader_id': chapter_info['channel']['name'],
1151 video_id = mobj.group('videoid')
1152 api = api_base + '/broadcast/by_archive/%s.json' % video_id
1154 self.report_extraction(video_id)
# Page through the API _JUSTIN_PAGE_LIMIT items at a time; a short page
# (page_count != limit) means we reached the end.
1158 limit = self._JUSTIN_PAGE_LIMIT
1161 self.report_download_page(video_id, offset)
1162 page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
1163 page_count, page_info = self._parse_page(page_url, video_id)
1164 info.extend(page_info)
1165 if not paged or page_count != limit:
# Extractor for funnyordie.com video pages: scrapes the second <source>
# element for the media URL and falls back from the player <h1> to <title>
# for the video title.
1170 class FunnyOrDieIE(InfoExtractor):
1171 _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'
1173 def _real_extract(self, url):
1174 mobj = re.match(self._VALID_URL, url)
1176 raise ExtractorError(u'invalid URL: %s' % url)
1178 video_id = mobj.group('id')
1179 webpage = self._download_webpage(url, video_id)
1181 video_url = self._html_search_regex(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"',
1182 webpage, u'video URL', flags=re.DOTALL)
# Tuple of patterns: _html_search_regex tries them in order.
1184 title = self._html_search_regex((r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>",
1185 r'<title>(?P<title>[^<]+?)</title>'), webpage, 'title', flags=re.DOTALL)
1187 video_description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
1188 webpage, u'description', fatal=False, flags=re.DOTALL)
1195 'description': video_description,
# Extractor for Steam store video/app pages. Scrapes the embedded
# 'movie_<id>' JS blobs for file URLs, pairs them positionally (zip) with
# the on-page titles and thumbnails, and returns a playlist result.
1199 class SteamIE(InfoExtractor):
1200 _VALID_URL = r"""http://store\.steampowered\.com/
1202 (?P<urltype>video|app)/ #If the page is only for videos or for a game
1204 (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
1206 _VIDEO_PAGE_TEMPLATE = 'http://store.steampowered.com/video/%s/'
# Pre-filled birth date of 1970 to pass Steam's age gate automatically.
1207 _AGECHECK_TEMPLATE = 'http://store.steampowered.com/agecheck/video/%s/?snr=1_agecheck_agecheck__age-gate&ageDay=1&ageMonth=January&ageYear=1970'
# Overridden because _VALID_URL needs the re.VERBOSE flag.
# NOTE(review): the @classmethod decorator (original line 1209) is not
# visible in this listing — `cls` parameter implies it; confirm.
1210 def suitable(cls, url):
1211 """Receives a URL and returns True if suitable for this IE."""
1212 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
1214 def _real_extract(self, url):
1215 m = re.match(self._VALID_URL, url, re.VERBOSE)
1216 gameID = m.group('gameID')
1218 videourl = self._VIDEO_PAGE_TEMPLATE % gameID
1219 webpage = self._download_webpage(videourl, gameID)
# If the age gate appears, re-request via the agecheck URL.
1221 if re.search('<h2>Please enter your birth date to continue:</h2>', webpage) is not None:
1222 videourl = self._AGECHECK_TEMPLATE % gameID
1223 self.report_age_confirmation()
1224 webpage = self._download_webpage(videourl, gameID)
1226 self.report_extraction(gameID)
1227 game_title = self._html_search_regex(r'<h2 class="pageheader">(.*?)</h2>',
1228 webpage, 'game title')
1230 urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
1231 mweb = re.finditer(urlRE, webpage)
1232 namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
1233 titles = re.finditer(namesRE, webpage)
1234 thumbsRE = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'
1235 thumbs = re.finditer(thumbsRE, webpage)
# Positional pairing assumes the three iterators stay in lockstep order.
1237 for vid,vtitle,thumb in zip(mweb,titles,thumbs):
1238 video_id = vid.group('videoID')
1239 title = vtitle.group('videoName')
1240 video_url = vid.group('videoURL')
1241 video_thumb = thumb.group('thumbnail')
1243 raise ExtractorError(u'Cannot find video url for %s' % video_id)
1248 'title': unescapeHTML(title),
1249 'thumbnail': video_thumb
1252 return [self.playlist_result(videos, gameID, game_title)]
# Extractor for ustream.tv recorded videos: the media URL is derived
# directly from the numeric id (tcdn.ustream.tv); title/uploader/thumbnail
# are scraped from the HTML.
1254 class UstreamIE(InfoExtractor):
1255 _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
1256 IE_NAME = u'ustream'
1258 def _real_extract(self, url):
1259 m = re.match(self._VALID_URL, url)
1260 video_id = m.group('videoID')
1262 video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
1263 webpage = self._download_webpage(url, video_id)
1265 self.report_extraction(video_id)
1267 video_title = self._html_search_regex(r'data-title="(?P<title>.+)"',
1270 uploader = self._html_search_regex(r'data-content-type="channel".*?>(?P<uploader>.*?)</a>',
1271 webpage, u'uploader', fatal=False, flags=re.DOTALL)
1273 thumbnail = self._html_search_regex(r'<link rel="image_src" href="(?P<thumb>.*?)"',
1274 webpage, u'thumbnail', fatal=False)
1280 'title': video_title,
1281 'uploader': uploader,
1282 'thumbnail': thumbnail,
# Extractor for worldstarhiphop.com (and the worldstarcandy variant):
# pulls the flash player's file variable, then title/thumbnail, with a
# candy-specific title override.
1286 class WorldStarHipHopIE(InfoExtractor):
1287 _VALID_URL = r'https?://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
1288 IE_NAME = u'WorldStarHipHop'
1290 def _real_extract(self, url):
1291 m = re.match(self._VALID_URL, url)
1292 video_id = m.group('id')
1294 webpage_src = self._download_webpage(url, video_id)
1296 video_url = self._search_regex(r'so\.addVariable\("file","(.*?)"\)',
1297 webpage_src, u'video URL')
# Extension selection branches on whether the URL contains 'mp4';
# the branch bodies are not visible in this listing.
1299 if 'mp4' in video_url:
1304 video_title = self._html_search_regex(r"<title>(.*)</title>",
1305 webpage_src, u'title')
1307 # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video.
1308 thumbnail = self._html_search_regex(r'rel="image_src" href="(.*)" />',
1309 webpage_src, u'thumbnail', fatal=False)
# Candy pages carry the real title in a 'candytitles' span; prefer it.
1312 _title = r"""candytitles.*>(.*)</span>"""
1313 mobj = re.search(_title, webpage_src)
1314 if mobj is not None:
1315 video_title = mobj.group(1)
1320 'title' : video_title,
1321 'thumbnail' : thumbnail,
# Extractor for rbmaradio.com shows: show metadata lives in a JS
# `gon.show=...` JSON assignment; the audio URL is the akamai_url with a
# fixed 256kbps cbr parameter appended.
1326 class RBMARadioIE(InfoExtractor):
1327 _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'
1329 def _real_extract(self, url):
1330 m = re.match(self._VALID_URL, url)
1331 video_id = m.group('videoID')
1333 webpage = self._download_webpage(url, video_id)
1335 json_data = self._search_regex(r'window\.gon.*?gon\.show=(.+?);$',
1336 webpage, u'json data', flags=re.MULTILINE)
1339 data = json.loads(json_data)
1340 except ValueError as e:
1341 raise ExtractorError(u'Invalid JSON: ' + str(e))
1343 video_url = data['akamai_url'] + '&cbr=256'
# Audio extension taken from the URL path suffix.
1344 url_parts = compat_urllib_parse_urlparse(video_url)
1345 video_ext = url_parts.path.rpartition('.')[2]
1350 'title': data['title'],
1351 'description': data.get('teaser_text'),
1352 'location': data.get('country_of_origin'),
1353 'uploader': data.get('host', {}).get('name'),
1354 'uploader_id': data.get('host', {}).get('slug'),
1355 'thumbnail': data.get('image', {}).get('large_url_2x'),
1356 'duration': data.get('duration'),
# Extractor for youporn.com: bypasses the age gate with a cookie, reads a
# JSON 'currentVideo' blob for metadata, scrapes the downloadList <ul> for
# per-quality links, and applies the user's requested format selection.
1361 class YouPornIE(InfoExtractor):
1362 """Information extractor for youporn.com."""
1363 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'
# Print a table of available formats to stdout (used by --list-formats).
1365 def _print_formats(self, formats):
1366 """Print all available formats"""
1367 print(u'Available formats:')
1368 print(u'ext\t\tformat')
1369 print(u'---------------------------------')
1370 for format in formats:
1371 print(u'%s\t\t%s' % (format['ext'], format['format']))
# Select the single entry whose 'format' equals req_format; the loop
# header and return lines are not visible in this listing.
1373 def _specific(self, req_format, formats):
1375 if(x["format"]==req_format):
1379 def _real_extract(self, url):
1380 mobj = re.match(self._VALID_URL, url)
1382 raise ExtractorError(u'Invalid URL: %s' % url)
1383 video_id = mobj.group('videoid')
# The age_verified cookie skips the age confirmation page.
1385 req = compat_urllib_request.Request(url)
1386 req.add_header('Cookie', 'age_verified=1')
1387 webpage = self._download_webpage(req, video_id)
1389 # Get JSON parameters
1390 json_params = self._search_regex(r'var currentVideo = new Video\((.*)\);', webpage, u'JSON parameters')
1392 params = json.loads(json_params)
1394 raise ExtractorError(u'Invalid JSON')
1396 self.report_extraction(video_id)
1398 video_title = params['title']
1399 upload_date = unified_strdate(params['release_date_f'])
1400 video_description = params['description']
1401 video_uploader = params['submitted_by']
1402 thumbnail = params['thumbnails'][0]['image']
# NOTE(review): 'str' + sys.exc_info()[1] concatenates a str with an
# exception object and would itself raise TypeError — should be
# compat_str(sys.exc_info()[1]); fix when the full file is in view.
1404 raise ExtractorError('Missing JSON parameter: ' + sys.exc_info()[1])
1406 # Get all of the formats available
1407 DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
1408 download_list_html = self._search_regex(DOWNLOAD_LIST_RE,
1409 webpage, u'download list').strip()
1411 # Get all of the links from the page
1412 LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
1413 links = re.findall(LINK_RE, download_list_html)
1414 if(len(links) == 0):
1415 raise ExtractorError(u'ERROR: no known formats available for video')
1417 self.to_screen(u'Links found: %d' % len(links))
1422 # A link looks like this:
1423 # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
1424 # A path looks like this:
1425 # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
1426 video_url = unescapeHTML( link )
1427 path = compat_urllib_parse_urlparse( video_url ).path
# Path component 4 encodes resolution_bitrate, e.g. '480p_370k'.
1428 extension = os.path.splitext( path )[1][1:]
1429 format = path.split('/')[4].split('_')[:2]
1432 format = "-".join( format )
1433 # title = u'%s-%s-%s' % (video_title, size, bitrate)
1438 'uploader': video_uploader,
1439 'upload_date': upload_date,
1440 'title': video_title,
1443 'thumbnail': thumbnail,
1444 'description': video_description
# Format selection: honor --list-formats, then best/worst/all/specific.
1447 if self._downloader.params.get('listformats', None):
1448 self._print_formats(formats)
1451 req_format = self._downloader.params.get('format', None)
1452 self.to_screen(u'Format: %s' % req_format)
1454 if req_format is None or req_format == 'best':
1456 elif req_format == 'worst':
1457 return [formats[-1]]
1458 elif req_format in ('-1', 'all'):
1461 format = self._specific( req_format, formats )
1463 raise ExtractorError(u'Requested format not available')
# Extractor for pornotube.com: flv URL comes from a JS 'url:' assignment
# (percent-encoded), upload date from the "Added ... by" byline.
1468 class PornotubeIE(InfoExtractor):
1469 """Information extractor for pornotube.com."""
1470 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'
1472 def _real_extract(self, url):
1473 mobj = re.match(self._VALID_URL, url)
1475 raise ExtractorError(u'Invalid URL: %s' % url)
# Title is taken from the URL itself, not the page.
1477 video_id = mobj.group('videoid')
1478 video_title = mobj.group('title')
1480 # Get webpage content
1481 webpage = self._download_webpage(url, video_id)
1484 VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
1485 video_url = self._search_regex(VIDEO_URL_RE, webpage, u'video url')
1486 video_url = compat_urllib_parse.unquote(video_url)
1488 #Get the uploaded date
1489 VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
1490 upload_date = self._html_search_regex(VIDEO_UPLOADED_RE, webpage, u'upload date', fatal=False)
1491 if upload_date: upload_date = unified_strdate(upload_date)
1493 info = {'id': video_id,
1496 'upload_date': upload_date,
1497 'title': video_title,
# Extractor for youjizz.com: resolves the embed page first, then reads the
# flash player's encodeURIComponent'd file variable from it.
1503 class YouJizzIE(InfoExtractor):
1504 """Information extractor for youjizz.com."""
1505 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'
1507 def _real_extract(self, url):
1508 mobj = re.match(self._VALID_URL, url)
1510 raise ExtractorError(u'Invalid URL: %s' % url)
1512 video_id = mobj.group('videoid')
1514 # Get webpage content
1515 webpage = self._download_webpage(url, video_id)
1517 # Get the video title
1518 video_title = self._html_search_regex(r'<title>(?P<title>.*)</title>',
1519 webpage, u'title').strip()
1521 # Get the embed page
1522 result = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
1524 raise ExtractorError(u'ERROR: unable to extract embed page')
# video_id is replaced by the numeric id from the embed URL.
1526 embed_page_url = result.group(0).strip()
1527 video_id = result.group('videoid')
1529 webpage = self._download_webpage(embed_page_url, video_id)
1532 video_url = self._search_regex(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);',
1533 webpage, u'video URL')
1535 info = {'id': video_id,
1537 'title': video_title,
1540 'player_url': embed_page_url}
# Extractor for 8tracks.com mixes: reads the PAGE.mix JSON from the page,
# then walks the play/next API with a random session id, collecting one
# track per iteration until at_last_track.
1544 class EightTracksIE(InfoExtractor):
1546 _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'
1548 def _real_extract(self, url):
1549 mobj = re.match(self._VALID_URL, url)
1551 raise ExtractorError(u'Invalid URL: %s' % url)
1552 playlist_id = mobj.group('id')
1554 webpage = self._download_webpage(url, playlist_id)
1556 json_like = self._search_regex(r"PAGE.mix = (.*?);\n", webpage, u'trax information', flags=re.DOTALL)
1557 data = json.loads(json_like)
# Random session id is required by the 8tracks play API.
1559 session = str(random.randint(0, 1000000000))
# NOTE(review): `mix_id` is used below but its assignment (presumably
# mix_id = data['id']) is not visible in this listing — confirm.
1561 track_count = data['tracks_count']
1562 first_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
1563 next_url = first_url
1565 for i in itertools.count():
1566 api_json = self._download_webpage(next_url, playlist_id,
1567 note=u'Downloading song information %s/%s' % (str(i+1), track_count),
1568 errnote=u'Failed to download song information')
1569 api_data = json.loads(api_json)
1570 track_data = api_data[u'set']['track']
1572 'id': track_data['id'],
1573 'url': track_data['track_file_stream_url'],
1574 'title': track_data['performer'] + u' - ' + track_data['name'],
1575 'raw_title': track_data['name'],
1576 'uploader_id': data['user']['login'],
# Stop once the API reports the final track of the set.
1580 if api_data['set']['at_last_track']:
1582 next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
# Extractor for keek.com: media and thumbnail URLs are built directly
# from the video id against cdn.keek.com; title/uploader scraped from HTML.
1585 class KeekIE(InfoExtractor):
1586 _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
1589 def _real_extract(self, url):
1590 m = re.match(self._VALID_URL, url)
1591 video_id = m.group('videoID')
1593 video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
1594 thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id
1595 webpage = self._download_webpage(url, video_id)
1597 video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
1600 uploader = self._html_search_regex(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>',
1601 webpage, u'uploader', fatal=False)
1607 'title': video_title,
1608 'thumbnail': thumbnail,
1609 'uploader': uploader
# Extractor for ted.com talks and playlists. Playlist pages are expanded
# into url_result entries (re-dispatched to this IE); single talks read
# the embedded talkDetails JSON and take the last htmlStreams entry.
1613 class TEDIE(InfoExtractor):
1614 _VALID_URL=r'''http://www\.ted\.com/
1616 ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
1618 ((?P<type_talk>talks)) # We have a simple talk
1620 (/lang/(.*?))? # The url may contain the language
1621 /(?P<name>\w+) # Here goes the name and then ".html"
# Overridden because _VALID_URL needs the re.VERBOSE flag.
# NOTE(review): the @classmethod decorator is not visible in this
# listing — `cls` parameter implies it; confirm.
1625 def suitable(cls, url):
1626 """Receives a URL and returns True if suitable for this IE."""
1627 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
1629 def _real_extract(self, url):
1630 m=re.match(self._VALID_URL, url, re.VERBOSE)
1631 if m.group('type_talk'):
1632 return [self._talk_info(url)]
1634 playlist_id=m.group('playlist_id')
1635 name=m.group('name')
1636 self.to_screen(u'Getting info of playlist %s: "%s"' % (playlist_id,name))
1637 return [self._playlist_videos_info(url,name,playlist_id)]
1639 def _playlist_videos_info(self,url,name,playlist_id=0):
1640 '''Returns the videos of the playlist'''
# video_RE body (verbose, multi-line); the r''' opener is not visible here.
1642 <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
1643 ([.\s]*?)data-playlist_item_id="(\d+)"
1644 ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
1646 video_name_RE=r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
1647 webpage=self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
1648 m_videos=re.finditer(video_RE,webpage,re.VERBOSE)
1649 m_names=re.finditer(video_name_RE,webpage)
1651 playlist_title = self._html_search_regex(r'div class="headline">\s*?<h1>\s*?<span>(.*?)</span>',
1652 webpage, 'playlist title')
# Each playlist entry is handed back to the TED extractor via url_result.
1654 playlist_entries = []
1655 for m_video, m_name in zip(m_videos,m_names):
1656 video_id=m_video.group('video_id')
1657 talk_url='http://www.ted.com%s' % m_name.group('talk_url')
1658 playlist_entries.append(self.url_result(talk_url, 'TED'))
1659 return self.playlist_result(playlist_entries, playlist_id = playlist_id, playlist_title = playlist_title)
1661 def _talk_info(self, url, video_id=0):
1662 """Return the video for the talk in the url"""
1663 m = re.match(self._VALID_URL, url,re.VERBOSE)
1664 video_name = m.group('name')
1665 webpage = self._download_webpage(url, video_id, 'Downloading \"%s\" page' % video_name)
1666 self.report_extraction(video_name)
1667 # If the url includes the language we get the title translated
1668 title = self._html_search_regex(r'<span id="altHeadline" >(?P<title>.*)</span>',
1670 json_data = self._search_regex(r'<script.*?>var talkDetails = ({.*?})</script>',
1671 webpage, 'json data')
1672 info = json.loads(json_data)
1673 desc = self._html_search_regex(r'<div class="talk-intro">.*?<p.*?>(.*?)</p>',
1674 webpage, 'description', flags = re.DOTALL)
1676 thumbnail = self._search_regex(r'</span>[\s.]*</div>[\s.]*<img src="(.*?)"',
1677 webpage, 'thumbnail')
# Last htmlStreams entry is used — presumably the highest quality; confirm.
1680 'url': info['htmlStreams'][-1]['file'],
1683 'thumbnail': thumbnail,
1684 'description': desc,
# Extractor for myspass.de: derives the video id from the URL path (with a
# trailing-slash fallback), then reads all metadata from the site's XML
# metadata endpoint, guarding each element lookup individually.
1689 class MySpassIE(InfoExtractor):
1690 _VALID_URL = r'http://www.myspass.de/.*'
1691 def _real_extract(self, url):
1692 META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'
1694 # video id is the last path element of the URL
1695 # usually there is a trailing slash, so also try the second but last
1696 url_path = compat_urllib_parse_urlparse(url).path
1697 url_parent_path, video_id = os.path.split(url_path)
1699 _, video_id = os.path.split(url_parent_path)
1702 metadata_url = META_DATA_URL_TEMPLATE % video_id
1703 metadata_text = self._download_webpage(metadata_url, video_id)
1704 metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))
1706 # extract values from metadata
1707 url_flv_el = metadata.find('url_flv')
1708 if url_flv_el is None:
1709 raise ExtractorError(u'Unable to extract download url')
1710 video_url = url_flv_el.text
1711 extension = os.path.splitext(video_url)[1][1:]
1712 title_el = metadata.find('title')
1713 if title_el is None:
1714 raise ExtractorError(u'Unable to extract title')
1715 title = title_el.text
1716 format_id_el = metadata.find('format_id')
1717 if format_id_el is None:
1720 format = format_id_el.text
# description and imagePreview are optional in the XML.
1721 description_el = metadata.find('description')
1722 if description_el is not None:
1723 description = description_el.text
1726 imagePreview_el = metadata.find('imagePreview')
1727 if imagePreview_el is not None:
1728 thumbnail = imagePreview_el.text
1737 'thumbnail': thumbnail,
1738 'description': description
# Extractor for spiegel.de videos: title from the page, then the flash XML
# manifest; the last <type> entry in the manifest supplies filename and
# duration (presumably the best variant — confirm).
1742 class SpiegelIE(InfoExtractor):
1743 _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'
1745 def _real_extract(self, url):
1746 m = re.match(self._VALID_URL, url)
1747 video_id = m.group('videoID')
1749 webpage = self._download_webpage(url, video_id)
1751 video_title = self._html_search_regex(r'<div class="module-title">(.*?)</div>',
1754 xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
1755 xml_code = self._download_webpage(xml_url, video_id,
1756 note=u'Downloading XML', errnote=u'Failed to download XML')
1758 idoc = xml.etree.ElementTree.fromstring(xml_code)
1759 last_type = idoc[-1]
1760 filename = last_type.findall('./filename')[0].text
1761 duration = float(last_type.findall('./duration')[0].text)
1763 video_url = 'http://video2.spiegel.de/flash/' + filename
1764 video_ext = filename.rpartition('.')[2]
1769 'title': video_title,
1770 'duration': duration,
# Extractor for liveleak.com: media URL from a JS 'file:' assignment,
# metadata from OpenGraph tags; the site prefix is stripped from the title.
1774 class LiveLeakIE(InfoExtractor):
1776 _VALID_URL = r'^(?:http?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
1777 IE_NAME = u'liveleak'
1779 def _real_extract(self, url):
1780 mobj = re.match(self._VALID_URL, url)
1782 raise ExtractorError(u'Invalid URL: %s' % url)
1784 video_id = mobj.group('video_id')
1786 webpage = self._download_webpage(url, video_id)
1788 video_url = self._search_regex(r'file: "(.*?)",',
1789 webpage, u'video URL')
1791 video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
1792 webpage, u'title').replace('LiveLeak.com -', '').strip()
1794 video_description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
1795 webpage, u'description', fatal=False)
1797 video_uploader = self._html_search_regex(r'By:.*?(\w+)</a>',
1798 webpage, u'uploader', fatal=False)
1804 'title': video_title,
1805 'description': video_description,
1806 'uploader': video_uploader
# Extractor for Tumblr video posts: normalizes the URL to the canonical
# post form, then scrapes the \x22-escaped embed markup for the video_file
# URL, extension, and first poster thumbnail.
1813 class TumblrIE(InfoExtractor):
1814 _VALID_URL = r'http://(?P<blog_name>.*?)\.tumblr\.com/((post)|(video))/(?P<id>\d*)/(.*?)'
1816 def _real_extract(self, url):
1817 m_url = re.match(self._VALID_URL, url)
1818 video_id = m_url.group('id')
1819 blog = m_url.group('blog_name')
1821 url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id)
1822 webpage = self._download_webpage(url, video_id)
# The embed markup is JS-escaped, hence the literal \x22 (= '"') tokens.
1824 re_video = r'src=\\x22(?P<video_url>http://%s\.tumblr\.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P<ext>.*?)\\x22' % (blog, video_id)
1825 video = re.search(re_video, webpage)
1827 raise ExtractorError(u'Unable to extract video')
1828 video_url = video.group('video_url')
1829 ext = video.group('ext')
1831 video_thumbnail = self._search_regex(r'posters(.*?)\[\\x22(?P<thumb>.*?)\\x22',
1832 webpage, u'thumbnail', fatal=False) # We pick the first poster
1833 if video_thumbnail: video_thumbnail = video_thumbnail.replace('\\', '')
1835 # The only place where you can get a title, it's not complete,
1836 # but searching in other places doesn't work for all videos
1837 video_title = self._html_search_regex(r'<title>(?P<title>.*?)</title>',
1838 webpage, u'title', flags=re.DOTALL)
1840 return [{'id': video_id,
1842 'title': video_title,
1843 'thumbnail': video_thumbnail,
# Extractor for free Bandcamp tracks: finds the free-download page, pulls
# the track's mp3-320 entry from the embedded 'items' JSON, then rebuilds
# a working statdownload URL (the initial signed URL expires).
1847 class BandcampIE(InfoExtractor):
1848 _VALID_URL = r'http://.*?\.bandcamp\.com/track/(?P<title>.*)'
1850 def _real_extract(self, url):
1851 mobj = re.match(self._VALID_URL, url)
1852 title = mobj.group('title')
1853 webpage = self._download_webpage(url, title)
1854 # We get the link to the free download page
1855 m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage)
1856 if m_download is None:
1857 raise ExtractorError(u'No free songs found')
1859 download_link = m_download.group(1)
# NOTE(review): `id` shadows the builtin; rename when the full file is
# editable.
1860 id = re.search(r'var TralbumData = {(.*?)id: (?P<id>\d*?)$',
1861 webpage, re.MULTILINE|re.DOTALL).group('id')
1863 download_webpage = self._download_webpage(download_link, id,
1864 'Downloading free downloads page')
1865 # We get the dictionary of the track from some javascrip code
1866 info = re.search(r'items: (.*?),$',
1867 download_webpage, re.MULTILINE).group(1)
1868 info = json.loads(info)[0]
1869 # We pick mp3-320 for now, until format selection can be easily implemented.
1870 mp3_info = info[u'downloads'][u'mp3-320']
1871 # If we try to use this url it says the link has expired
1872 initial_url = mp3_info[u'url']
1873 re_url = r'(?P<server>http://(.*?)\.bandcamp\.com)/download/track\?enc=mp3-320&fsig=(?P<fsig>.*?)&id=(?P<id>.*?)&ts=(?P<ts>.*)$'
1874 m_url = re.match(re_url, initial_url)
1875 #We build the url we will use to get the final track url
1876 # This url is build in Bandcamp in the script download_bunde_*.js
1877 request_url = '%s/statdownload/track?enc=mp3-320&fsig=%s&id=%s&ts=%s&.rand=665028774616&.vrs=1' % (m_url.group('server'), m_url.group('fsig'), id, m_url.group('ts'))
1878 final_url_webpage = self._download_webpage(request_url, id, 'Requesting download url')
1879 # If we could correctly generate the .rand field the url would be
1880 #in the "download_url" key
1881 final_url = re.search(r'"retry_url":"(.*?)"', final_url_webpage).group(1)
1883 track_info = {'id':id,
1884 'title' : info[u'title'],
1887 'thumbnail' : info[u'thumb_url'],
1888 'uploader' : info[u'artist']
# Extractor for redtube.com: mp4 URL from the HTML5 <source> tag, title
# from the page heading.
1894 class RedTubeIE(InfoExtractor):
1895 """Information Extractor for redtube"""
1896 _VALID_URL = r'(?:http://)?(?:www\.)?redtube\.com/(?P<id>[0-9]+)'
1897 def _real_extract(self,url):
1898 mobj = re.match(self._VALID_URL, url)
1900 raise ExtractorError(u'Invalid URL: %s' % url)
1902 video_id = mobj.group('id')
1903 video_extension = 'mp4'
1904 webpage = self._download_webpage(url, video_id)
1906 self.report_extraction(video_id)
1908 video_url = self._html_search_regex(r'<source src="(.+?)" type="video/mp4">',
1909 webpage, u'video URL')
1911 video_title = self._html_search_regex('<h1 class="videoTitle slidePanelMovable">(.+?)</h1>',
1917 'ext': video_extension,
1918 'title': video_title,
# Extractor for ina.fr: fetches the player's MRSS notice for the video and
# reads the mp4 URL from <media:player> and the title from a CDATA block.
1921 class InaIE(InfoExtractor):
1922 """Information Extractor for Ina.fr"""
1923 _VALID_URL = r'(?:http://)?(?:www\.)?ina\.fr/video/(?P<id>I[0-9]+)/.*'
1925 def _real_extract(self,url):
1926 mobj = re.match(self._VALID_URL, url)
1928 video_id = mobj.group('id')
1929 mrss_url='http://player.ina.fr/notices/%s.mrss' % video_id
1930 video_extension = 'mp4'
1931 webpage = self._download_webpage(mrss_url, video_id)
1933 self.report_extraction(video_id)
1935 video_url = self._html_search_regex(r'<media:player url="(?P<mp4url>http://mp4.ina.fr/[^"]+\.mp4)',
1936 webpage, u'video URL')
1938 video_title = self._search_regex(r'<title><!\[CDATA\[(?P<titre>.*?)]]></title>',
1944 'ext': video_extension,
1945 'title': video_title,
# Extractor for howcast.com: canonicalizes the URL by id, reads the mobile
# mp4 URL from a JS 'file' assignment and metadata from meta tags.
1948 class HowcastIE(InfoExtractor):
1949 """Information Extractor for Howcast.com"""
1950 _VALID_URL = r'(?:https?://)?(?:www\.)?howcast\.com/videos/(?P<id>\d+)'
1952 def _real_extract(self, url):
1953 mobj = re.match(self._VALID_URL, url)
1955 video_id = mobj.group('id')
1956 webpage_url = 'http://www.howcast.com/videos/' + video_id
1957 webpage = self._download_webpage(webpage_url, video_id)
1959 self.report_extraction(video_id)
1961 video_url = self._search_regex(r'\'?file\'?: "(http://mobile-media\.howcast\.com/[0-9]+\.mp4)',
1962 webpage, u'video URL')
# Meta tags may use either quote style around content, hence the alternation.
1964 video_title = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') property=\'og:title\'',
1967 video_description = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') name=\'description\'',
1968 webpage, u'description', fatal=False)
1970 thumbnail = self._html_search_regex(r'<meta content=\'(.+?)\' property=\'og:image\'',
1971 webpage, u'thumbnail', fatal=False)
1977 'title': video_title,
1978 'description': video_description,
1979 'thumbnail': thumbnail,
# Extractor for vine.co: canonical URL by id, media from the
# twitter:player:stream meta tag, metadata from OpenGraph tags.
1982 class VineIE(InfoExtractor):
1983 """Information Extractor for Vine.co"""
1984 _VALID_URL = r'(?:https?://)?(?:www\.)?vine\.co/v/(?P<id>\w+)'
1986 def _real_extract(self, url):
1987 mobj = re.match(self._VALID_URL, url)
1989 video_id = mobj.group('id')
1990 webpage_url = 'https://vine.co/v/' + video_id
1991 webpage = self._download_webpage(webpage_url, video_id)
1993 self.report_extraction(video_id)
1995 video_url = self._html_search_regex(r'<meta property="twitter:player:stream" content="(.+?)"',
1996 webpage, u'video URL')
1998 video_title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',
# Query string on the og:image URL is dropped by the second group.
2001 thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)(\?.*?)?"',
2002 webpage, u'thumbnail', fatal=False)
2004 uploader = self._html_search_regex(r'<div class="user">.*?<h2>(.+?)</h2>',
2005 webpage, u'uploader', fatal=False, flags=re.DOTALL)
2011 'title': video_title,
2012 'thumbnail': thumbnail,
2013 'uploader': uploader,
# Extractor for Flickr videos: a two-step XML handshake — first fetch the
# photo 'secret' from the page, then the node_id from video_mtl_xml.gne,
# then the playlist XML whose <STREAM> element yields the final URL.
2016 class FlickrIE(InfoExtractor):
2017 """Information Extractor for Flickr videos"""
2018 _VALID_URL = r'(?:https?://)?(?:www\.)?flickr\.com/photos/(?P<uploader_id>[\w\-_@]+)/(?P<id>\d+).*'
2020 def _real_extract(self, url):
2021 mobj = re.match(self._VALID_URL, url)
2023 video_id = mobj.group('id')
2024 video_uploader_id = mobj.group('uploader_id')
2025 webpage_url = 'http://www.flickr.com/photos/' + video_uploader_id + '/' + video_id
2026 webpage = self._download_webpage(webpage_url, video_id)
2028 secret = self._search_regex(r"photo_secret: '(\w+)'", webpage, u'secret')
2030 first_url = 'https://secure.flickr.com/apps/video/video_mtl_xml.gne?v=x&photo_id=' + video_id + '&secret=' + secret + '&bitrate=700&target=_self'
2031 first_xml = self._download_webpage(first_url, video_id, 'Downloading first data webpage')
2033 node_id = self._html_search_regex(r'<Item id="id">(\d+-\d+)</Item>',
2034 first_xml, u'node_id')
2036 second_url = 'https://secure.flickr.com/video_playlist.gne?node_id=' + node_id + '&tech=flash&mode=playlist&bitrate=700&secret=' + secret + '&rd=video.yahoo.com&noad=1'
2037 second_xml = self._download_webpage(second_url, video_id, 'Downloading second data webpage')
2039 self.report_extraction(video_id)
# Final URL = APP attribute + HTML-unescaped FULLPATH attribute.
2041 mobj = re.search(r'<STREAM APP="(.+?)" FULLPATH="(.+?)"', second_xml)
2043 raise ExtractorError(u'Unable to extract video url')
2044 video_url = mobj.group(1) + unescapeHTML(mobj.group(2))
2046 video_title = self._html_search_regex(r'<meta property="og:title" content=(?:"([^"]+)"|\'([^\']+)\')',
2047 webpage, u'video title')
2049 video_description = self._html_search_regex(r'<meta property="og:description" content=(?:"([^"]+)"|\'([^\']+)\')',
2050 webpage, u'description', fatal=False)
2052 thumbnail = self._html_search_regex(r'<meta property="og:image" content=(?:"([^"]+)"|\'([^\']+)\')',
2053 webpage, u'thumbnail', fatal=False)
2059 'title': video_title,
2060 'description': video_description,
2061 'thumbnail': thumbnail,
2062 'uploader_id': video_uploader_id,
# Extractor for teamcoco.com: the numeric id is scraped from the article
# markup (the URL only carries a slug), then the CVP XML supplies the
# high-quality file URL.
2065 class TeamcocoIE(InfoExtractor):
2066 _VALID_URL = r'http://teamcoco\.com/video/(?P<url_title>.*)'
2068 def _real_extract(self, url):
2069 mobj = re.match(self._VALID_URL, url)
2071 raise ExtractorError(u'Invalid URL: %s' % url)
2072 url_title = mobj.group('url_title')
2073 webpage = self._download_webpage(url, url_title)
2075 video_id = self._html_search_regex(r'<article class="video" data-id="(\d+?)"',
2076 webpage, u'video id')
2078 self.report_extraction(video_id)
2080 video_title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',
2083 thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)"',
2084 webpage, u'thumbnail', fatal=False)
2086 video_description = self._html_search_regex(r'<meta property="og:description" content="(.*?)"',
2087 webpage, u'description', fatal=False)
2089 data_url = 'http://teamcoco.com/cvp/2.0/%s.xml' % video_id
2090 data = self._download_webpage(data_url, video_id, 'Downloading data webpage')
2092 video_url = self._html_search_regex(r'<file type="high".*?>(.*?)</file>',
2099 'title': video_title,
2100 'thumbnail': thumbnail,
2101 'description': video_description,
# Extractor for xhamster.com: media URL assembled from the player's
# 'srv'/'file' JS variables (file may be a full percent-encoded URL or a
# key relative to srv); upload date parsed from a tooltip hint attribute.
2104 class XHamsterIE(InfoExtractor):
2105 """Information Extractor for xHamster"""
2106 _VALID_URL = r'(?:http://)?(?:www.)?xhamster\.com/movies/(?P<id>[0-9]+)/.*\.html'
2108 def _real_extract(self,url):
2109 mobj = re.match(self._VALID_URL, url)
2111 video_id = mobj.group('id')
2112 mrss_url = 'http://xhamster.com/movies/%s/.html' % video_id
2113 webpage = self._download_webpage(mrss_url, video_id)
2115 mobj = re.search(r'\'srv\': \'(?P<server>[^\']*)\',\s*\'file\': \'(?P<file>[^\']+)\',', webpage)
2117 raise ExtractorError(u'Unable to extract media URL')
# Empty server means 'file' already holds the (quoted) absolute URL.
2118 if len(mobj.group('server')) == 0:
2119 video_url = compat_urllib_parse.unquote(mobj.group('file'))
2121 video_url = mobj.group('server')+'/key='+mobj.group('file')
2122 video_extension = video_url.split('.')[-1]
2124 video_title = self._html_search_regex(r'<title>(?P<title>.+?) - xHamster\.com</title>',
2127 # Can't see the description anywhere in the UI
2128 # video_description = self._html_search_regex(r'<span>Description: </span>(?P<description>[^<]+)',
2129 # webpage, u'description', fatal=False)
2130 # if video_description: video_description = unescapeHTML(video_description)
# Upload date lives in a hover-hint timestamp; join Y+m+d into YYYYMMDD.
2132 mobj = re.search(r'hint=\'(?P<upload_date_Y>[0-9]{4})-(?P<upload_date_m>[0-9]{2})-(?P<upload_date_d>[0-9]{2}) [0-9]{2}:[0-9]{2}:[0-9]{2} [A-Z]{3,4}\'', webpage)
2134 video_upload_date = mobj.group('upload_date_Y')+mobj.group('upload_date_m')+mobj.group('upload_date_d')
2136 video_upload_date = None
2137 self._downloader.report_warning(u'Unable to extract upload date')
2139 video_uploader_id = self._html_search_regex(r'<a href=\'/user/[^>]+>(?P<uploader_id>[^<]+)',
2140 webpage, u'uploader id', default=u'anonymous')
2142 video_thumbnail = self._search_regex(r'\'image\':\'(?P<thumbnail>[^\']+)\'',
2143 webpage, u'thumbnail', fatal=False)
2148 'ext': video_extension,
2149 'title': video_title,
2150 # 'description': video_description,
2151 'upload_date': video_upload_date,
2152 'uploader_id': video_uploader_id,
2153 'thumbnail': video_thumbnail
class HypemIE(InfoExtractor):
    """Information Extractor for hypem"""
    _VALID_URL = r'(?:http://)?(?:www\.)?hypem\.com/track/([^/]+)/([^/]+)'

    def _real_extract(self, url):
        match = re.match(self._VALID_URL, url)
        if match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        track_id = match.group(1)

        # Request the track page with a timestamped query so we also receive
        # the session cookie required later by the serve endpoint.
        query = compat_urllib_parse.urlencode({ 'ax': 1, 'ts': time.time() })
        request = compat_urllib_request.Request(url + "?" + query)
        response, urlh = self._download_webpage_handle(request, track_id, u'Downloading webpage with the url')
        cookie = urlh.headers.get('Set-Cookie', '')

        self.report_extraction(track_id)

        # The page embeds its playlist as JSON inside a <script> block.
        html_tracks = self._html_search_regex(r'<script type="application/json" id="displayList-data">(.*?)</script>',
            response, u'tracks', flags=re.MULTILINE|re.DOTALL).strip()
        try:
            track = json.loads(html_tracks)[u'tracks'][0]
        except ValueError:
            raise ExtractorError(u'Hypemachine contained invalid JSON.')

        key = track[u"key"]
        track_id = track[u"id"]
        artist = track[u"artist"]
        title = track[u"song"]

        # Ask the serve endpoint (authenticated via the page cookie) for the
        # final media URL.
        serve_url = "http://hypem.com/serve/source/%s/%s" % (compat_str(track_id), compat_str(key))
        request = compat_urllib_request.Request(serve_url, "" , {'Content-Type': 'application/json'})
        request.add_header('cookie', cookie)
        song_data_json = self._download_webpage(request, track_id, u'Downloading metadata')
        try:
            song_data = json.loads(song_data_json)
        except ValueError:
            raise ExtractorError(u'Hypemachine contained invalid JSON.')
        final_url = song_data[u"url"]

        return [{
            'id':       track_id,
            'url':      final_url,
            'ext':      "mp3",
            'title':    title,
            'artist':   artist,
        }]
class Vbox7IE(InfoExtractor):
    """Information Extractor for Vbox7"""
    _VALID_URL = r'(?:http://)?(?:www\.)?vbox7\.com/play:([^/]+)'

    def _real_extract(self, url):
        match = re.match(self._VALID_URL, url)
        if match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = match.group(1)

        # The play page only contains a JavaScript redirect; follow it by hand.
        redirect_page, urlh = self._download_webpage_handle(url, video_id)
        new_location = self._search_regex(r'window\.location = \'(.*)\';', redirect_page, u'redirect location')
        webpage = self._download_webpage(urlh.geturl() + new_location, video_id, u'Downloading redirect page')

        # The <title> has the form "video title/uploader"; keep only the title.
        title = self._html_search_regex(r'<title>(.*)</title>',
            webpage, u'title').split('/')[0].strip()

        # Ask the player backend for the final media and thumbnail URLs.
        info_request = compat_urllib_request.Request(
            "http://vbox7.com/play/magare.do",
            compat_urllib_parse.urlencode({'as3':'1','vid':video_id}))
        info_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
        info_response = self._download_webpage(info_request, video_id, u'Downloading info webpage')
        if info_response is None:
            raise ExtractorError(u'Unable to extract the media url')
        # The response looks like "url=...&thumb=..."; keep only the values.
        final_url, thumbnail_url = [field.split('=')[1] for field in info_response.split('&')]

        return [{
            'id':        video_id,
            'url':       final_url,
            'ext':       "flv",
            'title':     title,
            'thumbnail': thumbnail_url,
        }]
# NOTE(review): only a fragment of this function is visible in this view —
# most of the extractor-instance list (and the return statement) is not shown.
# It builds the ordered list of extractor instances; ordering determines which
# extractor handles a given URL, so do not reorder entries casually.
def gen_extractors():
    """ Return a list of an instance of every supported extractor.
    The order does matter; the first extractor matched is the one handling the URL.
        YoutubePlaylistIE(),
        StanfordOpenClassroomIE(),
        WorldStarHipHopIE(),
def get_info_extractor(ie_name):
    """Returns the info extractor class with the given ie_name"""
    # Extractor classes follow the "<Name>IE" naming convention, so the class
    # can be looked up directly in this module's namespace.
    return globals()['%sIE' % ie_name]