10 import xml.etree.ElementTree
19 from .extractor.common import InfoExtractor, SearchInfoExtractor
21 from .extractor.ard import ARDIE
22 from .extractor.arte import ArteTvIE
23 from .extractor.bliptv import BlipTVIE, BlipTVUserIE
24 from .extractor.comedycentral import ComedyCentralIE
25 from .extractor.dailymotion import DailymotionIE
26 from .extractor.gametrailers import GametrailersIE
27 from .extractor.generic import GenericIE
28 from .extractor.metacafe import MetacafeIE
29 from .extractor.myvideo import MyVideoIE
30 from .extractor.statigram import StatigramIE
31 from .extractor.photobucket import PhotobucketIE
32 from .extractor.vimeo import VimeoIE
33 from .extractor.yahoo import YahooIE, YahooSearchIE
34 from .extractor.youtube import YoutubeIE, YoutubePlaylistIE, YoutubeSearchIE, YoutubeUserIE, YoutubeChannelIE
35 from .extractor.zdf import ZDFIE
# DepositFilesIE — resolves a depositfiles.com file page to a direct download
# URL by re-requesting the page with the 'gateway_result' form field set
# (simulating the "Free download" button) and scraping the fileshare form action.
# NOTE(review): this listing embeds original line numbers and has lines elided
# (e.g. the `try:` that must precede line 69, and the `return [{...}]` wrapper
# around lines 92-97) — confirm against the full source before editing.
55 class DepositFilesIE(InfoExtractor):
56 """Information extractor for depositfiles.com"""
58 _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
60 def _real_extract(self, url):
61 file_id = url.split('/')[-1]
62 # Rebuild url in english locale
63 url = 'http://depositfiles.com/en/files/' + file_id
65 # Retrieve file webpage with 'Free download' button pressed
66 free_download_indication = { 'gateway_result' : '1' }
67 request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
# (elided `try:` here in the listing — line 68 missing)
69 self.report_download_webpage(file_id)
70 webpage = compat_urllib_request.urlopen(request).read()
71 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
72 raise ExtractorError(u'Unable to retrieve file webpage: %s' % compat_str(err))
74 # Search for the real file URL
75 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
76 if (mobj is None) or (mobj.group(1) is None):
77 # Try to figure out reason of the error.
78 mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
79 if (mobj is not None) and (mobj.group(1) is not None):
80 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
81 raise ExtractorError(u'%s' % restriction_message)
83 raise ExtractorError(u'Unable to extract download URL from: %s' % url)
85 file_url = mobj.group(1)
86 file_extension = os.path.splitext(file_url)[1][1:]
88 # Search for file title
89 file_title = self._search_regex(r'<b title="(.*?)">', webpage, u'title')
# Result dict fields — the surrounding `return [{ ... }]` is elided from this listing.
92 'id': file_id.decode('utf-8'),
93 'url': file_url.decode('utf-8'),
97 'ext': file_extension.decode('utf-8'),
# FacebookIE — extracts Facebook videos. Optionally logs in first
# (_real_initialize) using --username/--password or .netrc credentials, then
# (_real_extract) parses an inline JSON blob between two known SWF-setup
# JavaScript fragments to find hd_src/sd_src stream URLs.
# NOTE(review): listing is elided (missing `try:` lines, `if mobj is None:`
# guards, `return` wrappers) — confirm against the full source before editing.
101 class FacebookIE(InfoExtractor):
102 """Information Extractor for Facebook"""
104 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
105 _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
106 _NETRC_MACHINE = 'facebook'
107 IE_NAME = u'facebook'
109 def report_login(self):
110 """Report attempt to log in."""
111 self.to_screen(u'Logging in')
113 def _real_initialize(self):
# No downloader attached -> nothing to log in with (elided body follows).
114 if self._downloader is None:
119 downloader_params = self._downloader.params
121 # Attempt to use provided username and password or .netrc data
122 if downloader_params.get('username', None) is not None:
123 useremail = downloader_params['username']
124 password = downloader_params['password']
125 elif downloader_params.get('usenetrc', False):
127 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
132 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
133 except (IOError, netrc.NetrcParseError) as err:
134 self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
137 if useremail is None:
146 request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
# (elided `try:` before the urlopen below)
149 login_results = compat_urllib_request.urlopen(request).read()
# A login <form> still present in the response means authentication failed.
150 if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
151 self._downloader.report_warning(u'unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
153 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
154 self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
157 def _real_extract(self, url):
158 mobj = re.match(self._VALID_URL, url)
160 raise ExtractorError(u'Invalid URL: %s' % url)
161 video_id = mobj.group('ID')
163 url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
164 webpage = self._download_webpage(url, video_id)
# The video parameters sit between these two literal JS fragments in the page.
166 BEFORE = '{swf.addParam(param[0], param[1]);});\n'
167 AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
168 m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
170 raise ExtractorError(u'Cannot parse data')
171 data = dict(json.loads(m.group(1)))
172 params_raw = compat_urllib_parse.unquote(data['params'])
173 params = json.loads(params_raw)
174 video_data = params['video_data'][0]
# Prefer the HD stream; fall back to SD (fallback guard elided in listing).
175 video_url = video_data.get('hd_src')
177 video_url = video_data['sd_src']
179 raise ExtractorError(u'Cannot find video URL')
180 video_duration = int(video_data['video_duration'])
181 thumbnail = video_data['thumbnail_src']
183 video_title = self._html_search_regex('<h2 class="uiHeaderTitle">([^<]+)</h2>',
# Result dict fields — surrounding `return` wrapper elided from this listing.
188 'title': video_title,
191 'duration': video_duration,
192 'thumbnail': thumbnail,
# EscapistIE — extracts videos from escapistmagazine.com by reading the
# og:video player URL from page <meta> tags, pulling its `config=` query
# parameter, and downloading/parsing that JS-style configuration as JSON.
# NOTE(review): listing is elided (missing `if mobj is None:` guard, `try:`,
# and the final `return` wrapper) — confirm against the full source.
202 class EscapistIE(InfoExtractor):
203 """Information extractor for The Escapist """
205 _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
206 IE_NAME = u'escapist'
208 def _real_extract(self, url):
209 mobj = re.match(self._VALID_URL, url)
211 raise ExtractorError(u'Invalid URL: %s' % url)
212 showName = mobj.group('showname')
213 videoId = mobj.group('episode')
215 self.report_extraction(videoId)
216 webpage = self._download_webpage(url, videoId)
218 videoDesc = self._html_search_regex('<meta name="description" content="([^"]*)"',
219 webpage, u'description', fatal=False)
221 imgUrl = self._html_search_regex('<meta property="og:image" content="([^"]*)"',
222 webpage, u'thumbnail', fatal=False)
224 playerUrl = self._html_search_regex('<meta property="og:video" content="([^"]*)"',
225 webpage, u'player url')
# NOTE(review): the search label below says u'player url' but this extracts
# the title — looks like a copy/paste slip in the original; verify upstream.
227 title = self._html_search_regex('<meta name="title" content="([^"]*)"',
228 webpage, u'player url').split(' : ')[-1]
230 configUrl = self._search_regex('config=(.*)$', playerUrl, u'config url')
231 configUrl = compat_urllib_parse.unquote(configUrl)
233 configJSON = self._download_webpage(configUrl, videoId,
234 u'Downloading configuration',
235 u'unable to download configuration')
237 # Technically, it's JavaScript, not JSON
238 configJSON = configJSON.replace("'", '"')
# (elided `try:` before the json.loads below)
241 config = json.loads(configJSON)
242 except (ValueError,) as err:
243 raise ExtractorError(u'Invalid JSON in configuration file: ' + compat_str(err))
245 playlist = config['playlist']
246 videoUrl = playlist[1]['url']
# Result dict fields — surrounding `return` wrapper elided from this listing.
251 'uploader': showName,
256 'description': videoDesc,
257 'player_url': playerUrl,
# CollegeHumorIE — extracts collegehumor.com videos via two XML fetches:
# first the moogaloop metadata XML (title/description/thumbnail/manifest URL),
# then the Adobe f4m manifest, from which it rebuilds the HDS segment URL.
# NOTE(review): listing is elided (missing `try:` lines and guards; `info` is
# used at line 295 without a visible initialization) — confirm upstream.
262 class CollegeHumorIE(InfoExtractor):
263 """Information extractor for collegehumor.com"""
266 _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
267 IE_NAME = u'collegehumor'
269 def report_manifest(self, video_id):
270 """Report information extraction."""
271 self.to_screen(u'%s: Downloading XML manifest' % video_id)
273 def _real_extract(self, url):
274 mobj = re.match(self._VALID_URL, url)
276 raise ExtractorError(u'Invalid URL: %s' % url)
277 video_id = mobj.group('videoid')
285 self.report_extraction(video_id)
286 xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
# (elided `try:` before the urlopen below)
288 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
289 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
290 raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
292 mdoc = xml.etree.ElementTree.fromstring(metaXml)
# (elided `try:` — IndexError from the findall()[0] lookups is what line 300 reports)
294 videoNode = mdoc.findall('./video')[0]
295 info['description'] = videoNode.findall('./description')[0].text
296 info['title'] = videoNode.findall('./caption')[0].text
297 info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
298 manifest_url = videoNode.findall('./file')[0].text
300 raise ExtractorError(u'Invalid metadata XML file')
# hdcore parameter is required for the Adobe HDS manifest endpoint to respond.
302 manifest_url += '?hdcore=2.10.3'
303 self.report_manifest(video_id)
305 manifestXml = compat_urllib_request.urlopen(manifest_url).read()
306 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
307 raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
309 adoc = xml.etree.ElementTree.fromstring(manifestXml)
311 media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
312 node_id = media_node.attrib['url']
313 video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
314 except IndexError as err:
315 raise ExtractorError(u'Invalid manifest file')
317 url_pr = compat_urllib_parse_urlparse(manifest_url)
# Rebuild the first HDS fragment URL from the manifest host, media id and node id.
318 url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'
# XVideosIE — extracts xvideos.com videos by scraping the flv_url query
# parameter, the <title> tag, and the thumbnail URL from the video page.
# NOTE(review): listing is elided (missing guard after re.match and the
# final `return` wrapper) — confirm against the full source.
325 class XVideosIE(InfoExtractor):
326 """Information extractor for xvideos.com"""
328 _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
331 def _real_extract(self, url):
332 mobj = re.match(self._VALID_URL, url)
334 raise ExtractorError(u'Invalid URL: %s' % url)
335 video_id = mobj.group(1)
337 webpage = self._download_webpage(url, video_id)
339 self.report_extraction(video_id)
# The direct stream URL is URL-encoded inside a flv_url= page parameter.
342 video_url = compat_urllib_parse.unquote(self._search_regex(r'flv_url=(.+?)&',
343 webpage, u'video URL'))
346 video_title = self._html_search_regex(r'<title>(.*?)\s+-\s+XVID',
349 # Extract video thumbnail
350 video_thumbnail = self._search_regex(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)',
351 webpage, u'thumbnail', fatal=False)
# Result dict fields — surrounding `return` wrapper elided from this listing.
358 'title': video_title,
360 'thumbnail': video_thumbnail,
# SoundcloudIE — extracts a single SoundCloud track: resolves the page URL to
# a track id via the /resolve.json API, then fetches the stream definitions
# and picks the 128kbps MP3 HTTP stream.
# NOTE(review): listing is elided (docstring close, `return` wrapper) and the
# client_id is hard-coded — confirm against the full source.
367 class SoundcloudIE(InfoExtractor):
368 """Information extractor for soundcloud.com
369 To access the media, the uid of the song and a stream token
370 must be extracted from the page source and the script must make
371 a request to media.soundcloud.com/crossdomain.xml. Then
372 the media can be grabbed by requesting from an url composed
373 of the stream token and uid
376 _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
377 IE_NAME = u'soundcloud'
379 def report_resolve(self, video_id):
380 """Report information extraction."""
381 self.to_screen(u'%s: Resolving id' % video_id)
383 def _real_extract(self, url):
384 mobj = re.match(self._VALID_URL, url)
386 raise ExtractorError(u'Invalid URL: %s' % url)
388 # extract uploader (which is in the url)
389 uploader = mobj.group(1)
390 # extract simple title (uploader + slug of song title)
391 slug_title = mobj.group(2)
392 simple_title = uploader + u'-' + slug_title
393 full_title = '%s/%s' % (uploader, slug_title)
395 self.report_resolve(full_title)
397 url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
398 resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
399 info_json = self._download_webpage(resolv_url, full_title, u'Downloading info JSON')
401 info = json.loads(info_json)
402 video_id = info['id']
403 self.report_extraction(full_title)
405 streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
406 stream_json = self._download_webpage(streams_url, full_title,
407 u'Downloading stream definitions',
408 u'unable to download stream definitions')
410 streams = json.loads(stream_json)
411 mediaURL = streams['http_mp3_128_url']
412 upload_date = unified_strdate(info['created_at'])
# Result dict fields — surrounding `return` wrapper elided from this listing.
417 'uploader': info['user']['username'],
418 'upload_date': upload_date,
419 'title': info['title'],
421 'description': info['description'],
# SoundcloudSetIE — extracts SoundCloud "sets" (playlists): resolves the set
# URL via /resolve.json, reports any API errors, then fetches the stream
# definitions for each track in the set (same flow as SoundcloudIE per track).
# NOTE(review): listing is elided (docstring close, error-branch guard, result
# accumulation and `return`) — confirm against the full source.
424 class SoundcloudSetIE(InfoExtractor):
425 """Information extractor for soundcloud.com sets
426 To access the media, the uid of the song and a stream token
427 must be extracted from the page source and the script must make
428 a request to media.soundcloud.com/crossdomain.xml. Then
429 the media can be grabbed by requesting from an url composed
430 of the stream token and uid
433 _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/sets/([\w\d-]+)'
434 IE_NAME = u'soundcloud:set'
436 def report_resolve(self, video_id):
437 """Report information extraction."""
438 self.to_screen(u'%s: Resolving id' % video_id)
440 def _real_extract(self, url):
441 mobj = re.match(self._VALID_URL, url)
443 raise ExtractorError(u'Invalid URL: %s' % url)
445 # extract uploader (which is in the url)
446 uploader = mobj.group(1)
447 # extract simple title (uploader + slug of song title)
448 slug_title = mobj.group(2)
449 simple_title = uploader + u'-' + slug_title
450 full_title = '%s/sets/%s' % (uploader, slug_title)
452 self.report_resolve(full_title)
454 url = 'http://soundcloud.com/%s/sets/%s' % (uploader, slug_title)
455 resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
456 info_json = self._download_webpage(resolv_url, full_title)
459 info = json.loads(info_json)
# (elided guard — this error loop presumably runs only when 'errors' is present)
461 for err in info['errors']:
462 self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err['error_message']))
465 self.report_extraction(full_title)
466 for track in info['tracks']:
467 video_id = track['id']
469 streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
470 stream_json = self._download_webpage(streams_url, video_id, u'Downloading track info JSON')
472 self.report_extraction(video_id)
473 streams = json.loads(stream_json)
474 mediaURL = streams['http_mp3_128_url']
# Per-track result dict fields — list accumulation and `return` elided.
479 'uploader': track['user']['username'],
480 'upload_date': unified_strdate(track['created_at']),
481 'title': track['title'],
483 'description': track['description'],
# InfoQIE — extracts infoq.com presentations: the real media id is base64
# encoded in a `jsclassref` JS variable on the page; the stream is served
# over RTMPE from video.infoq.com.
# NOTE(review): listing is elided (guards and `return` wrapper) — confirm
# against the full source.
488 class InfoQIE(InfoExtractor):
489 """Information extractor for infoq.com"""
490 _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
492 def _real_extract(self, url):
493 mobj = re.match(self._VALID_URL, url)
495 raise ExtractorError(u'Invalid URL: %s' % url)
497 webpage = self._download_webpage(url, video_id=url)
498 self.report_extraction(url)
# Extract the base64-encoded video id from the page's jsclassref variable.
501 mobj = re.search(r"jsclassref ?= ?'([^']*)'", webpage)
503 raise ExtractorError(u'Unable to extract video url')
504 real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
505 video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id
508 video_title = self._search_regex(r'contentTitle = "(.*?)";',
511 # Extract description
512 video_description = self._html_search_regex(r'<meta name="description" content="(.*)"(?:\s*/)?>',
513 webpage, u'description', fatal=False)
515 video_filename = video_url.split('/')[-1]
516 video_id, extension = video_filename.split('.')
# Result dict fields — surrounding `return` wrapper elided from this listing.
523 'title': video_title,
524 'ext': extension, # Extension is always(?) mp4, but seems to be flv
526 'description': video_description,
# MixcloudIE — extracts mixcloud.com cloudcasts via the /api/1/cloudcast JSON
# endpoint; get_urls/check_urls pick a working URL for the requested format
# and bitrate. Marked _WORKING = False (superseded by a newer API).
# NOTE(review): listing is elided throughout (missing `try:` lines, `return`
# statements in get_urls/check_urls, format-selection branches) — confirm
# against the full source. Note the .decode('utf-8') calls imply Python 2
# byte strings; this block predates the py3 unification.
531 class MixcloudIE(InfoExtractor):
532 """Information extractor for www.mixcloud.com"""
534 _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
535 _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
536 IE_NAME = u'mixcloud'
538 def report_download_json(self, file_id):
539 """Report JSON download."""
540 self.to_screen(u'Downloading json')
542 def get_urls(self, jsonData, fmt, bitrate='best'):
543 """Get urls from 'audio_formats' section in json"""
# (elided `try:` — the TypeError branch below is its handler)
546 bitrate_list = jsonData[fmt]
547 if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
548 bitrate = max(bitrate_list) # select highest
550 url_list = jsonData[fmt][bitrate]
551 except TypeError: # we have no bitrate info.
552 url_list = jsonData[fmt]
# (elided `return url_list`)
555 def check_urls(self, url_list):
556 """Returns 1st active url from list"""
# (elided loop header and success `return` around the urlopen probe)
559 compat_urllib_request.urlopen(url)
561 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
566 def _print_formats(self, formats):
567 print('Available formats:')
568 for fmt in formats.keys():
569 for b in formats[fmt]:
# (elided `try:` — the TypeError branch below is its handler)
571 ext = formats[fmt][b][0]
572 print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
573 except TypeError: # we have no bitrate info
574 ext = formats[fmt][0]
575 print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
578 def _real_extract(self, url):
579 mobj = re.match(self._VALID_URL, url)
581 raise ExtractorError(u'Invalid URL: %s' % url)
582 # extract uploader & filename from url
583 uploader = mobj.group(1).decode('utf-8')
584 file_id = uploader + "-" + mobj.group(2).decode('utf-8')
586 # construct API request
587 file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
588 # retrieve .json file with links to files
589 request = compat_urllib_request.Request(file_url)
# (elided `try:` before the urlopen below)
591 self.report_download_json(file_url)
592 jsonData = compat_urllib_request.urlopen(request).read()
593 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
594 raise ExtractorError(u'Unable to retrieve file: %s' % compat_str(err))
597 json_data = json.loads(jsonData)
598 player_url = json_data['player_swf_url']
599 formats = dict(json_data['audio_formats'])
601 req_format = self._downloader.params.get('format', None)
604 if self._downloader.params.get('listformats', None):
605 self._print_formats(formats)
# Format selection: try each available format until one yields a live URL,
# otherwise honour the explicitly requested format (branch lines elided).
608 if req_format is None or req_format == 'best':
609 for format_param in formats.keys():
610 url_list = self.get_urls(formats, format_param)
612 file_url = self.check_urls(url_list)
613 if file_url is not None:
616 if req_format not in formats:
617 raise ExtractorError(u'Format is not available')
619 url_list = self.get_urls(formats, req_format)
620 file_url = self.check_urls(url_list)
621 format_param = req_format
# Result dict fields — surrounding `return` wrapper elided from this listing.
624 'id': file_id.decode('utf-8'),
625 'url': file_url.decode('utf-8'),
626 'uploader': uploader.decode('utf-8'),
628 'title': json_data['name'],
629 'ext': file_url.split('.')[-1].decode('utf-8'),
630 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
631 'thumbnail': json_data['thumbnail_url'],
632 'description': json_data['description'],
633 'player_url': player_url.decode('utf-8'),
# StanfordOpenClassroomIE — handles three URL shapes: a specific video
# (course+video -> per-video XML metadata), a course page (scrape VideoPage
# links and recurse via self.extract), and the site root (scrape CoursePage
# links and recurse). Recursive entries are returned as 'reference' items.
# NOTE(review): listing is elided (missing `try:` lines, dict initializations
# for `info`, list-building loops, `return results`) — confirm upstream.
636 class StanfordOpenClassroomIE(InfoExtractor):
637 """Information extractor for Stanford's Open ClassRoom"""
639 _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
640 IE_NAME = u'stanfordoc'
642 def _real_extract(self, url):
643 mobj = re.match(self._VALID_URL, url)
645 raise ExtractorError(u'Invalid URL: %s' % url)
647 if mobj.group('course') and mobj.group('video'): # A specific video
648 course = mobj.group('course')
649 video = mobj.group('video')
# (elided `info = {` dict opener around the field below)
651 'id': course + '_' + video,
656 self.report_extraction(info['id'])
657 baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
658 xmlUrl = baseUrl + video + '.xml'
# (elided `try:` before the urlopen below)
660 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
661 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
662 raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
663 mdoc = xml.etree.ElementTree.fromstring(metaXml)
# (elided `try:` — findall()[0] raising IndexError is what line 668 reports)
665 info['title'] = mdoc.findall('./title')[0].text
666 info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
668 raise ExtractorError(u'Invalid metadata XML file')
669 info['ext'] = info['url'].rpartition('.')[2]
671 elif mobj.group('course'): # A course page
672 course = mobj.group('course')
680 coursepage = self._download_webpage(url, info['id'],
681 note='Downloading course info page',
682 errnote='Unable to download course info page')
684 info['title'] = self._html_search_regex('<h1>([^<]+)</h1>', coursepage, 'title', default=info['id'])
686 info['description'] = self._html_search_regex('<description>([^<]+)</description>',
687 coursepage, u'description', fatal=False)
689 links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
# (elided list-comprehension wrapper building info['list'] entries)
693 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
697 for entry in info['list']:
698 assert entry['type'] == 'reference'
699 results += self.extract(entry['url'])
# else: the whole site root page (elided `info = {` opener)
703 'id': 'Stanford OpenClassroom',
709 self.report_download_webpage(info['id'])
710 rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
# (elided `try:` before the urlopen below)
712 rootpage = compat_urllib_request.urlopen(rootURL).read()
713 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
714 raise ExtractorError(u'Unable to download course info page: ' + compat_str(err))
716 info['title'] = info['id']
718 links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
722 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
727 for entry in info['list']:
728 assert entry['type'] == 'reference'
729 results += self.extract(entry['url'])
# MTVIE — extracts mtv.com videos: reads song/artist/uri metadata from page
# <meta> tags, queries the mediaGen XML service, and picks the last (highest
# quality) <rendition> entry.
# NOTE(review): listing is elided (missing guards, `try:` lines, `performer`
# assignment, and the `return` wrapper) — confirm against the full source.
732 class MTVIE(InfoExtractor):
733 """Information extractor for MTV.com"""
735 _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
738 def _real_extract(self, url):
739 mobj = re.match(self._VALID_URL, url)
741 raise ExtractorError(u'Invalid URL: %s' % url)
# Normalize scheme-less URLs before downloading.
742 if not mobj.group('proto'):
743 url = 'http://' + url
744 video_id = mobj.group('videoid')
746 webpage = self._download_webpage(url, video_id)
748 song_name = self._html_search_regex(r'<meta name="mtv_vt" content="([^"]+)"/>',
749 webpage, u'song name', fatal=False)
751 video_title = self._html_search_regex(r'<meta name="mtv_an" content="([^"]+)"/>',
754 mtvn_uri = self._html_search_regex(r'<meta name="mtvn_uri" content="([^"]+)"/>',
755 webpage, u'mtvn_uri', fatal=False)
757 content_id = self._search_regex(r'MTVN.Player.defaultPlaylistId = ([0-9]+);',
758 webpage, u'content id', fatal=False)
760 videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
761 self.report_extraction(video_id)
762 request = compat_urllib_request.Request(videogen_url)
# (elided `try:` before the urlopen below)
764 metadataXml = compat_urllib_request.urlopen(request).read()
765 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
766 raise ExtractorError(u'Unable to download video metadata: %s' % compat_str(err))
768 mdoc = xml.etree.ElementTree.fromstring(metadataXml)
769 renditions = mdoc.findall('.//rendition')
771 # For now, always pick the highest quality.
772 rendition = renditions[-1]
# (elided `try:` — the rendition attribute lookups below are what line 779 reports)
775 _,_,ext = rendition.attrib['type'].partition('/')
776 format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
777 video_url = rendition.find('./src').text
779 raise ExtractorError('Invalid rendition field.')
# Result dict fields — `performer` is assigned in elided lines; `return` elided.
784 'uploader': performer,
786 'title': video_title,
# YoukuIE — extracts v.youku.com videos. Youku obfuscates the real file id:
# _get_file_ID_mix_string builds a seeded pseudo-random permutation of a
# character alphabet, and _get_file_id maps the '*'-separated index list from
# the API back through that permutation. Each video is served in numbered
# segments whose ids patch bytes 8-9 of the file id.
# NOTE(review): listing is elided (method `def _gen_sid` header before line
# 798, `mixed = []` init, loops, format-selection branches, `return`) —
# confirm against the full source.
794 class YoukuIE(InfoExtractor):
795 _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'
# (elided `def _gen_sid(self):` header — the lines below build a session id
# from the current time plus two random components)
798 nowTime = int(time.time() * 1000)
799 random1 = random.randint(1000,1998)
800 random2 = random.randint(1000,9999)
802 return "%d%d%d" %(nowTime,random1,random2)
804 def _get_file_ID_mix_string(self, seed):
# (elided `mixed = []` initialization)
806 source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
808 for i in range(len(source)):
# Linear congruential step; picks and removes one alphabet character per round.
809 seed = (seed * 211 + 30031 ) % 65536
810 index = math.floor(seed / 65536 * len(source) )
811 mixed.append(source[int(index)])
812 source.remove(source[int(index)])
813 #return ''.join(mixed)
816 def _get_file_id(self, fileId, seed):
817 mixed = self._get_file_ID_mix_string(seed)
818 ids = fileId.split('*')
# (elided `realId = []` init and loop header over the index list)
822 realId.append(mixed[int(ch)])
823 return ''.join(realId)
825 def _real_extract(self, url):
826 mobj = re.match(self._VALID_URL, url)
828 raise ExtractorError(u'Invalid URL: %s' % url)
829 video_id = mobj.group('ID')
831 info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id
833 jsondata = self._download_webpage(info_url, video_id)
835 self.report_extraction(video_id)
# (elided `try:` — the except at line 861 covers this parsing section)
837 config = json.loads(jsondata)
839 video_title = config['data'][0]['title']
840 seed = config['data'][0]['seed']
842 format = self._downloader.params.get('format', None)
843 supported_format = list(config['data'][0]['streamfileids'].keys())
# Format selection branches (hd2/best/worst mapping lines elided).
845 if format is None or format == 'best':
846 if 'hd2' in supported_format:
851 elif format == 'worst':
859 fileid = config['data'][0]['streamfileids'][format]
860 keys = [s['k'] for s in config['data'][0]['segs'][format]]
861 except (UnicodeDecodeError, ValueError, KeyError):
862 raise ExtractorError(u'Unable to extract info section')
865 sid = self._gen_sid()
866 fileid = self._get_file_id(fileid, seed)
868 #column 8,9 of fileid represent the segment number
869 #fileid[7:9] should be changed
870 for index, key in enumerate(keys):
872 temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
873 download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)
# Per-segment info dict (opener and some fields elided from this listing).
876 'id': '%s_part%02d' % (video_id, index),
880 'title': video_title,
883 files_info.append(info)
# XNXXIE — extracts video.xnxx.com videos using three class-level regexes for
# the flv URL, page title, and big thumbnail URL.
# NOTE(review): listing is elided (guard after re.match and the `return`
# wrapper) — confirm against the full source.
888 class XNXXIE(InfoExtractor):
889 """Information extractor for xnxx.com"""
891 _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
893 VIDEO_URL_RE = r'flv_url=(.*?)&'
894 VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
895 VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&'
897 def _real_extract(self, url):
898 mobj = re.match(self._VALID_URL, url)
900 raise ExtractorError(u'Invalid URL: %s' % url)
901 video_id = mobj.group(1)
903 # Get webpage content
904 webpage = self._download_webpage(url, video_id)
906 video_url = self._search_regex(self.VIDEO_URL_RE,
907 webpage, u'video URL')
# The flv_url parameter is URL-encoded in the page.
908 video_url = compat_urllib_parse.unquote(video_url)
910 video_title = self._html_search_regex(self.VIDEO_TITLE_RE,
913 video_thumbnail = self._search_regex(self.VIDEO_THUMB_RE,
914 webpage, u'thumbnail', fatal=False)
# Result dict fields — surrounding `return` wrapper elided from this listing.
921 'title': video_title,
923 'thumbnail': video_thumbnail,
# GooglePlusIE — extracts videos embedded in plus.google.com posts: scrapes
# the post page for metadata, follows the photos page the image box links to,
# then collects all (resolution, url) pairs and takes the highest resolution.
# NOTE(review): listing is elided (guards, the sort of `mobj` into `links`,
# and the `return` wrapper) — confirm against the full source.
928 class GooglePlusIE(InfoExtractor):
929 """Information extractor for plus.google.com."""
931 _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
932 IE_NAME = u'plus.google'
934 def _real_extract(self, url):
935 # Extract id from URL
936 mobj = re.match(self._VALID_URL, url)
938 raise ExtractorError(u'Invalid URL: %s' % url)
940 post_url = mobj.group(0)
941 video_id = mobj.group(1)
943 video_extension = 'flv'
945 # Step 1, Retrieve post webpage to extract further information
946 webpage = self._download_webpage(post_url, video_id, u'Downloading entry webpage')
948 self.report_extraction(video_id)
950 # Extract update date
951 upload_date = self._html_search_regex('title="Timestamp">(.*?)</a>',
952 webpage, u'upload date', fatal=False)
# (elided guard — the strptime below presumably runs only when a date matched)
954 # Convert timestring to a format suitable for filename
955 upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
956 upload_date = upload_date.strftime('%Y%m%d')
959 uploader = self._html_search_regex(r'rel\="author".*?>(.*?)</a>',
960 webpage, u'uploader', fatal=False)
963 # Get the first line for title
964 video_title = self._html_search_regex(r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]',
965 webpage, 'title', default=u'NA')
967 # Step 2, Stimulate clicking the image box to launch video
968 video_page = self._search_regex('"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]',
969 webpage, u'video page URL')
970 webpage = self._download_webpage(video_page, video_id, u'Downloading video page')
972 # Extract video links on video page
973 """Extract video links of all sizes"""
974 pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
975 mobj = re.findall(pattern, webpage)
977 raise ExtractorError(u'Unable to extract video links')
# (elided: sorting of the findall results into `links` by resolution)
982 # Choose the lowest of the sort, i.e. highest resolution
983 video_url = links[-1]
984 # Only get the url. The resolution part in the tuple has no use anymore
985 video_url = video_url[-1]
986 # Treat escaped \u0026 style hex
# (elided `try:` — Python 2 path; the except below handles Python 3 strings)
988 video_url = video_url.decode("unicode_escape")
989 except AttributeError: # Python 3
990 video_url = bytes(video_url, 'ascii').decode('unicode-escape')
# Result dict fields — surrounding `return` wrapper elided from this listing.
996 'uploader': uploader,
997 'upload_date': upload_date,
998 'title': video_title,
999 'ext': video_extension,
# NBAIE — extracts nba.com videos by constructing the CDN MP4 URL directly
# from the URL path and scraping title/description from page <meta> tags.
# NOTE(review): listing is elided (guard after re.match and the `return`
# wrapper) — confirm against the full source.
1002 class NBAIE(InfoExtractor):
1003 _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*?)(?:/index\.html)?(?:\?.*)?$'
1006 def _real_extract(self, url):
1007 mobj = re.match(self._VALID_URL, url)
1009 raise ExtractorError(u'Invalid URL: %s' % url)
1011 video_id = mobj.group(1)
1013 webpage = self._download_webpage(url, video_id)
# The stream URL is deterministic: the page path maps straight onto the CDN.
1015 video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'
1017 shortened_video_id = video_id.rpartition('/')[2]
1018 title = self._html_search_regex(r'<meta property="og:title" content="(.*?)"',
1019 webpage, 'title', default=shortened_video_id).replace('NBA.com: ', '')
1021 # It isn't there in the HTML it returns to us
1022 # uploader_date = self._html_search_regex(r'<b>Date:</b> (.*?)</div>', webpage, 'upload_date', fatal=False)
1024 description = self._html_search_regex(r'<meta name="description" (?:content|value)="(.*?)" />', webpage, 'description', fatal=False)
# Result dict fields — surrounding `return` wrapper elided from this listing.
1027 'id': shortened_video_id,
1031 # 'uploader_date': uploader_date,
1032 'description': description,
1036 class JustinTVIE(InfoExtractor):
1037 """Information extractor for justin.tv and twitch.tv"""
1038 # TODO: One broadcast may be split into multiple videos. The key
1039 # 'broadcast_id' is the same for all parts, and 'broadcast_part'
1040 # starts at 1 and increases. Can we treat all parts as one video?
1042 _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
1044 (?P<channelid>[^/]+)|
1045 (?:(?:[^/]+)/b/(?P<videoid>[^/]+))|
1046 (?:(?:[^/]+)/c/(?P<chapterid>[^/]+))
1050 _JUSTIN_PAGE_LIMIT = 100
1051 IE_NAME = u'justin.tv'
1053 def report_download_page(self, channel, offset):
1054 """Report attempt to download a single page of videos."""
1055 self.to_screen(u'%s: Downloading video information from %d to %d' %
1056 (channel, offset, offset + self._JUSTIN_PAGE_LIMIT))
1058 # Return count of items, list of *valid* items
1059 def _parse_page(self, url, video_id):
1060 webpage = self._download_webpage(url, video_id,
1061 u'Downloading video info JSON',
1062 u'unable to download video info JSON')
1064 response = json.loads(webpage)
1065 if type(response) != list:
1066 error_text = response.get('error', 'unknown error')
1067 raise ExtractorError(u'Justin.tv API: %s' % error_text)
1069 for clip in response:
1070 video_url = clip['video_file_url']
1072 video_extension = os.path.splitext(video_url)[1][1:]
1073 video_date = re.sub('-', '', clip['start_time'][:10])
1074 video_uploader_id = clip.get('user_id', clip.get('channel_id'))
1075 video_id = clip['id']
1076 video_title = clip.get('title', video_id)
1080 'title': video_title,
1081 'uploader': clip.get('channel_name', video_uploader_id),
1082 'uploader_id': video_uploader_id,
1083 'upload_date': video_date,
1084 'ext': video_extension,
1086 return (len(response), info)
    def _real_extract(self, url):
        """Dispatch on URL shape: channel archive, chapter ('/c/'), or single video ('/b/')."""
        mobj = re.match(self._VALID_URL, url)
            raise ExtractorError(u'invalid URL: %s' % url)

        api_base = 'http://api.justin.tv'

        if mobj.group('channelid'):
            video_id = mobj.group('channelid')
            api = api_base + '/channel/archives/%s.json' % video_id
        elif mobj.group('chapterid'):
            chapter_id = mobj.group('chapterid')

            webpage = self._download_webpage(url, chapter_id)
            # The chapter page embeds the id of its parent broadcast archive.
            m = re.search(r'PP\.archive_id = "([0-9]+)";', webpage)
                raise ExtractorError(u'Cannot find archive of a chapter')
            archive_id = m.group(1)

            api = api_base + '/broadcast/by_chapter/%s.xml' % chapter_id
            chapter_info_xml = self._download_webpage(api, chapter_id,
                                                      note=u'Downloading chapter information',
                                                      errnote=u'Chapter information download failed')
            doc = xml.etree.ElementTree.fromstring(chapter_info_xml)
            # Locate the <archive> element matching the parent broadcast id.
            for a in doc.findall('.//archive'):
                if archive_id == a.find('./id').text:
                raise ExtractorError(u'Could not find chapter in chapter information')

            video_url = a.find('./video_file_url').text
            video_ext = video_url.rpartition('.')[2] or u'flv'

            # Richer metadata (title, thumbnail, channel) comes from the Twitch API.
            chapter_api_url = u'https://api.twitch.tv/kraken/videos/c' + chapter_id
            chapter_info_json = self._download_webpage(chapter_api_url, u'c' + chapter_id,
                                                       note='Downloading chapter metadata',
                                                       errnote='Download of chapter metadata failed')
            chapter_info = json.loads(chapter_info_json)

            bracket_start = int(doc.find('.//bracket_start').text)
            bracket_end = int(doc.find('.//bracket_end').text)

            # TODO determine start (and probably fix up file)
            #  youtube-dl -v http://www.twitch.tv/firmbelief/c/1757457
            #video_url += u'?start=' + TODO:start_timestamp
            # bracket_start is 13290, but we want 51670615
            self._downloader.report_warning(u'Chapter detected, but we can just download the whole file. '
                                            u'Chapter starts at %s and ends at %s' % (formatSeconds(bracket_start), formatSeconds(bracket_end)))

                'id': u'c' + chapter_id,
                'title': chapter_info['title'],
                'thumbnail': chapter_info['preview'],
                'description': chapter_info['description'],
                'uploader': chapter_info['channel']['display_name'],
                'uploader_id': chapter_info['channel']['name'],
            video_id = mobj.group('videoid')
            api = api_base + '/broadcast/by_archive/%s.json' % video_id

        self.report_extraction(video_id)

        # Page through the API until a short (last) page is returned.
        limit = self._JUSTIN_PAGE_LIMIT
            self.report_download_page(video_id, offset)
            page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
            page_count, page_info = self._parse_page(page_url, video_id)
            info.extend(page_info)
            if not paged or page_count != limit:
class FunnyOrDieIE(InfoExtractor):
    """Information extractor for funnyordie.com videos."""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
            raise ExtractorError(u'invalid URL: %s' % url)

        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)

        # The stream URL is on the second <source> of the <video> element.
        video_url = self._html_search_regex(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"',
            webpage, u'video URL', flags=re.DOTALL)

        # Prefer the player headline, then fall back to the page <title>.
        title = self._html_search_regex((r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>",
            r'<title>(?P<title>[^<]+?)</title>'), webpage, 'title', flags=re.DOTALL)

        video_description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
            webpage, u'description', fatal=False, flags=re.DOTALL)

            'description': video_description,
class SteamIE(InfoExtractor):
    """Information extractor for Steam store video pages."""
    # Verbose regex — always matched with re.VERBOSE below.
    _VALID_URL = r"""http://store\.steampowered\.com/
                (?P<urltype>video|app)/ #If the page is only for videos or for a game
                (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
    _VIDEO_PAGE_TEMPLATE = 'http://store.steampowered.com/video/%s/'
    # Pre-filled birth date to get past the Steam age gate.
    _AGECHECK_TEMPLATE = 'http://store.steampowered.com/agecheck/video/%s/?snr=1_agecheck_agecheck__age-gate&ageDay=1&ageMonth=January&ageYear=1970'

    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url, re.VERBOSE)
        gameID = m.group('gameID')

        videourl = self._VIDEO_PAGE_TEMPLATE % gameID
        webpage = self._download_webpage(videourl, gameID)

        # Retry through the age-check URL if the gate page was served.
        if re.search('<h2>Please enter your birth date to continue:</h2>', webpage) is not None:
            videourl = self._AGECHECK_TEMPLATE % gameID
            self.report_age_confirmation()
            webpage = self._download_webpage(videourl, gameID)

        self.report_extraction(gameID)
        game_title = self._html_search_regex(r'<h2 class="pageheader">(.*?)</h2>',
            webpage, 'game title')

        urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
        mweb = re.finditer(urlRE, webpage)
        namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
        titles = re.finditer(namesRE, webpage)
        thumbsRE = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'
        thumbs = re.finditer(thumbsRE, webpage)
        # Movie entries, their titles and thumbnails appear in page order,
        # so the three iterators are zipped together.
        for vid,vtitle,thumb in zip(mweb,titles,thumbs):
            video_id = vid.group('videoID')
            title = vtitle.group('videoName')
            video_url = vid.group('videoURL')
            video_thumb = thumb.group('thumbnail')
                raise ExtractorError(u'Cannot find video url for %s' % video_id)
                'title': unescapeHTML(title),
                'thumbnail': video_thumb
        return [self.playlist_result(videos, gameID, game_title)]
class UstreamIE(InfoExtractor):
    """Information extractor for recorded ustream.tv videos."""
    _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
    IE_NAME = u'ustream'

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url)
        video_id = m.group('videoID')

        # Recorded videos are served straight from the CDN by numeric id.
        video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
        webpage = self._download_webpage(url, video_id)

        self.report_extraction(video_id)

        video_title = self._html_search_regex(r'data-title="(?P<title>.+)"',

        uploader = self._html_search_regex(r'data-content-type="channel".*?>(?P<uploader>.*?)</a>',
            webpage, u'uploader', fatal=False, flags=re.DOTALL)

        thumbnail = self._html_search_regex(r'<link rel="image_src" href="(?P<thumb>.*?)"',
            webpage, u'thumbnail', fatal=False)

            'title': video_title,
            'uploader': uploader,
            'thumbnail': thumbnail,
class WorldStarHipHopIE(InfoExtractor):
    """Information extractor for worldstarhiphop.com / worldstarcandy.com."""
    _VALID_URL = r'https?://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
    IE_NAME = u'WorldStarHipHop'

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url)
        video_id = m.group('id')

        webpage_src = self._download_webpage(url, video_id)

        video_url = self._search_regex(r'so\.addVariable\("file","(.*?)"\)',
            webpage_src, u'video URL')

        # File extension is inferred from the media URL (mp4 vs flv).
        if 'mp4' in video_url:

        video_title = self._html_search_regex(r"<title>(.*)</title>",
            webpage_src, u'title')

        # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video.
        thumbnail = self._html_search_regex(r'rel="image_src" href="(.*)" />',
            webpage_src, u'thumbnail', fatal=False)
            _title = r"""candytitles.*>(.*)</span>"""
            mobj = re.search(_title, webpage_src)
            if mobj is not None:
                video_title = mobj.group(1)

            'title' : video_title,
            'thumbnail' : thumbnail,
class RBMARadioIE(InfoExtractor):
    """Information extractor for rbmaradio.com shows."""
    _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url)
        video_id = m.group('videoID')

        webpage = self._download_webpage(url, video_id)

        # Show metadata is assigned to window.gon in an inline script.
        json_data = self._search_regex(r'window\.gon.*?gon\.show=(.+?);$',
            webpage, u'json data', flags=re.MULTILINE)

            data = json.loads(json_data)
        except ValueError as e:
            raise ExtractorError(u'Invalid JSON: ' + str(e))

        # Request the 256 kbit/s stream from the Akamai URL.
        video_url = data['akamai_url'] + '&cbr=256'
        url_parts = compat_urllib_parse_urlparse(video_url)
        video_ext = url_parts.path.rpartition('.')[2]
            'title': data['title'],
            'description': data.get('teaser_text'),
            'location': data.get('country_of_origin'),
            'uploader': data.get('host', {}).get('name'),
            'uploader_id': data.get('host', {}).get('slug'),
            'thumbnail': data.get('image', {}).get('large_url_2x'),
            'duration': data.get('duration'),
class YouPornIE(InfoExtractor):
    """Information extractor for youporn.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'

    def _print_formats(self, formats):
        """Print all available formats"""
        print(u'Available formats:')
        print(u'ext\t\tformat')
        print(u'---------------------------------')
        for format in formats:
            print(u'%s\t\t%s' % (format['ext'], format['format']))

    def _specific(self, req_format, formats):
        # Return the single format entry matching the requested format string.
            if(x["format"]==req_format):

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('videoid')

        # The age_verified cookie bypasses the age gate.
        req = compat_urllib_request.Request(url)
        req.add_header('Cookie', 'age_verified=1')
        webpage = self._download_webpage(req, video_id)

        # Get JSON parameters
        json_params = self._search_regex(r'var currentVideo = new Video\((.*)\);', webpage, u'JSON parameters')
            params = json.loads(json_params)
            raise ExtractorError(u'Invalid JSON')

        self.report_extraction(video_id)
            video_title = params['title']
            upload_date = unified_strdate(params['release_date_f'])
            video_description = params['description']
            video_uploader = params['submitted_by']
            thumbnail = params['thumbnails'][0]['image']
            # NOTE(review): sys.exc_info()[1] is an exception object; '+' with a
            # str raises TypeError on Python 3 — should be wrapped in compat_str().
            raise ExtractorError('Missing JSON parameter: ' + sys.exc_info()[1])

        # Get all of the formats available
        DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
        download_list_html = self._search_regex(DOWNLOAD_LIST_RE,
            webpage, u'download list').strip()

        # Get all of the links from the page
        LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
        links = re.findall(LINK_RE, download_list_html)
        if(len(links) == 0):
            raise ExtractorError(u'ERROR: no known formats available for video')

        self.to_screen(u'Links found: %d' % len(links))

            # A link looks like this:
            # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
            # A path looks like this:
            # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
            video_url = unescapeHTML( link )
            path = compat_urllib_parse_urlparse( video_url ).path
            extension = os.path.splitext( path )[1][1:]
            # Format label is the "<resolution>_<bitrate>" part of the path.
            format = path.split('/')[4].split('_')[:2]
            format = "-".join( format )
            # title = u'%s-%s-%s' % (video_title, size, bitrate)

                'uploader': video_uploader,
                'upload_date': upload_date,
                'title': video_title,
                'thumbnail': thumbnail,
                'description': video_description

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)

        req_format = self._downloader.params.get('format', None)
        self.to_screen(u'Format: %s' % req_format)

        if req_format is None or req_format == 'best':
        elif req_format == 'worst':
            return [formats[-1]]
        elif req_format in ('-1', 'all'):
            format = self._specific( req_format, formats )
                raise ExtractorError(u'Requested format not available')
class PornotubeIE(InfoExtractor):
    """Information extractor for pornotube.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('videoid')
        video_title = mobj.group('title')

        # Get webpage content
        webpage = self._download_webpage(url, video_id)

        # The flv URL is URL-quoted inside the player setup script.
        VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
        video_url = self._search_regex(VIDEO_URL_RE, webpage, u'video url')
        video_url = compat_urllib_parse.unquote(video_url)

        #Get the uploaded date
        VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
        upload_date = self._html_search_regex(VIDEO_UPLOADED_RE, webpage, u'upload date', fatal=False)
        if upload_date: upload_date = unified_strdate(upload_date)

        info = {'id': video_id,
                'upload_date': upload_date,
                'title': video_title,
class YouJizzIE(InfoExtractor):
    """Information extractor for youjizz.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('videoid')

        # Get webpage content
        webpage = self._download_webpage(url, video_id)

        # Get the video title
        video_title = self._html_search_regex(r'<title>(?P<title>.*)</title>',
            webpage, u'title').strip()

        # Get the embed page
        result = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
            raise ExtractorError(u'ERROR: unable to extract embed page')

        embed_page_url = result.group(0).strip()
        video_id = result.group('videoid')

        webpage = self._download_webpage(embed_page_url, video_id)

        # The media URL is passed URL-encoded to the flash player.
        video_url = self._search_regex(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);',
            webpage, u'video URL')

        info = {'id': video_id,
                'title': video_title,
                'player_url': embed_page_url}
class EightTracksIE(InfoExtractor):
    """Information extractor for 8tracks.com mixes (one entry per track)."""
    _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
            raise ExtractorError(u'Invalid URL: %s' % url)
        playlist_id = mobj.group('id')

        webpage = self._download_webpage(url, playlist_id)

        json_like = self._search_regex(r"PAGE.mix = (.*?);\n", webpage, u'trax information', flags=re.DOTALL)
        data = json.loads(json_like)

        # The play API requires a (random) client session id.
        session = str(random.randint(0, 1000000000))
        track_count = data['tracks_count']
        # NOTE(review): mix_id is used below but its assignment is not visible
        # here — presumably mix_id = data['id']; verify.
        first_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
        next_url = first_url
        # Walk the play/next API until the last track is flagged.
        for i in itertools.count():
            api_json = self._download_webpage(next_url, playlist_id,
                note=u'Downloading song information %s/%s' % (str(i+1), track_count),
                errnote=u'Failed to download song information')
            api_data = json.loads(api_json)
            track_data = api_data[u'set']['track']
                'id': track_data['id'],
                'url': track_data['track_file_stream_url'],
                'title': track_data['performer'] + u' - ' + track_data['name'],
                'raw_title': track_data['name'],
                'uploader_id': data['user']['login'],
            if api_data['set']['at_last_track']:
            next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
class KeekIE(InfoExtractor):
    """Information extractor for keek.com clips."""
    _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url)
        video_id = m.group('videoID')

        # Media and thumbnail URLs are derived directly from the video id.
        video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
        thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id
        webpage = self._download_webpage(url, video_id)

        video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',

        uploader = self._html_search_regex(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>',
            webpage, u'uploader', fatal=False)

            'title': video_title,
            'thumbnail': thumbnail,
            'uploader': uploader
class TEDIE(InfoExtractor):
    """Information extractor for ted.com talks and playlists."""
    # Verbose regex — always matched with re.VERBOSE below.
    _VALID_URL=r'''http://www\.ted\.com/
        ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
        ((?P<type_talk>talks)) # We have a simple talk
        (/lang/(.*?))? # The url may contain the language
        /(?P<name>\w+) # Here goes the name and then ".html"

    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        # Dispatch to single-talk or playlist handling by matched group.
        m=re.match(self._VALID_URL, url, re.VERBOSE)
        if m.group('type_talk'):
            return [self._talk_info(url)]
        playlist_id=m.group('playlist_id')
        name=m.group('name')
        self.to_screen(u'Getting info of playlist %s: "%s"' % (playlist_id,name))
        return [self._playlist_videos_info(url,name,playlist_id)]

    def _playlist_videos_info(self,url,name,playlist_id=0):
        '''Returns the videos of the playlist'''
            <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
            ([.\s]*?)data-playlist_item_id="(\d+)"
            ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
        video_name_RE=r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
        webpage=self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
        m_videos=re.finditer(video_RE,webpage,re.VERBOSE)
        m_names=re.finditer(video_name_RE,webpage)

        playlist_title = self._html_search_regex(r'div class="headline">\s*?<h1>\s*?<span>(.*?)</span>',
            webpage, 'playlist title')

        # Each playlist entry is delegated back to this IE as a talk URL.
        playlist_entries = []
        for m_video, m_name in zip(m_videos,m_names):
            video_id=m_video.group('video_id')
            talk_url='http://www.ted.com%s' % m_name.group('talk_url')
            playlist_entries.append(self.url_result(talk_url, 'TED'))
        return self.playlist_result(playlist_entries, playlist_id = playlist_id, playlist_title = playlist_title)

    def _talk_info(self, url, video_id=0):
        """Return the video for the talk in the url"""
        m = re.match(self._VALID_URL, url,re.VERBOSE)
        video_name = m.group('name')
        webpage = self._download_webpage(url, video_id, 'Downloading \"%s\" page' % video_name)
        self.report_extraction(video_name)
        # If the url includes the language we get the title translated
        title = self._html_search_regex(r'<span id="altHeadline" >(?P<title>.*)</span>',
        json_data = self._search_regex(r'<script.*?>var talkDetails = ({.*?})</script>',
            webpage, 'json data')
        info = json.loads(json_data)
        desc = self._html_search_regex(r'<div class="talk-intro">.*?<p.*?>(.*?)</p>',
            webpage, 'description', flags = re.DOTALL)

        thumbnail = self._search_regex(r'</span>[\s.]*</div>[\s.]*<img src="(.*?)"',
            webpage, 'thumbnail')
            # The last htmlStreams entry is used as the media URL.
            'url': info['htmlStreams'][-1]['file'],
            'thumbnail': thumbnail,
            'description': desc,
class MySpassIE(InfoExtractor):
    """Information extractor for myspass.de (metadata via a per-video XML API)."""
    _VALID_URL = r'http://www.myspass.de/.*'

    def _real_extract(self, url):
        META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'

        # video id is the last path element of the URL
        # usually there is a trailing slash, so also try the second but last
        url_path = compat_urllib_parse_urlparse(url).path
        url_parent_path, video_id = os.path.split(url_path)
            _, video_id = os.path.split(url_parent_path)

        # get metadata
        metadata_url = META_DATA_URL_TEMPLATE % video_id
        metadata_text = self._download_webpage(metadata_url, video_id)
        metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))

        # extract values from metadata
        url_flv_el = metadata.find('url_flv')
        if url_flv_el is None:
            raise ExtractorError(u'Unable to extract download url')
        video_url = url_flv_el.text
        extension = os.path.splitext(video_url)[1][1:]
        title_el = metadata.find('title')
        if title_el is None:
            raise ExtractorError(u'Unable to extract title')
        title = title_el.text
        format_id_el = metadata.find('format_id')
        if format_id_el is None:
            format = format_id_el.text
        # description and thumbnail are optional in the metadata document
        description_el = metadata.find('description')
        if description_el is not None:
            description = description_el.text
        imagePreview_el = metadata.find('imagePreview')
        if imagePreview_el is not None:
            thumbnail = imagePreview_el.text
            'thumbnail': thumbnail,
            'description': description
class SpiegelIE(InfoExtractor):
    """Information extractor for spiegel.de videos."""
    _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url)
        video_id = m.group('videoID')

        webpage = self._download_webpage(url, video_id)

        video_title = self._html_search_regex(r'<div class="module-title">(.*?)</div>',

        # A per-video XML document lists the available encodings.
        xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
        xml_code = self._download_webpage(xml_url, video_id,
            note=u'Downloading XML', errnote=u'Failed to download XML')

        idoc = xml.etree.ElementTree.fromstring(xml_code)
        last_type = idoc[-1]  # last entry — presumably the best quality; TODO confirm
        filename = last_type.findall('./filename')[0].text
        duration = float(last_type.findall('./duration')[0].text)

        video_url = 'http://video2.spiegel.de/flash/' + filename
        video_ext = filename.rpartition('.')[2]
            'title': video_title,
            'duration': duration,
class LiveLeakIE(InfoExtractor):
    """Information extractor for liveleak.com view pages."""
    _VALID_URL = r'^(?:http?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
    IE_NAME = u'liveleak'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('video_id')

        webpage = self._download_webpage(url, video_id)

        video_url = self._search_regex(r'file: "(.*?)",',
            webpage, u'video URL')

        # Strip the site-name prefix off the og:title value.
        video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
            webpage, u'title').replace('LiveLeak.com -', '').strip()

        video_description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
            webpage, u'description', fatal=False)

        video_uploader = self._html_search_regex(r'By:.*?(\w+)</a>',
            webpage, u'uploader', fatal=False)

            'title': video_title,
            'description': video_description,
            'uploader': video_uploader
class TumblrIE(InfoExtractor):
    """Information extractor for tumblr.com video posts."""
    _VALID_URL = r'http://(?P<blog_name>.*?)\.tumblr\.com/((post)|(video))/(?P<id>\d*)/(.*?)'

    def _real_extract(self, url):
        m_url = re.match(self._VALID_URL, url)
        video_id = m_url.group('id')
        blog = m_url.group('blog_name')

        # Normalise to the canonical post URL before downloading.
        url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id)
        webpage = self._download_webpage(url, video_id)

        # The video URL is hex-escaped (\x22 quotes) inside inline javascript.
        re_video = r'src=\\x22(?P<video_url>http://%s\.tumblr\.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P<ext>.*?)\\x22' % (blog, video_id)
        video = re.search(re_video, webpage)
            raise ExtractorError(u'Unable to extract video')
        video_url = video.group('video_url')
        ext = video.group('ext')

        video_thumbnail = self._search_regex(r'posters(.*?)\[\\x22(?P<thumb>.*?)\\x22',
            webpage, u'thumbnail', fatal=False)  # We pick the first poster
        if video_thumbnail: video_thumbnail = video_thumbnail.replace('\\', '')

        # The only place where you can get a title, it's not complete,
        # but searching in other places doesn't work for all videos
        video_title = self._html_search_regex(r'<title>(?P<title>.*?)</title>',
            webpage, u'title', flags=re.DOTALL)

        return [{'id': video_id,
                 'title': video_title,
                 'thumbnail': video_thumbnail,
class BandcampIE(InfoExtractor):
    """Information extractor for free bandcamp.com tracks."""
    _VALID_URL = r'http://.*?\.bandcamp\.com/track/(?P<title>.*)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        title = mobj.group('title')
        webpage = self._download_webpage(url, title)
        # We get the link to the free download page
        m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage)
        if m_download is None:
            raise ExtractorError(u'No free songs found')

        download_link = m_download.group(1)
        # NOTE(review): 'id' shadows the builtin; consider renaming.
        id = re.search(r'var TralbumData = {(.*?)id: (?P<id>\d*?)$',
                       webpage, re.MULTILINE|re.DOTALL).group('id')

        download_webpage = self._download_webpage(download_link, id,
                                                  'Downloading free downloads page')
        # We get the dictionary of the track from some javascrip code
        info = re.search(r'items: (.*?),$',
                         download_webpage, re.MULTILINE).group(1)
        info = json.loads(info)[0]
        # We pick mp3-320 for now, until format selection can be easily implemented.
        mp3_info = info[u'downloads'][u'mp3-320']
        # If we try to use this url it says the link has expired
        initial_url = mp3_info[u'url']
        re_url = r'(?P<server>http://(.*?)\.bandcamp\.com)/download/track\?enc=mp3-320&fsig=(?P<fsig>.*?)&id=(?P<id>.*?)&ts=(?P<ts>.*)$'
        m_url = re.match(re_url, initial_url)
        #We build the url we will use to get the final track url
        # This url is build in Bandcamp in the script download_bunde_*.js
        request_url = '%s/statdownload/track?enc=mp3-320&fsig=%s&id=%s&ts=%s&.rand=665028774616&.vrs=1' % (m_url.group('server'), m_url.group('fsig'), id, m_url.group('ts'))
        final_url_webpage = self._download_webpage(request_url, id, 'Requesting download url')
        # If we could correctly generate the .rand field the url would be
        #in the "download_url" key
        final_url = re.search(r'"retry_url":"(.*?)"', final_url_webpage).group(1)

        track_info = {'id':id,
                      'title' : info[u'title'],
                      'thumbnail' : info[u'thumb_url'],
                      'uploader' : info[u'artist']
class RedTubeIE(InfoExtractor):
    """Information Extractor for redtube"""
    _VALID_URL = r'(?:http://)?(?:www\.)?redtube\.com/(?P<id>[0-9]+)'

    def _real_extract(self,url):
        mobj = re.match(self._VALID_URL, url)
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')
        video_extension = 'mp4'
        webpage = self._download_webpage(url, video_id)

        self.report_extraction(video_id)

        # The page serves an HTML5 <source> tag with the mp4 URL.
        video_url = self._html_search_regex(r'<source src="(.+?)" type="video/mp4">',
            webpage, u'video URL')

        video_title = self._html_search_regex('<h1 class="videoTitle slidePanelMovable">(.+?)</h1>',
            'ext': video_extension,
            'title': video_title,
class InaIE(InfoExtractor):
    """Information Extractor for Ina.fr"""
    _VALID_URL = r'(?:http://)?(?:www\.)?ina\.fr/video/(?P<id>I[0-9]+)/.*'

    def _real_extract(self,url):
        mobj = re.match(self._VALID_URL, url)

        video_id = mobj.group('id')
        # The player's MRSS feed exposes the actual mp4 URL and title.
        mrss_url='http://player.ina.fr/notices/%s.mrss' % video_id
        video_extension = 'mp4'
        webpage = self._download_webpage(mrss_url, video_id)

        self.report_extraction(video_id)

        video_url = self._html_search_regex(r'<media:player url="(?P<mp4url>http://mp4.ina.fr/[^"]+\.mp4)',
            webpage, u'video URL')

        video_title = self._search_regex(r'<title><!\[CDATA\[(?P<titre>.*?)]]></title>',
            'ext': video_extension,
            'title': video_title,
class HowcastIE(InfoExtractor):
    """Information Extractor for Howcast.com"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?howcast\.com/videos/(?P<id>\d+)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)

        video_id = mobj.group('id')
        # Canonicalise to the plain video URL before fetching.
        webpage_url = 'http://www.howcast.com/videos/' + video_id
        webpage = self._download_webpage(webpage_url, video_id)

        self.report_extraction(video_id)

        video_url = self._search_regex(r'\'?file\'?: "(http://mobile-media\.howcast\.com/[0-9]+\.mp4)',
            webpage, u'video URL')

        video_title = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') property=\'og:title\'',

        video_description = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') name=\'description\'',
            webpage, u'description', fatal=False)

        thumbnail = self._html_search_regex(r'<meta content=\'(.+?)\' property=\'og:image\'',
            webpage, u'thumbnail', fatal=False)
            'title': video_title,
            'description': video_description,
            'thumbnail': thumbnail,
class VineIE(InfoExtractor):
    """Information Extractor for Vine.co"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?vine\.co/v/(?P<id>\w+)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)

        video_id = mobj.group('id')
        webpage_url = 'https://vine.co/v/' + video_id
        webpage = self._download_webpage(webpage_url, video_id)

        self.report_extraction(video_id)

        # The raw stream URL is published in a twitter player meta tag.
        video_url = self._html_search_regex(r'<meta property="twitter:player:stream" content="(.+?)"',
            webpage, u'video URL')

        video_title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',

        thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)(\?.*?)?"',
            webpage, u'thumbnail', fatal=False)

        uploader = self._html_search_regex(r'<div class="user">.*?<h2>(.+?)</h2>',
            webpage, u'uploader', fatal=False, flags=re.DOTALL)
            'title': video_title,
            'thumbnail': thumbnail,
            'uploader': uploader,
class FlickrIE(InfoExtractor):
    """Information Extractor for Flickr videos"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?flickr\.com/photos/(?P<uploader_id>[\w\-_@]+)/(?P<id>\d+).*'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)

        video_id = mobj.group('id')
        video_uploader_id = mobj.group('uploader_id')
        webpage_url = 'http://www.flickr.com/photos/' + video_uploader_id + '/' + video_id
        webpage = self._download_webpage(webpage_url, video_id)

        # The per-photo secret is required by both playlist requests below.
        secret = self._search_regex(r"photo_secret: '(\w+)'", webpage, u'secret')

        first_url = 'https://secure.flickr.com/apps/video/video_mtl_xml.gne?v=x&photo_id=' + video_id + '&secret=' + secret + '&bitrate=700&target=_self'
        first_xml = self._download_webpage(first_url, video_id, 'Downloading first data webpage')

        node_id = self._html_search_regex(r'<Item id="id">(\d+-\d+)</Item>',
            first_xml, u'node_id')

        second_url = 'https://secure.flickr.com/video_playlist.gne?node_id=' + node_id + '&tech=flash&mode=playlist&bitrate=700&secret=' + secret + '&rd=video.yahoo.com&noad=1'
        second_xml = self._download_webpage(second_url, video_id, 'Downloading second data webpage')

        self.report_extraction(video_id)

        # The final media URL is APP + FULLPATH from the <STREAM> element.
        mobj = re.search(r'<STREAM APP="(.+?)" FULLPATH="(.+?)"', second_xml)
            raise ExtractorError(u'Unable to extract video url')
        video_url = mobj.group(1) + unescapeHTML(mobj.group(2))

        video_title = self._html_search_regex(r'<meta property="og:title" content=(?:"([^"]+)"|\'([^\']+)\')',
            webpage, u'video title')

        video_description = self._html_search_regex(r'<meta property="og:description" content=(?:"([^"]+)"|\'([^\']+)\')',
            webpage, u'description', fatal=False)

        thumbnail = self._html_search_regex(r'<meta property="og:image" content=(?:"([^"]+)"|\'([^\']+)\')',
            webpage, u'thumbnail', fatal=False)
            'title': video_title,
            'description': video_description,
            'thumbnail': thumbnail,
            'uploader_id': video_uploader_id,
class TeamcocoIE(InfoExtractor):
    """Information extractor for teamcoco.com videos."""
    _VALID_URL = r'http://teamcoco\.com/video/(?P<url_title>.*)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
            raise ExtractorError(u'Invalid URL: %s' % url)
        url_title = mobj.group('url_title')
        webpage = self._download_webpage(url, url_title)

        # The numeric video id is embedded in the article markup.
        video_id = self._html_search_regex(r'<article class="video" data-id="(\d+?)"',
            webpage, u'video id')

        self.report_extraction(video_id)

        video_title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',

        thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)"',
            webpage, u'thumbnail', fatal=False)

        video_description = self._html_search_regex(r'<meta property="og:description" content="(.*?)"',
            webpage, u'description', fatal=False)

        # A separate XML document lists the media files; use the "high" one.
        data_url = 'http://teamcoco.com/cvp/2.0/%s.xml' % video_id
        data = self._download_webpage(data_url, video_id, 'Downloading data webpage')

        video_url = self._html_search_regex(r'<file type="high".*?>(.*?)</file>',
            'title': video_title,
            'thumbnail': thumbnail,
            'description': video_description,
class XHamsterIE(InfoExtractor):
    """Information Extractor for xHamster"""
    _VALID_URL = r'(?:http://)?(?:www.)?xhamster\.com/movies/(?P<id>[0-9]+)/.*\.html'

    def _real_extract(self,url):
        mobj = re.match(self._VALID_URL, url)

        video_id = mobj.group('id')
        mrss_url = 'http://xhamster.com/movies/%s/.html' % video_id
        webpage = self._download_webpage(mrss_url, video_id)

        mobj = re.search(r'\'srv\': \'(?P<server>[^\']*)\',\s*\'file\': \'(?P<file>[^\']+)\',', webpage)
            raise ExtractorError(u'Unable to extract media URL')
        if len(mobj.group('server')) == 0:
            # Without a server prefix the 'file' value is a complete URL.
            video_url = compat_urllib_parse.unquote(mobj.group('file'))
            video_url = mobj.group('server')+'/key='+mobj.group('file')
        video_extension = video_url.split('.')[-1]

        video_title = self._html_search_regex(r'<title>(?P<title>.+?) - xHamster\.com</title>',

        # Can't see the description anywhere in the UI
        # video_description = self._html_search_regex(r'<span>Description: </span>(?P<description>[^<]+)',
        #     webpage, u'description', fatal=False)
        # if video_description: video_description = unescapeHTML(video_description)

        # Upload date is only available as a tooltip hint; fall back to None.
        mobj = re.search(r'hint=\'(?P<upload_date_Y>[0-9]{4})-(?P<upload_date_m>[0-9]{2})-(?P<upload_date_d>[0-9]{2}) [0-9]{2}:[0-9]{2}:[0-9]{2} [A-Z]{3,4}\'', webpage)
            video_upload_date = mobj.group('upload_date_Y')+mobj.group('upload_date_m')+mobj.group('upload_date_d')
            video_upload_date = None
            self._downloader.report_warning(u'Unable to extract upload date')

        video_uploader_id = self._html_search_regex(r'<a href=\'/user/[^>]+>(?P<uploader_id>[^<]+)',
            webpage, u'uploader id', default=u'anonymous')

        video_thumbnail = self._search_regex(r'\'image\':\'(?P<thumbnail>[^\']+)\'',
            webpage, u'thumbnail', fatal=False)
            'ext': video_extension,
            'title': video_title,
            # 'description': video_description,
            'upload_date': video_upload_date,
            'uploader_id': video_uploader_id,
            'thumbnail': video_thumbnail
class HypemIE(InfoExtractor):
    """Information Extractor for hypem"""
    _VALID_URL = r'(?:http://)?(?:www\.)?hypem\.com/track/([^/]+)/([^/]+)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): the 'if mobj is None:' guard for this raise appears
        # to be elided in this view.
        raise ExtractorError(u'Invalid URL: %s' % url)
        track_id = mobj.group(1)

        # 'ax'/'ts' query parameters make the page embed the track list as
        # JSON; 'ts' is the current Unix timestamp.
        data = { 'ax': 1, 'ts': time.time() }
        data_encoded = compat_urllib_parse.urlencode(data)
        complete_url = url + "?" + data_encoded
        request = compat_urllib_request.Request(complete_url)
        # Keep the response handle: the session cookie it sets is required
        # by the serve/source endpoint below.
        response, urlh = self._download_webpage_handle(request, track_id, u'Downloading webpage with the url')
        cookie = urlh.headers.get('Set-Cookie', '')

        self.report_extraction(track_id)

        # Track metadata lives in an inline <script type="application/json">.
        html_tracks = self._html_search_regex(r'<script type="application/json" id="displayList-data">(.*?)</script>',
            response, u'tracks', flags=re.MULTILINE|re.DOTALL).strip()
        # NOTE(review): the enclosing 'try:' for this json.loads appears to
        # be elided; the raise below is its except branch.
        track_list = json.loads(html_tracks)
        track = track_list[u'tracks'][0]
        raise ExtractorError(u'Hypemachine contained invalid JSON.')

        track_id = track[u"id"]
        artist = track[u"artist"]
        title = track[u"song"]

        # NOTE(review): 'key' is read from the track dict on a line elided
        # from this view.
        serve_url = "http://hypem.com/serve/source/%s/%s" % (compat_str(track_id), compat_str(key))
        # Empty POST body with a JSON content type; the session cookie from
        # the first request authorizes the call.
        request = compat_urllib_request.Request(serve_url, "" , {'Content-Type': 'application/json'})
        request.add_header('cookie', cookie)
        song_data_json = self._download_webpage(request, track_id, u'Downloading metadata')
        # NOTE(review): enclosing 'try:' appears elided here as well.
        song_data = json.loads(song_data_json)
        raise ExtractorError(u'Hypemachine contained invalid JSON.')
        final_url = song_data[u"url"]
class Vbox7IE(InfoExtractor):
    """Information Extractor for Vbox7"""
    _VALID_URL = r'(?:http://)?(?:www\.)?vbox7\.com/play:([^/]+)'

    def _real_extract(self,url):
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): the 'if mobj is None:' guard for this raise appears
        # to be elided in this view.
        raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group(1)

        # The play: URL serves a JS redirect; follow it manually by reading
        # the window.location assignment out of the page.
        redirect_page, urlh = self._download_webpage_handle(url, video_id)
        new_location = self._search_regex(r'window\.location = \'(.*)\';', redirect_page, u'redirect location')
        # new_location is relative to the URL we actually landed on.
        redirect_url = urlh.geturl() + new_location
        webpage = self._download_webpage(redirect_url, video_id, u'Downloading redirect page')

        # Page title has the form "<video title>/..."; keep the first part.
        title = self._html_search_regex(r'<title>(.*)</title>',
            webpage, u'title').split('/')[0].strip()

        # POST to the flash info endpoint to obtain the media and thumbnail
        # URLs, returned as a '&'-joined 'key=value' string.
        info_url = "http://vbox7.com/play/magare.do"
        data = compat_urllib_parse.urlencode({'as3':'1','vid':video_id})
        info_request = compat_urllib_request.Request(info_url, data)
        info_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
        info_response = self._download_webpage(info_request, video_id, u'Downloading info webpage')
        if info_response is None:
            raise ExtractorError(u'Unable to extract the media url')
        # NOTE(review): tuple-unpacking a map() only works on Python 2;
        # under Python 3 map() returns an iterator — confirm target runtime.
        (final_url, thumbnail_url) = map(lambda x: x.split('=')[1], info_response.split('&'))

        'thumbnail': thumbnail_url,
def gen_extractors():
    """ Return a list of an instance of every supported extractor.
    The order does matter; the first extractor matched is the one handling the URL.
    YoutubePlaylistIE(),
    StanfordOpenClassroomIE(),  # NOTE(review): bulk of the extractor list is elided from this view
    WorldStarHipHopIE(),
def get_info_extractor(ie_name):
    """Return the info-extractor class named ``ie_name`` + 'IE'.

    Looks the class up in this module's global namespace; a KeyError
    propagates to the caller when no such extractor exists.
    """
    class_name = ie_name + 'IE'
    return globals()[class_name]