10 import xml.etree.ElementTree
19 from .extractor.common import InfoExtractor, SearchInfoExtractor
21 from .extractor.ard import ARDIE
22 from .extractor.arte import ArteTvIE
23 from .extractor.bliptv import BlipTVIE, BlipTVUserIE
24 from .extractor.comedycentral import ComedyCentralIE
25 from .extractor.collegehumor import CollegeHumorIE
26 from .extractor.dailymotion import DailymotionIE
27 from .extractor.depositfiles import DepositFilesIE
28 from .extractor.escapist import EscapistIE
29 from .extractor.facebook import FacebookIE
30 from .extractor.gametrailers import GametrailersIE
31 from .extractor.generic import GenericIE
32 from .extractor.googleplus import GooglePlusIE
33 from .extractor.googlesearch import GoogleSearchIE
34 from .extractor.infoq import InfoQIE
35 from .extractor.metacafe import MetacafeIE
36 from .extractor.myvideo import MyVideoIE
37 from .extractor.statigram import StatigramIE
38 from .extractor.photobucket import PhotobucketIE
39 from .extractor.soundcloud import SoundcloudIE, SoundcloudSetIE
40 from .extractor.stanfordoc import StanfordOpenClassroomIE
41 from .extractor.vimeo import VimeoIE
42 from .extractor.xvideos import XVideosIE
43 from .extractor.yahoo import YahooIE, YahooSearchIE
44 from .extractor.youtube import YoutubeIE, YoutubePlaylistIE, YoutubeSearchIE, YoutubeUserIE, YoutubeChannelIE
45 from .extractor.zdf import ZDFIE
class MixcloudIE(InfoExtractor):
    """Information extractor for www.mixcloud.com"""

    _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
    _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'mixcloud'

    @staticmethod
    def _to_text(value):
        """Return value as text, decoding UTF-8 only if it is a byte string.

        Calling .decode() unconditionally fails on values that are already
        text, so every former .decode('utf-8') call goes through here.
        """
        if isinstance(value, bytes):
            return value.decode('utf-8')
        return value

    def report_download_json(self, file_id):
        """Report JSON download."""
        self.to_screen(u'Downloading json')

    def get_urls(self, jsonData, fmt, bitrate='best'):
        """Get urls from 'audio_formats' section in json

        jsonData[fmt] is either indexable by bitrate or, when no bitrate
        information exists, is the URL list itself; the latter case shows up
        as a TypeError on the bitrate lookup.
        """
        try:
            bitrate_list = jsonData[fmt]
            if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
                bitrate = max(bitrate_list) # select highest
            url_list = jsonData[fmt][bitrate]
        except TypeError: # we have no bitrate info.
            url_list = jsonData[fmt]
        return url_list

    def check_urls(self, url_list):
        """Returns 1st active url from list, or None when none can be opened"""
        for url in url_list:
            try:
                # Only reachability matters; close the probe connection
                # instead of leaking it.
                compat_urllib_request.urlopen(url).close()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error):
                continue
            return url
        return None

    def _print_formats(self, formats):
        """Print one line per format/bitrate pair with its file extension."""
        print('Available formats:')
        for fmt in formats:
            for b in formats[fmt]:
                try:
                    ext = formats[fmt][b][0]
                    print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
                except TypeError: # we have no bitrate info
                    ext = formats[fmt][0]
                    print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
                    break

    def _real_extract(self, url):
        """Query the cloudcast JSON API and return a one-element info list.

        Returns None after printing the format table when 'listformats' is
        set. Raises ExtractorError for an invalid URL, a failed API download,
        an unavailable requested format, or when no candidate URL is
        reachable.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        # extract uploader & filename from url
        uploader = self._to_text(mobj.group(1))
        file_id = uploader + "-" + self._to_text(mobj.group(2))

        # construct API request
        api_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
        # retrieve .json file with links to files
        request = compat_urllib_request.Request(api_url)
        try:
            self.report_download_json(api_url)
            jsonData = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to retrieve file: %s' % compat_str(err))

        # parse JSON (read() yields bytes; decode before parsing)
        json_data = json.loads(self._to_text(jsonData))
        player_url = json_data['player_swf_url']
        formats = dict(json_data['audio_formats'])

        req_format = self._downloader.params.get('format', None)

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        # Kept separate from api_url so that "nothing found" stays
        # distinguishable from the API address.
        file_url = None
        format_param = None
        if req_format is None or req_format == 'best':
            for candidate in formats:
                format_param = candidate
                # check urls
                file_url = self.check_urls(self.get_urls(formats, candidate))
                if file_url is not None:
                    break # got it!
        else:
            if req_format not in formats:
                raise ExtractorError(u'Format is not available')

            file_url = self.check_urls(self.get_urls(formats, req_format))
            format_param = req_format

        if file_url is None:
            raise ExtractorError(u'Unable to find a working download URL')

        file_url = self._to_text(file_url)
        return [{
            'id': file_id,
            'url': file_url,
            'uploader': uploader,
            'upload_date': None,
            'title': json_data['name'],
            'ext': file_url.split('.')[-1],
            'format': u'NA' if format_param is None else self._to_text(format_param),
            'thumbnail': json_data['thumbnail_url'],
            'description': json_data['description'],
            'player_url': self._to_text(player_url),
        }]
class MTVIE(InfoExtractor):
    """Information extractor for MTV.com"""

    _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
    _WORKING = False

    def _real_extract(self, url):
        """Resolve an MTV video page to a one-element list of info dicts.

        Raises ExtractorError for an invalid URL, for a page lacking the
        media URI or playlist id, for a failed metadata download, and for
        missing or malformed rendition data.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        if not mobj.group('proto'):
            url = 'http://' + url
        video_id = mobj.group('videoid')

        webpage = self._download_webpage(url, video_id)

        # Looked up as before (non-fatal); the value itself is not used below.
        song_name = self._html_search_regex(r'<meta name="mtv_vt" content="([^"]+)"/>',
            webpage, u'song name', fatal=False)

        video_title = self._html_search_regex(r'<meta name="mtv_an" content="([^"]+)"/>',
            webpage, u'title')

        mtvn_uri = self._html_search_regex(r'<meta name="mtvn_uri" content="([^"]+)"/>',
            webpage, u'mtvn_uri', fatal=False)

        content_id = self._search_regex(r'MTVN.Player.defaultPlaylistId = ([0-9]+);',
            webpage, u'content id', fatal=False)

        # Both values are concatenated into the mediaGen URL; a missing one
        # previously surfaced as a TypeError on str + None.
        if mtvn_uri is None or content_id is None:
            raise ExtractorError(u'Unable to extract media URI or content id')

        videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
        self.report_extraction(video_id)
        request = compat_urllib_request.Request(videogen_url)
        try:
            metadataXml = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to download video metadata: %s' % compat_str(err))

        mdoc = xml.etree.ElementTree.fromstring(metadataXml)
        renditions = mdoc.findall('.//rendition')
        if not renditions:
            raise ExtractorError(u'Unable to find any rendition')

        # For now, always pick the highest quality.
        rendition = renditions[-1]

        try:
            _, _, ext = rendition.attrib['type'].partition('/')
            video_format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
            video_url = rendition.find('./src').text
        except (KeyError, AttributeError):
            # AttributeError: find('./src') returned None (no <src> child).
            raise ExtractorError('Invalid rendition field.')

        info = {
            'id': video_id,
            'url': video_url,
            # The old code referenced an undefined name `performer` here.
            # NOTE(review): no performer is extracted anywhere in this
            # method; confirm which meta tag should feed this field.
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': ext,
            'format': video_format,
        }

        return [info]
class YoukuIE(InfoExtractor):
    _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'

    def _gen_sid(self):
        """Build a session id: millisecond timestamp plus two random numbers."""
        nowTime = int(time.time() * 1000)
        random1 = random.randint(1000, 1998)
        random2 = random.randint(1000, 9999)

        return "%d%d%d" % (nowTime, random1, random2)

    def _get_file_ID_mix_string(self, seed):
        """Return the alphabet shuffled by the seed-driven generator, as a list.

        Each step advances the seed with a linear congruential update and
        moves the character at the derived index from the source alphabet
        to the result.
        """
        # The backslash is written as an explicit escape; the previous "\:"
        # was an invalid escape sequence that only yields the same string
        # by accident.
        source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\\:._-1234567890")
        seed = float(seed)
        mixed = []
        while source:
            seed = (seed * 211 + 30031) % 65536
            index = int(math.floor(seed / 65536 * len(source)))
            mixed.append(source.pop(index))
        return mixed

    def _get_file_id(self, fileId, seed):
        """Translate a '*'-separated list of indices into the real file id."""
        mixed = self._get_file_ID_mix_string(seed)
        return ''.join(mixed[int(ch)] for ch in fileId.split('*') if ch)

    def _real_extract(self, url):
        """Return one info dict per segment of the video.

        Raises ExtractorError for an invalid URL or when the playlist JSON
        lacks the expected fields.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('ID')

        info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id

        jsondata = self._download_webpage(info_url, video_id)

        self.report_extraction(video_id)
        try:
            config = json.loads(jsondata)
            entry = config['data'][0]

            video_title = entry['title']
            seed = entry['seed']

            req_format = self._downloader.params.get('format', None)
            supported_format = list(entry['streamfileids'].keys())

            if req_format is None or req_format == 'best':
                stream = 'hd2' if 'hd2' in supported_format else 'flv'
                ext = u'flv'
            elif req_format == 'worst':
                stream = 'mp4'
                ext = u'mp4'
            else:
                stream = 'flv'
                ext = u'flv'

            fileid = entry['streamfileids'][stream]
            keys = [s['k'] for s in entry['segs'][stream]]
        except (UnicodeDecodeError, ValueError, KeyError, IndexError):
            # IndexError: the 'data' list was empty.
            raise ExtractorError(u'Unable to extract info section')

        files_info = []
        sid = self._gen_sid()
        fileid = self._get_file_id(fileid, seed)

        #column 8,9 of fileid represent the segment number
        #fileid[7:9] should be changed
        for index, key in enumerate(keys):
            temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
            download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)

            info = {
                'id': '%s_part%02d' % (video_id, index),
                'url': download_url,
                'uploader': None,
                'upload_date': None,
                'title': video_title,
                'ext': ext,
            }
            files_info.append(info)

        return files_info
class XNXXIE(InfoExtractor):
    """Information extractor for xnxx.com"""

    _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
    IE_NAME = u'xnxx'
    VIDEO_URL_RE = r'flv_url=(.*?)&'
    VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
    VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&'

    def _real_extract(self, url):
        """Scrape the watch page and return a one-element list of info dicts.

        Raises ExtractorError when the URL does not match _VALID_URL.
        """
        match = re.match(self._VALID_URL, url)
        if match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = match.group(1)

        # Get webpage content
        page = self._download_webpage(url, video_id)

        # The media URL is URL-unquoted before it is handed on.
        media_url = compat_urllib_parse.unquote(
            self._search_regex(self.VIDEO_URL_RE, page, u'video URL'))

        title = self._html_search_regex(self.VIDEO_TITLE_RE, page, u'title')

        # Thumbnail lookup is non-fatal.
        thumb = self._search_regex(self.VIDEO_THUMB_RE, page, u'thumbnail', fatal=False)

        info = {
            'id': video_id,
            'url': media_url,
            'uploader': None,
            'upload_date': None,
            'title': title,
            'ext': 'flv',
            'thumbnail': thumb,
            'description': None,
        }
        return [info]
class NBAIE(InfoExtractor):
    _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*?)(?:/index\.html)?(?:\?.*)?$'
    IE_NAME = u'nba'

    def _real_extract(self, url):
        """Return a one-element info list for an nba.com video page.

        The media URL is built from the path captured by _VALID_URL; title
        and description are scraped from the page. Raises ExtractorError
        when the URL does not match.
        """
        match = re.match(self._VALID_URL, url)
        if match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_path = match.group(1)
        page = self._download_webpage(url, video_path)

        # Last path component doubles as the id and as the fallback title.
        short_id = video_path.rpartition('/')[2]

        raw_title = self._html_search_regex(r'<meta property="og:title" content="(.*?)"',
            page, 'title', default=short_id)
        title = raw_title.replace('NBA.com: ', '')

        # It isn't there in the HTML it returns to us
        # uploader_date = self._html_search_regex(r'<b>Date:</b> (.*?)</div>', webpage, 'upload_date', fatal=False)

        description = self._html_search_regex(r'<meta name="description" (?:content|value)="(.*?)" />', page, 'description', fatal=False)

        return [{
            'id': short_id,
            'url': u'http://ht-mobile.cdn.turner.com/nba/big' + video_path + '_nba_1280x720.mp4',
            'ext': 'mp4',
            'title': title,
            # 'uploader_date': uploader_date,
            'description': description,
        }]
class JustinTVIE(InfoExtractor):
    """Information extractor for justin.tv and twitch.tv"""
    # TODO: One broadcast may be split into multiple videos. The key
    # 'broadcast_id' is the same for all parts, and 'broadcast_part'
    # starts at 1 and increases. Can we treat all parts as one video?

    _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
        (?:
            (?P<channelid>[^/]+)|
            (?:(?:[^/]+)/b/(?P<videoid>[^/]+))|
            (?:(?:[^/]+)/c/(?P<chapterid>[^/]+))
        )
        /?(?:\#.*)?$
        """
    _JUSTIN_PAGE_LIMIT = 100
    IE_NAME = u'justin.tv'

    def report_download_page(self, channel, offset):
        """Report attempt to download a single page of videos."""
        last = offset + self._JUSTIN_PAGE_LIMIT
        self.to_screen(u'%s: Downloading video information from %d to %d' %
                (channel, offset, last))

    # Return count of items, list of *valid* items
    def _parse_page(self, url, video_id):
        """Download one API page and convert its clips to info dicts.

        Returns (number of clips in the response, info dicts for the clips
        that carry a video URL). Raises ExtractorError when the response is
        not a JSON list.
        """
        webpage = self._download_webpage(url, video_id,
                                         u'Downloading video info JSON',
                                         u'unable to download video info JSON')

        payload = json.loads(webpage)
        if not isinstance(payload, list):
            raise ExtractorError(u'Justin.tv API: %s' % payload.get('error', 'unknown error'))

        entries = []
        for clip in payload:
            clip_url = clip['video_file_url']
            if not clip_url:
                # Clips without a file URL are counted but not returned.
                continue
            extension = os.path.splitext(clip_url)[1][1:]
            # First ten characters of start_time with the dashes removed.
            upload_date = clip['start_time'][:10].replace('-', '')
            uploader_id = clip.get('user_id', clip.get('channel_id'))
            clip_id = clip['id']
            entries.append({
                'id': clip_id,
                'url': clip_url,
                'title': clip.get('title', clip_id),
                'uploader': clip.get('channel_name', uploader_id),
                'uploader_id': uploader_id,
                'upload_date': upload_date,
                'ext': extension,
            })
        return (len(payload), entries)

    def _extract_chapter(self, url, chapter_id):
        """Return the one-element info list for a /c/<chapter> URL."""
        api = 'http://api.justin.tv' + '/broadcast/by_chapter/%s.xml' % chapter_id

        webpage = self._download_webpage(url, chapter_id)
        m = re.search(r'PP\.archive_id = "([0-9]+)";', webpage)
        if not m:
            raise ExtractorError(u'Cannot find archive of a chapter')
        archive_id = m.group(1)

        chapter_info_xml = self._download_webpage(api, chapter_id,
                                         note=u'Downloading chapter information',
                                         errnote=u'Chapter information download failed')
        doc = xml.etree.ElementTree.fromstring(chapter_info_xml)

        # Pick the first <archive> whose <id> equals the id found in the page.
        archive = None
        for candidate in doc.findall('.//archive'):
            if archive_id == candidate.find('./id').text:
                archive = candidate
                break
        if archive is None:
            raise ExtractorError(u'Could not find chapter in chapter information')

        video_url = archive.find('./video_file_url').text
        video_ext = video_url.rpartition('.')[2] or u'flv'

        chapter_api_url = u'https://api.twitch.tv/kraken/videos/c' + chapter_id
        chapter_info_json = self._download_webpage(chapter_api_url, u'c' + chapter_id,
                               note='Downloading chapter metadata',
                               errnote='Download of chapter metadata failed')
        chapter_info = json.loads(chapter_info_json)

        bracket_start = int(doc.find('.//bracket_start').text)
        bracket_end = int(doc.find('.//bracket_end').text)

        # TODO determine start (and probably fix up file)
        #  youtube-dl -v http://www.twitch.tv/firmbelief/c/1757457
        #video_url += u'?start=' + TODO:start_timestamp
        # bracket_start is 13290, but we want 51670615
        self._downloader.report_warning(u'Chapter detected, but we can just download the whole file. '
                                        u'Chapter starts at %s and ends at %s' % (formatSeconds(bracket_start), formatSeconds(bracket_end)))

        channel = chapter_info['channel']
        return [{
            'id': u'c' + chapter_id,
            'url': video_url,
            'ext': video_ext,
            'title': chapter_info['title'],
            'thumbnail': chapter_info['preview'],
            'description': chapter_info['description'],
            'uploader': channel['display_name'],
            'uploader_id': channel['name'],
        }]

    def _real_extract(self, url):
        """Dispatch on URL kind: channel archive (paged), chapter, or broadcast."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'invalid URL: %s' % url)

        api_base = 'http://api.justin.tv'
        channel_id = mobj.group('channelid')
        chapter_id = mobj.group('chapterid')

        if channel_id:
            paged = True
            video_id = channel_id
            api = api_base + '/channel/archives/%s.json' % video_id
        elif chapter_id:
            return self._extract_chapter(url, chapter_id)
        else:
            paged = False
            video_id = mobj.group('videoid')
            api = api_base + '/broadcast/by_archive/%s.json' % video_id

        self.report_extraction(video_id)

        limit = self._JUSTIN_PAGE_LIMIT
        collected = []
        offset = 0
        more = True
        while more:
            if paged:
                self.report_download_page(video_id, offset)
            page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
            page_count, page_info = self._parse_page(page_url, video_id)
            collected.extend(page_info)
            # A short page (or a non-paged request) ends the loop.
            more = paged and page_count == limit
            offset += limit
        return collected
class FunnyOrDieIE(InfoExtractor):
    _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'

    def _real_extract(self, url):
        """Return a one-element info list scraped from a funnyordie.com page.

        Raises ExtractorError when the URL does not match _VALID_URL.
        """
        match = re.match(self._VALID_URL, url)
        if match is None:
            raise ExtractorError(u'invalid URL: %s' % url)

        video_id = match.group('id')
        page = self._download_webpage(url, video_id)

        media_url = self._html_search_regex(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"',
            page, u'video URL', flags=re.DOTALL)

        # Two candidate patterns are passed as a tuple: the page heading,
        # then the <title> tag.
        title_patterns = (
            r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>",
            r'<title>(?P<title>[^<]+?)</title>',
        )
        title = self._html_search_regex(title_patterns, page, 'title', flags=re.DOTALL)

        description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
            page, u'description', fatal=False, flags=re.DOTALL)

        return [{
            'id': video_id,
            'url': media_url,
            'ext': 'mp4',
            'title': title,
            'description': description,
        }]
class SteamIE(InfoExtractor):
    _VALID_URL = r"""http://store\.steampowered\.com/
                (agecheck/)?
                (?P<urltype>video|app)/ #If the page is only for videos or for a game
                (?P<gameID>\d+)/?
                (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
                """
    _VIDEO_PAGE_TEMPLATE = 'http://store.steampowered.com/video/%s/'
    _AGECHECK_TEMPLATE = 'http://store.steampowered.com/agecheck/video/%s/?snr=1_agecheck_agecheck__age-gate&ageDay=1&ageMonth=January&ageYear=1970'

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is written in verbose syntax, hence the explicit flag.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        """Return a single playlist result holding every video of a game page."""
        game_id = re.match(self._VALID_URL, url, re.VERBOSE).group('gameID')

        page = self._download_webpage(self._VIDEO_PAGE_TEMPLATE % game_id, game_id)

        # When the birth-date prompt is served, fetch the page again through
        # the age-check URL.
        if re.search('<h2>Please enter your birth date to continue:</h2>', page) is not None:
            agecheck_url = self._AGECHECK_TEMPLATE % game_id
            self.report_age_confirmation()
            page = self._download_webpage(agecheck_url, game_id)

        self.report_extraction(game_id)
        game_title = self._html_search_regex(r'<h2 class="pageheader">(.*?)</h2>',
                                             page, 'game title')

        movie_re = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
        name_re = r'<span class="title">(?P<videoName>.+?)</span>'
        thumb_re = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'

        # The three match streams are paired positionally; zip stops at the
        # shortest one.
        triples = zip(re.finditer(movie_re, page),
                      re.finditer(name_re, page),
                      re.finditer(thumb_re, page))

        videos = []
        for movie, name, thumb in triples:
            video_id = movie.group('videoID')
            video_url = movie.group('videoURL')
            if not video_url:
                raise ExtractorError(u'Cannot find video url for %s' % video_id)
            videos.append({
                'id': video_id,
                'url': video_url,
                'ext': 'flv',
                'title': unescapeHTML(name.group('videoName')),
                'thumbnail': thumb.group('thumbnail')
            })
        return [self.playlist_result(videos, game_id, game_title)]
class UstreamIE(InfoExtractor):
    _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
    IE_NAME = u'ustream'

    def _real_extract(self, url):
        """Return the info dict for a recorded ustream.tv video.

        The media URL is derived from the numeric id alone; title, uploader
        and thumbnail are scraped from the page.
        """
        video_id = re.match(self._VALID_URL, url).group('videoID')

        media_url = u'http://tcdn.ustream.tv/video/%s' % video_id
        page = self._download_webpage(url, video_id)

        self.report_extraction(video_id)

        title = self._html_search_regex(r'data-title="(?P<title>.+)"',
            page, u'title')

        # Uploader and thumbnail lookups are non-fatal.
        uploader = self._html_search_regex(r'data-content-type="channel".*?>(?P<uploader>.*?)</a>',
            page, u'uploader', fatal=False, flags=re.DOTALL)

        thumbnail = self._html_search_regex(r'<link rel="image_src" href="(?P<thumb>.*?)"',
            page, u'thumbnail', fatal=False)

        return {
            'id': video_id,
            'url': media_url,
            'ext': 'flv',
            'title': title,
            'uploader': uploader,
            'thumbnail': thumbnail,
        }
class WorldStarHipHopIE(InfoExtractor):
    _VALID_URL = r'https?://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
    IE_NAME = u'WorldStarHipHop'

    def _real_extract(self, url):
        """Return a one-element info list for a WorldStar video page."""
        video_id = re.match(self._VALID_URL, url).group('id')

        page = self._download_webpage(url, video_id)

        media_url = self._search_regex(r'so\.addVariable\("file","(.*?)"\)',
            page, u'video URL')

        # Container is guessed from the URL text.
        extension = 'mp4' if 'mp4' in media_url else 'flv'

        title = self._html_search_regex(r"<title>(.*)</title>",
            page, u'title')

        # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video.
        thumbnail = self._html_search_regex(r'rel="image_src" href="(.*)" />',
            page, u'thumbnail', fatal=False)

        if not thumbnail:
            candy = re.search(r"""candytitles.*>(.*)</span>""", page)
            if candy is not None:
                title = candy.group(1)

        return [{
            'id': video_id,
            'url' : media_url,
            'title' : title,
            'thumbnail' : thumbnail,
            'ext' : extension,
        }]
class RBMARadioIE(InfoExtractor):
    _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'

    def _real_extract(self, url):
        """Return a one-element info list built from the page's embedded JSON.

        Raises ExtractorError when the embedded data is not valid JSON.
        """
        show_id = re.match(self._VALID_URL, url).group('videoID')

        page = self._download_webpage(url, show_id)

        raw_json = self._search_regex(r'window\.gon.*?gon\.show=(.+?);$',
            page, u'json data', flags=re.MULTILINE)

        try:
            show = json.loads(raw_json)
        except ValueError as e:
            raise ExtractorError(u'Invalid JSON: ' + str(e))

        media_url = show['akamai_url'] + '&cbr=256'
        # Extension is taken from the URL path only, ignoring the query string.
        media_path = compat_urllib_parse_urlparse(media_url).path
        extension = media_path.rpartition('.')[2]

        # Optional nested objects default to {} so the chained .get() is safe.
        host = show.get('host', {})
        image = show.get('image', {})

        return [{
            'id': show_id,
            'url': media_url,
            'ext': extension,
            'title': show['title'],
            'description': show.get('teaser_text'),
            'location': show.get('country_of_origin'),
            'uploader': host.get('name'),
            'uploader_id': host.get('slug'),
            'thumbnail': image.get('large_url_2x'),
            'duration': show.get('duration'),
        }]
class YouPornIE(InfoExtractor):
    """Information extractor for youporn.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'

    def _print_formats(self, formats):
        """Print all available formats"""
        print(u'Available formats:')
        print(u'ext\t\tformat')
        print(u'---------------------------------')
        for entry in formats:
            print(u'%s\t\t%s' % (entry['ext'], entry['format']))

    def _specific(self, req_format, formats):
        """Return the first entry whose 'format' equals req_format, else None."""
        for entry in formats:
            if entry["format"] == req_format:
                return entry
        return None

    def _real_extract(self, url):
        """Return the info dict(s) for the requested format.

        Returns None after printing the table when 'listformats' is set, a
        one-element list for best / worst / a named format, or every format
        for '-1' / 'all'. Raises ExtractorError for an invalid URL, bad or
        incomplete page JSON, a page without download links, or an
        unavailable requested format.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('videoid')

        req = compat_urllib_request.Request(url)
        req.add_header('Cookie', 'age_verified=1')
        webpage = self._download_webpage(req, video_id)

        # Get JSON parameters
        json_params = self._search_regex(r'var currentVideo = new Video\((.*)\);', webpage, u'JSON parameters')
        try:
            params = json.loads(json_params)
        except ValueError:
            raise ExtractorError(u'Invalid JSON')

        self.report_extraction(video_id)
        try:
            video_title = params['title']
            upload_date = unified_strdate(params['release_date_f'])
            video_description = params['description']
            video_uploader = params['submitted_by']
            thumbnail = params['thumbnails'][0]['image']
        except KeyError as err:
            # The exception must be converted to text first; concatenating
            # it directly onto a str raised TypeError inside this handler.
            raise ExtractorError('Missing JSON parameter: ' + compat_str(err))

        # Get all of the formats available
        DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
        download_list_html = self._search_regex(DOWNLOAD_LIST_RE,
            webpage, u'download list').strip()

        # Get all of the links from the page
        LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
        links = re.findall(LINK_RE, download_list_html)
        if not links:
            raise ExtractorError(u'ERROR: no known formats available for video')

        self.to_screen(u'Links found: %d' % len(links))

        formats = []
        for link in links:
            # A link looks like this:
            # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
            # A path looks like this:
            # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
            video_url = unescapeHTML(link)
            path = compat_urllib_parse_urlparse(video_url).path
            extension = os.path.splitext(path)[1][1:]
            # Fifth path component is "<size>_<bitrate>_<id>"; the format
            # label is "<size>-<bitrate>".
            video_format = "-".join(path.split('/')[4].split('_')[:2])

            formats.append({
                'id': video_id,
                'url': video_url,
                'uploader': video_uploader,
                'upload_date': upload_date,
                'title': video_title,
                'ext': extension,
                'format': video_format,
                'thumbnail': thumbnail,
                'description': video_description
            })

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        req_format = self._downloader.params.get('format', None)
        self.to_screen(u'Format: %s' % req_format)

        # Links are taken in page order: first entry for best, last for worst.
        if req_format is None or req_format == 'best':
            return [formats[0]]
        elif req_format == 'worst':
            return [formats[-1]]
        elif req_format in ('-1', 'all'):
            return formats
        else:
            chosen = self._specific(req_format, formats)
            if chosen is None:
                raise ExtractorError(u'Requested format not available')
            return [chosen]
class PornotubeIE(InfoExtractor):
    """Information extractor for pornotube.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'

    def _real_extract(self, url):
        """Return a one-element info list; id and title come from the URL.

        Raises ExtractorError when the URL does not match _VALID_URL.
        """
        match = re.match(self._VALID_URL, url)
        if match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = match.group('videoid')
        title = match.group('title')

        # Get webpage content
        page = self._download_webpage(url, video_id)

        # Get the video URL
        VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
        media_url = compat_urllib_parse.unquote(
            self._search_regex(VIDEO_URL_RE, page, u'video url'))

        #Get the uploaded date
        VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
        added = self._html_search_regex(VIDEO_UPLOADED_RE, page, u'upload date', fatal=False)
        # Only normalise the date when the non-fatal lookup found one.
        if added:
            added = unified_strdate(added)

        return [{
            'id': video_id,
            'url': media_url,
            'uploader': None,
            'upload_date': added,
            'title': title,
            'ext': 'flv',
            'format': 'flv',
        }]
class YouJizzIE(InfoExtractor):
    """Information extractor for youjizz.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'

    def _real_extract(self, url):
        """Return a one-element info list, following the page's embed link.

        Raises ExtractorError for an invalid URL or when the page holds no
        embed link.
        """
        match = re.match(self._VALID_URL, url)
        if match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # Get webpage content
        page = self._download_webpage(url, match.group('videoid'))

        # Get the video title
        title = self._html_search_regex(r'<title>(?P<title>.*)</title>',
            page, u'title').strip()

        # Get the embed page
        embed = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', page)
        if embed is None:
            raise ExtractorError(u'ERROR: unable to extract embed page')

        embed_url = embed.group(0).strip()
        # The numeric id from the embed link replaces the id from the URL.
        video_id = embed.group('videoid')

        embed_page = self._download_webpage(embed_url, video_id)

        # Get the video URL
        media_url = self._search_regex(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);',
            embed_page, u'video URL')

        return [{
            'id': video_id,
            'url': media_url,
            'title': title,
            'ext': 'flv',
            'format': 'flv',
            'player_url': embed_url,
        }]
class EightTracksIE(InfoExtractor):
    IE_NAME = '8tracks'
    _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'

    def _real_extract(self, url):
        """Return one info dict per track of a mix.

        Tracks are requested one at a time through the play/next API until
        the response flags the last track. Raises ExtractorError when the
        URL does not match _VALID_URL.
        """
        match = re.match(self._VALID_URL, url)
        if match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        playlist_id = match.group('id')

        page = self._download_webpage(url, playlist_id)

        mix_json = self._search_regex(r"PAGE.mix = (.*?);\n", page, u'trax information', flags=re.DOTALL)
        mix = json.loads(mix_json)

        # A random session token is placed in every API URL.
        session = str(random.randint(0, 1000000000))
        mix_id = mix['id']
        track_count = mix['tracks_count']

        tracks = []
        position = 0
        api_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
        while True:
            position += 1
            api_json = self._download_webpage(api_url, playlist_id,
                note=u'Downloading song information %s/%s' % (str(position), track_count),
                errnote=u'Failed to download song information')
            track_set = json.loads(api_json)
            track = track_set[u'set']['track']
            tracks.append({
                'id': track['id'],
                'url': track['track_file_stream_url'],
                'title': track['performer'] + u' - ' + track['name'],
                'raw_title': track['name'],
                'uploader_id': mix['user']['login'],
                'ext': 'm4a',
            })
            if track_set['set']['at_last_track']:
                break
            # The follow-up request names the track that was just received.
            api_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track['id'])
        return tracks
class KeekIE(InfoExtractor):
    _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
    IE_NAME = u'keek'

    def _real_extract(self, url):
        """Return a one-element info list for a keek.com video.

        Media and thumbnail URLs are derived from the id; title and
        uploader are scraped from the page.
        """
        video_id = re.match(self._VALID_URL, url).group('videoID')

        page = self._download_webpage(url, video_id)

        title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
            page, u'title')

        # Uploader lookup is non-fatal.
        uploader = self._html_search_regex(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>',
            page, u'uploader', fatal=False)

        return [{
            'id': video_id,
            'url': u'http://cdn.keek.com/keek/video/%s' % video_id,
            'ext': 'mp4',
            'title': title,
            'thumbnail': u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id,
            'uploader': uploader
        }]
class TEDIE(InfoExtractor):
    _VALID_URL=r'''http://www\.ted\.com/
                   (
                        ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
                        |
                        ((?P<type_talk>talks)) # We have a simple talk
                   )
                   (/lang/(.*?))? # The url may contain the language
                   /(?P<name>\w+) # Here goes the name and then ".html"
                   '''

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is written in verbose syntax, hence the explicit flag.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        """Dispatch to talk or playlist extraction; returns a one-element list."""
        match = re.match(self._VALID_URL, url, re.VERBOSE)
        if match.group('type_talk'):
            return [self._talk_info(url)]

        playlist_id = match.group('playlist_id')
        name = match.group('name')
        self.to_screen(u'Getting info of playlist %s: "%s"' % (playlist_id,name))
        return [self._playlist_videos_info(url, name, playlist_id)]

    def _playlist_videos_info(self,url,name,playlist_id=0):
        '''Returns the videos of the playlist'''
        video_RE=r'''
                     <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
                     ([.\s]*?)data-playlist_item_id="(\d+)"
                     ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
                     '''
        video_name_RE=r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
        page = self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
        item_matches = re.finditer(video_RE, page, re.VERBOSE)
        name_matches = re.finditer(video_name_RE, page)

        playlist_title = self._html_search_regex(r'div class="headline">\s*?<h1>\s*?<span>(.*?)</span>',
                                                 page, 'playlist title')

        # Items and names are paired positionally; zip stops at the shorter
        # stream. Only the talk link from the name match is used.
        entries = [
            self.url_result('http://www.ted.com%s' % name_match.group('talk_url'), 'TED')
            for _item, name_match in zip(item_matches, name_matches)
        ]
        return self.playlist_result(entries, playlist_id = playlist_id, playlist_title = playlist_title)

    def _talk_info(self, url, video_id=0):
        """Return the video for the talk in the url"""
        talk_name = re.match(self._VALID_URL, url, re.VERBOSE).group('name')
        page = self._download_webpage(url, video_id, 'Downloading \"%s\" page' % talk_name)
        self.report_extraction(talk_name)
        # If the url includes the language we get the title translated
        title = self._html_search_regex(r'<span id="altHeadline" >(?P<title>.*)</span>',
                                        page, 'title')
        details_json = self._search_regex(r'<script.*?>var talkDetails = ({.*?})</script>',
                                          page, 'json data')
        details = json.loads(details_json)
        description = self._html_search_regex(r'<div class="talk-intro">.*?<p.*?>(.*?)</p>',
                                              page, 'description', flags = re.DOTALL)

        thumbnail = self._search_regex(r'</span>[\s.]*</div>[\s.]*<img src="(.*?)"',
                                       page, 'thumbnail')
        return {
            'id': details['id'],
            # The last entry of 'htmlStreams' is the one used.
            'url': details['htmlStreams'][-1]['file'],
            'ext': 'mp4',
            'title': title,
            'thumbnail': thumbnail,
            'description': description,
        }
class MySpassIE(InfoExtractor):
    _VALID_URL = r'http://www.myspass.de/.*'

    def _real_extract(self, url):
        """Return a one-element info list built from the metadata XML.

        Raises ExtractorError when the metadata lacks the download URL or
        the title.
        """
        META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'

        # video id is the last path element of the URL
        # usually there is a trailing slash, so also try the second but last
        url_path = compat_urllib_parse_urlparse(url).path
        url_parent_path, video_id = os.path.split(url_path)
        if not video_id:
            _, video_id = os.path.split(url_parent_path)

        # get metadata
        metadata_url = META_DATA_URL_TEMPLATE % video_id
        metadata_text = self._download_webpage(metadata_url, video_id)
        metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))

        # extract values from metadata
        url_flv_el = metadata.find('url_flv')
        if url_flv_el is None:
            raise ExtractorError(u'Unable to extract download url')
        video_url = url_flv_el.text
        extension = os.path.splitext(video_url)[1][1:]

        title_el = metadata.find('title')
        if title_el is None:
            raise ExtractorError(u'Unable to extract title')
        title = title_el.text

        # Optional elements: use the element text when present. The format
        # falls back to the file extension; description and thumbnail fall
        # back to None.
        format_id_el = metadata.find('format_id')
        video_format = extension if format_id_el is None else format_id_el.text

        description_el = metadata.find('description')
        description = description_el.text if description_el is not None else None

        imagePreview_el = metadata.find('imagePreview')
        thumbnail = imagePreview_el.text if imagePreview_el is not None else None

        info = {
            'id': video_id,
            'url': video_url,
            'title': title,
            'ext': extension,
            'format': video_format,
            'thumbnail': thumbnail,
            'description': description
        }
        return [info]
# Extractor for spiegel.de video pages. The numeric id is the trailing number
# of the URL slug (named group 'videoID'); the title comes from the HTML page
# and the media file name and duration from a per-video XML document.
# NOTE(review): the embedded line numbers in this listing are not consecutive,
# so some lines (e.g. the continuation of the title lookup and the head of the
# result dict) are not visible here.
1091 class SpiegelIE(InfoExtractor):
1092 _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'
1094 def _real_extract(self, url):
# NOTE(review): the match object is used without a None check, so a URL that
# does not match _VALID_URL would raise AttributeError on the next line.
1095 m = re.match(self._VALID_URL, url)
1096 video_id = m.group('videoID')
1098 webpage = self._download_webpage(url, video_id)
1100 video_title = self._html_search_regex(r'<div class="module-title">(.*?)</div>',
# The XML document for this id lives under video2.spiegel.de/flash/.
1103 xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
1104 xml_code = self._download_webpage(xml_url, video_id,
1105 note=u'Downloading XML', errnote=u'Failed to download XML')
# Only the last child of the XML root is used: the text of its first
# <filename> and <duration> children gives the file name and the duration
# (converted to float).
1107 idoc = xml.etree.ElementTree.fromstring(xml_code)
1108 last_type = idoc[-1]
1109 filename = last_type.findall('./filename')[0].text
1110 duration = float(last_type.findall('./duration')[0].text)
# The extension is the text after the last dot of the file name (the whole
# name when it contains no dot).
1112 video_url = 'http://video2.spiegel.de/flash/' + filename
1113 video_ext = filename.rpartition('.')[2]
1118 'title': video_title,
1119 'duration': duration,
# Extractor for liveleak.com 'view?i=<id>' pages: the media URL is read from a
# 'file: "..."' entry in the page, title and description from Open Graph meta
# tags and the uploader from the text following 'By:'.
# NOTE(review): the embedded line numbers in this listing are not consecutive,
# so some lines (e.g. the condition guarding the 'Invalid URL' raise and the
# head of the result dict) are not visible here.
1123 class LiveLeakIE(InfoExtractor):
# NOTE(review): in '(?:http?://)?' the '?' applies to the 'p', so this prefix
# matches 'http://' or 'htt://' but never 'https://'; 'https?' was presumably
# intended -- confirm.
1125 _VALID_URL = r'^(?:http?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
1126 IE_NAME = u'liveleak'
1128 def _real_extract(self, url):
1129 mobj = re.match(self._VALID_URL, url)
1131 raise ExtractorError(u'Invalid URL: %s' % url)
1133 video_id = mobj.group('video_id')
1135 webpage = self._download_webpage(url, video_id)
1137 video_url = self._search_regex(r'file: "(.*?)",',
1138 webpage, u'video URL')
# The text 'LiveLeak.com -' is removed from the og:title value and the result
# is stripped of surrounding whitespace.
1140 video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
1141 webpage, u'title').replace('LiveLeak.com -', '').strip()
# Description and uploader are looked up with fatal=False -- presumably a miss
# is tolerated rather than raised; confirm against the InfoExtractor helper.
1143 video_description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
1144 webpage, u'description', fatal=False)
1146 video_uploader = self._html_search_regex(r'By:.*?(\w+)</a>',
1147 webpage, u'uploader', fatal=False)
1153 'title': video_title,
1154 'description': video_description,
1155 'uploader': video_uploader
# Extractor for tumblr.com post/video URLs: finds the video file URL and its
# type in the post page, plus a poster thumbnail and the page title.
# NOTE(review): the embedded line numbers in this listing are not consecutive,
# so some lines (e.g. the condition guarding the 'Unable to extract video'
# raise and parts of the returned dict) are not visible here.
1162 class TumblrIE(InfoExtractor):
1163 _VALID_URL = r'http://(?P<blog_name>.*?)\.tumblr\.com/((post)|(video))/(?P<id>\d*)/(.*?)'
1165 def _real_extract(self, url):
# NOTE(review): the match object is used without a None check.
1166 m_url = re.match(self._VALID_URL, url)
1167 video_id = m_url.group('id')
1168 blog = m_url.group('blog_name')
# A '/post/<id>/' URL is rebuilt from blog name and id and that page is
# downloaded, whichever of the two URL forms accepted by _VALID_URL was given.
1170 url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id)
1171 webpage = self._download_webpage(url, video_id)
# The pattern looks for src/type attributes whose quotes appear in the page as
# the literal text \x22.
# NOTE(review): blog and video_id are interpolated into the pattern without
# re.escape.
1173 re_video = r'src=\\x22(?P<video_url>http://%s\.tumblr\.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P<ext>.*?)\\x22' % (blog, video_id)
1174 video = re.search(re_video, webpage)
1176 raise ExtractorError(u'Unable to extract video')
1177 video_url = video.group('video_url')
1178 ext = video.group('ext')
1180 video_thumbnail = self._search_regex(r'posters(.*?)\[\\x22(?P<thumb>.*?)\\x22',
1181 webpage, u'thumbnail', fatal=False) # We pick the first poster
# When a thumbnail value was found, all backslashes are removed from it.
1182 if video_thumbnail: video_thumbnail = video_thumbnail.replace('\\', '')
1184 # The only place where you can get a title, it's not complete,
1185 # but searching in other places doesn't work for all videos
1186 video_title = self._html_search_regex(r'<title>(?P<title>.*?)</title>',
1187 webpage, u'title', flags=re.DOTALL)
1189 return [{'id': video_id,
1191 'title': video_title,
1192 'thumbnail': video_thumbnail,
# Extractor for bandcamp.com track pages that offer a free download. It
# follows the page's 'freeDownloadPage' link, reads the track dictionary from
# that page, and asks the 'statdownload' endpoint for the final mp3-320 URL.
# NOTE(review): the embedded line numbers in this listing are not consecutive,
# so some lines (e.g. parts of the track_info dict) are not visible here.
1196 class BandcampIE(InfoExtractor):
1197 _VALID_URL = r'http://.*?\.bandcamp\.com/track/(?P<title>.*)'
1199 def _real_extract(self, url):
# NOTE(review): the match object is used without a None check.
1200 mobj = re.match(self._VALID_URL, url)
1201 title = mobj.group('title')
1202 webpage = self._download_webpage(url, title)
1203 # We get the link to the free download page
1204 m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage)
1205 if m_download is None:
1206 raise ExtractorError(u'No free songs found')
1208 download_link = m_download.group(1)
# NOTE(review): 'id' shadows the builtin, and .group() is called directly on
# the re.search() result, so a page without this snippet raises AttributeError.
1209 id = re.search(r'var TralbumData = {(.*?)id: (?P<id>\d*?)$',
1210 webpage, re.MULTILINE|re.DOTALL).group('id')
1212 download_webpage = self._download_webpage(download_link, id,
1213 'Downloading free downloads page')
1214 # We get the dictionary of the track from some JavaScript code
# The text captured after 'items: ' is parsed as JSON and its first element is
# used. NOTE(review): the search result is not checked for None.
1215 info = re.search(r'items: (.*?),$',
1216 download_webpage, re.MULTILINE).group(1)
1217 info = json.loads(info)[0]
1218 # We pick mp3-320 for now, until format selection can be easily implemented.
1219 mp3_info = info[u'downloads'][u'mp3-320']
1220 # If we try to use this url it says the link has expired
1221 initial_url = mp3_info[u'url']
# The initial URL is taken apart into server, fsig and ts so that they can be
# reused in the request URL below. NOTE(review): m_url is not checked for None.
1222 re_url = r'(?P<server>http://(.*?)\.bandcamp\.com)/download/track\?enc=mp3-320&fsig=(?P<fsig>.*?)&id=(?P<id>.*?)&ts=(?P<ts>.*)$'
1223 m_url = re.match(re_url, initial_url)
1224 # We build the url we will use to get the final track url
1225 # This url is built in Bandcamp in the script download_bunde_*.js
# The '.rand' value is a fixed constant here (see the comment further down).
1226 request_url = '%s/statdownload/track?enc=mp3-320&fsig=%s&id=%s&ts=%s&.rand=665028774616&.vrs=1' % (m_url.group('server'), m_url.group('fsig'), id, m_url.group('ts'))
1227 final_url_webpage = self._download_webpage(request_url, id, 'Requesting download url')
1228 # If we could correctly generate the .rand field the url would be
1229 # in the "download_url" key
# NOTE(review): the search result is not checked for None.
1230 final_url = re.search(r'"retry_url":"(.*?)"', final_url_webpage).group(1)
1232 track_info = {'id':id,
1233 'title' : info[u'title'],
1236 'thumbnail' : info[u'thumb_url'],
1237 'uploader' : info[u'artist']
# Reads the mp4 <source> URL and the <h1> title from the video page; the
# extension is fixed to 'mp4'.
# NOTE(review): the embedded line numbers in this listing are not consecutive,
# so some lines (e.g. the condition guarding the 'Invalid URL' raise, the
# continuation of the title lookup and the head of the result dict) are not
# visible here.
1242 class RedTubeIE(InfoExtractor):
1243 """Information Extractor for redtube"""
1244 _VALID_URL = r'(?:http://)?(?:www\.)?redtube\.com/(?P<id>[0-9]+)'
1246 def _real_extract(self,url):
1247 mobj = re.match(self._VALID_URL, url)
1249 raise ExtractorError(u'Invalid URL: %s' % url)
1251 video_id = mobj.group('id')
1252 video_extension = 'mp4'
1253 webpage = self._download_webpage(url, video_id)
1255 self.report_extraction(video_id)
1257 video_url = self._html_search_regex(r'<source src="(.+?)" type="video/mp4">',
1258 webpage, u'video URL')
# NOTE(review): unlike the other patterns in this class, this one is not a raw
# string; it contains no backslashes, so the value is the same either way.
1260 video_title = self._html_search_regex('<h1 class="videoTitle slidePanelMovable">(.+?)</h1>',
1266 'ext': video_extension,
1267 'title': video_title,
# Takes the id (an 'I' followed by digits) from the URL and downloads the
# player's '<id>.mrss' document rather than the URL given; the media URL and
# the CDATA title are read from that document. The extension is fixed to 'mp4'.
# NOTE(review): the embedded line numbers in this listing are not consecutive,
# so some lines (e.g. the continuation of the title lookup and the head of the
# result dict) are not visible here.
1270 class InaIE(InfoExtractor):
1271 """Information Extractor for Ina.fr"""
1272 _VALID_URL = r'(?:http://)?(?:www\.)?ina\.fr/video/(?P<id>I[0-9]+)/.*'
1274 def _real_extract(self,url):
1275 mobj = re.match(self._VALID_URL, url)
1277 video_id = mobj.group('id')
1278 mrss_url='http://player.ina.fr/notices/%s.mrss' % video_id
1279 video_extension = 'mp4'
1280 webpage = self._download_webpage(mrss_url, video_id)
1282 self.report_extraction(video_id)
# NOTE(review): the dots in 'mp4.ina.fr' are unescaped in this pattern and so
# match any character.
1284 video_url = self._html_search_regex(r'<media:player url="(?P<mp4url>http://mp4.ina.fr/[^"]+\.mp4)',
1285 webpage, u'video URL')
1287 video_title = self._search_regex(r'<title><!\[CDATA\[(?P<titre>.*?)]]></title>',
1293 'ext': video_extension,
1294 'title': video_title,
# Downloads 'http://www.howcast.com/videos/<id>' (rebuilt from the numeric id,
# not the URL given) and reads the mobile-media mp4 URL, the og:title, the
# description and the og:image thumbnail from it.
# NOTE(review): the embedded line numbers in this listing are not consecutive,
# so some lines (e.g. the continuation of the title lookup and the head of the
# result dict) are not visible here.
1297 class HowcastIE(InfoExtractor):
1298 """Information Extractor for Howcast.com"""
1299 _VALID_URL = r'(?:https?://)?(?:www\.)?howcast\.com/videos/(?P<id>\d+)'
1301 def _real_extract(self, url):
1302 mobj = re.match(self._VALID_URL, url)
1304 video_id = mobj.group('id')
1305 webpage_url = 'http://www.howcast.com/videos/' + video_id
1306 webpage = self._download_webpage(webpage_url, video_id)
1308 self.report_extraction(video_id)
# The 'file' key may or may not be wrapped in single quotes in the page.
1310 video_url = self._search_regex(r'\'?file\'?: "(http://mobile-media\.howcast\.com/[0-9]+\.mp4)',
1311 webpage, u'video URL')
# Title and description patterns accept a content attribute in either double
# or single quotes; the thumbnail pattern accepts single quotes only.
1313 video_title = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') property=\'og:title\'',
1316 video_description = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') name=\'description\'',
1317 webpage, u'description', fatal=False)
1319 thumbnail = self._html_search_regex(r'<meta content=\'(.+?)\' property=\'og:image\'',
1320 webpage, u'thumbnail', fatal=False)
1326 'title': video_title,
1327 'description': video_description,
1328 'thumbnail': thumbnail,
# Downloads 'https://vine.co/v/<id>' (rebuilt from the id, not the URL given)
# and reads the stream URL, title and thumbnail from meta tags and the
# uploader from the page's 'user' block.
# NOTE(review): the embedded line numbers in this listing are not consecutive,
# so some lines (e.g. the continuation of the title lookup and the head of the
# result dict) are not visible here.
1331 class VineIE(InfoExtractor):
1332 """Information Extractor for Vine.co"""
1333 _VALID_URL = r'(?:https?://)?(?:www\.)?vine\.co/v/(?P<id>\w+)'
1335 def _real_extract(self, url):
1336 mobj = re.match(self._VALID_URL, url)
1338 video_id = mobj.group('id')
1339 webpage_url = 'https://vine.co/v/' + video_id
1340 webpage = self._download_webpage(webpage_url, video_id)
1342 self.report_extraction(video_id)
1344 video_url = self._html_search_regex(r'<meta property="twitter:player:stream" content="(.+?)"',
1345 webpage, u'video URL')
1347 video_title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',
# The first group of the thumbnail pattern stops before an optional
# '?query' suffix, which is captured separately by the second group.
1350 thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)(\?.*?)?"',
1351 webpage, u'thumbnail', fatal=False)
# re.DOTALL lets '.*?' span line breaks between the 'user' div and its <h2>.
1353 uploader = self._html_search_regex(r'<div class="user">.*?<h2>(.+?)</h2>',
1354 webpage, u'uploader', fatal=False, flags=re.DOTALL)
1360 'title': video_title,
1361 'thumbnail': thumbnail,
1362 'uploader': uploader,
# Resolving the stream takes three downloads: the photo page (for the
# 'photo_secret'), a first XML document (for the node id), and a playlist XML
# whose <STREAM> element carries the two parts of the video URL.
# NOTE(review): the embedded line numbers in this listing are not consecutive,
# so some lines (e.g. the condition guarding the 'Unable to extract video url'
# raise and the head of the result dict) are not visible here.
1365 class FlickrIE(InfoExtractor):
1366 """Information Extractor for Flickr videos"""
1367 _VALID_URL = r'(?:https?://)?(?:www\.)?flickr\.com/photos/(?P<uploader_id>[\w\-_@]+)/(?P<id>\d+).*'
1369 def _real_extract(self, url):
1370 mobj = re.match(self._VALID_URL, url)
1372 video_id = mobj.group('id')
1373 video_uploader_id = mobj.group('uploader_id')
# The page URL is rebuilt from uploader id and video id; the URL given is not
# downloaded as-is.
1374 webpage_url = 'http://www.flickr.com/photos/' + video_uploader_id + '/' + video_id
1375 webpage = self._download_webpage(webpage_url, video_id)
1377 secret = self._search_regex(r"photo_secret: '(\w+)'", webpage, u'secret')
1379 first_url = 'https://secure.flickr.com/apps/video/video_mtl_xml.gne?v=x&photo_id=' + video_id + '&secret=' + secret + '&bitrate=700&target=_self'
1380 first_xml = self._download_webpage(first_url, video_id, 'Downloading first data webpage')
1382 node_id = self._html_search_regex(r'<Item id="id">(\d+-\d+)</Item>',
1383 first_xml, u'node_id')
1385 second_url = 'https://secure.flickr.com/video_playlist.gne?node_id=' + node_id + '&tech=flash&mode=playlist&bitrate=700&secret=' + secret + '&rd=video.yahoo.com&noad=1'
1386 second_xml = self._download_webpage(second_url, video_id, 'Downloading second data webpage')
1388 self.report_extraction(video_id)
# The video URL is the APP attribute followed by the HTML-unescaped FULLPATH
# attribute of the <STREAM> element.
1390 mobj = re.search(r'<STREAM APP="(.+?)" FULLPATH="(.+?)"', second_xml)
1392 raise ExtractorError(u'Unable to extract video url')
1393 video_url = mobj.group(1) + unescapeHTML(mobj.group(2))
# Title, description and thumbnail come from Open Graph meta tags of the photo
# page; each pattern accepts a double- or single-quoted content attribute.
1395 video_title = self._html_search_regex(r'<meta property="og:title" content=(?:"([^"]+)"|\'([^\']+)\')',
1396 webpage, u'video title')
1398 video_description = self._html_search_regex(r'<meta property="og:description" content=(?:"([^"]+)"|\'([^\']+)\')',
1399 webpage, u'description', fatal=False)
1401 thumbnail = self._html_search_regex(r'<meta property="og:image" content=(?:"([^"]+)"|\'([^\']+)\')',
1402 webpage, u'thumbnail', fatal=False)
1408 'title': video_title,
1409 'description': video_description,
1410 'thumbnail': thumbnail,
1411 'uploader_id': video_uploader_id,
# Extractor for teamcoco.com video pages: the numeric video id is read from
# the page's <article class="video" data-id="..."> tag, title, thumbnail and
# description from Open Graph meta tags, and a per-id XML document is
# downloaded for the media file.
# NOTE(review): the embedded line numbers in this listing are not consecutive,
# so some lines (e.g. the condition guarding the 'Invalid URL' raise, the
# continuations of the title and video URL lookups and the head of the result
# dict) are not visible here.
1414 class TeamcocoIE(InfoExtractor):
1415 _VALID_URL = r'http://teamcoco\.com/video/(?P<url_title>.*)'
1417 def _real_extract(self, url):
1418 mobj = re.match(self._VALID_URL, url)
1420 raise ExtractorError(u'Invalid URL: %s' % url)
# The URL slug is only used as the identifier passed to the first download;
# the real video id is taken from the page itself.
1421 url_title = mobj.group('url_title')
1422 webpage = self._download_webpage(url, url_title)
1424 video_id = self._html_search_regex(r'<article class="video" data-id="(\d+?)"',
1425 webpage, u'video id')
1427 self.report_extraction(video_id)
1429 video_title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',
1432 thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)"',
1433 webpage, u'thumbnail', fatal=False)
1435 video_description = self._html_search_regex(r'<meta property="og:description" content="(.*?)"',
1436 webpage, u'description', fatal=False)
1438 data_url = 'http://teamcoco.com/cvp/2.0/%s.xml' % video_id
1439 data = self._download_webpage(data_url, video_id, 'Downloading data webpage')
# Looks for the <file type="high"> entry -- presumably searched in 'data'; the
# continuation line carrying the searched text is not visible here.
1441 video_url = self._html_search_regex(r'<file type="high".*?>(.*?)</file>',
1448 'title': video_title,
1449 'thumbnail': thumbnail,
1450 'description': video_description,
# Reads the media location from the 'srv'/'file' pair embedded in the movie
# page, plus title, upload date, uploader id and thumbnail.
# NOTE(review): the embedded line numbers in this listing are not consecutive,
# so some lines (e.g. the conditions guarding the raise and the date fallback,
# the 'else' of the server test and the head of the result dict) are not
# visible here.
1453 class XHamsterIE(InfoExtractor):
1454 """Information Extractor for xHamster"""
# NOTE(review): the dot in '(?:www.)?' is unescaped and matches any character.
1455 _VALID_URL = r'(?:http://)?(?:www.)?xhamster\.com/movies/(?P<id>[0-9]+)/.*\.html'
1457 def _real_extract(self,url):
1458 mobj = re.match(self._VALID_URL, url)
1460 video_id = mobj.group('id')
# NOTE(review): despite its name, 'mrss_url' is the URL of the movie's HTML
# page, rebuilt from the id.
1461 mrss_url = 'http://xhamster.com/movies/%s/.html' % video_id
1462 webpage = self._download_webpage(mrss_url, video_id)
1464 mobj = re.search(r'\'srv\': \'(?P<server>[^\']*)\',\s*\'file\': \'(?P<file>[^\']+)\',', webpage)
1466 raise ExtractorError(u'Unable to extract media URL')
# With an empty 'srv' value the URL-unquoted 'file' value is the media URL;
# the other visible assignment joins server and file with '/key='.
1467 if len(mobj.group('server')) == 0:
1468 video_url = compat_urllib_parse.unquote(mobj.group('file'))
1470 video_url = mobj.group('server')+'/key='+mobj.group('file')
# The extension is the text after the last dot of the media URL.
1471 video_extension = video_url.split('.')[-1]
1473 video_title = self._html_search_regex(r'<title>(?P<title>.+?) - xHamster\.com</title>',
1476 # Can't see the description anywhere in the UI
1477 # video_description = self._html_search_regex(r'<span>Description: </span>(?P<description>[^<]+)',
1478 # webpage, u'description', fatal=False)
1479 # if video_description: video_description = unescapeHTML(video_description)
# The upload date is taken from a "hint='YYYY-MM-DD hh:mm:ss TZ'" attribute
# and stored as the concatenation YYYYMMDD; the visible fallback sets it to
# None and reports a warning.
1481 mobj = re.search(r'hint=\'(?P<upload_date_Y>[0-9]{4})-(?P<upload_date_m>[0-9]{2})-(?P<upload_date_d>[0-9]{2}) [0-9]{2}:[0-9]{2}:[0-9]{2} [A-Z]{3,4}\'', webpage)
1483 video_upload_date = mobj.group('upload_date_Y')+mobj.group('upload_date_m')+mobj.group('upload_date_d')
1485 video_upload_date = None
1486 self._downloader.report_warning(u'Unable to extract upload date')
# u'anonymous' is passed as the default for the uploader id lookup.
1488 video_uploader_id = self._html_search_regex(r'<a href=\'/user/[^>]+>(?P<uploader_id>[^<]+)',
1489 webpage, u'uploader id', default=u'anonymous')
1491 video_thumbnail = self._search_regex(r'\'image\':\'(?P<thumbnail>[^\']+)\'',
1492 webpage, u'thumbnail', fatal=False)
1497 'ext': video_extension,
1498 'title': video_title,
1499 # 'description': video_description,
1500 'upload_date': video_upload_date,
1501 'uploader_id': video_uploader_id,
1502 'thumbnail': video_thumbnail
# Requests the track page with extra query parameters, keeps the cookie the
# response sets, reads the track entry from JSON embedded in the page, then
# requests 'serve/source/<id>/<key>' with that cookie to obtain the final URL.
# NOTE(review): the embedded line numbers in this listing are not consecutive,
# so some lines (e.g. the conditions or try/except around the raises, and the
# result dict) are not visible here.
1505 class HypemIE(InfoExtractor):
1506 """Information Extractor for hypem"""
1507 _VALID_URL = r'(?:http://)?(?:www\.)?hypem\.com/track/([^/]+)/([^/]+)'
1509 def _real_extract(self, url):
1510 mobj = re.match(self._VALID_URL, url)
1512 raise ExtractorError(u'Invalid URL: %s' % url)
1513 track_id = mobj.group(1)
# 'ax=1' and the current time as 'ts' are appended to the URL as a query
# string before the page is requested.
1515 data = { 'ax': 1, 'ts': time.time() }
1516 data_encoded = compat_urllib_parse.urlencode(data)
1517 complete_url = url + "?" + data_encoded
1518 request = compat_urllib_request.Request(complete_url)
1519 response, urlh = self._download_webpage_handle(request, track_id, u'Downloading webpage with the url')
# The Set-Cookie header of this response (or '' when absent) is sent back with
# the metadata request below.
1520 cookie = urlh.headers.get('Set-Cookie', '')
1522 self.report_extraction(track_id)
# The track list is JSON embedded in a <script id="displayList-data"> tag;
# only its first track is used.
1524 html_tracks = self._html_search_regex(r'<script type="application/json" id="displayList-data">(.*?)</script>',
1525 response, u'tracks', flags=re.MULTILINE|re.DOTALL).strip()
1527 track_list = json.loads(html_tracks)
1528 track = track_list[u'tracks'][0]
1530 raise ExtractorError(u'Hypemachine contained invalid JSON.')
# track_id is replaced by the id stored in the track entry.
1533 track_id = track[u"id"]
1534 artist = track[u"artist"]
1535 title = track[u"song"]
# NOTE(review): 'key' is not assigned anywhere in the visible lines --
# presumably it is read from the track entry on a line missing here; confirm.
1537 serve_url = "http://hypem.com/serve/source/%s/%s" % (compat_str(track_id), compat_str(key))
# The request carries an empty string as its body and a JSON content type.
1538 request = compat_urllib_request.Request(serve_url, "" , {'Content-Type': 'application/json'})
1539 request.add_header('cookie', cookie)
1540 song_data_json = self._download_webpage(request, track_id, u'Downloading metadata')
1542 song_data = json.loads(song_data_json)
1544 raise ExtractorError(u'Hypemachine contained invalid JSON.')
1545 final_url = song_data[u"url"]
# Follows the JavaScript redirect on the play page to get the title, then
# queries 'play/magare.do' with the video id to obtain the media URL and the
# thumbnail URL.
# NOTE(review): the embedded line numbers in this listing are not consecutive,
# so some lines (e.g. the condition guarding the 'Invalid URL' raise and most
# of the result dict) are not visible here.
1555 class Vbox7IE(InfoExtractor):
1556 """Information Extractor for Vbox7"""
1557 _VALID_URL = r'(?:http://)?(?:www\.)?vbox7\.com/play:([^/]+)'
1559 def _real_extract(self,url):
1560 mobj = re.match(self._VALID_URL, url)
1562 raise ExtractorError(u'Invalid URL: %s' % url)
1563 video_id = mobj.group(1)
# The first page sets window.location from script; that location is appended
# to the URL actually reached by the first request and downloaded in turn.
1565 redirect_page, urlh = self._download_webpage_handle(url, video_id)
1566 new_location = self._search_regex(r'window\.location = \'(.*)\';', redirect_page, u'redirect location')
1567 redirect_url = urlh.geturl() + new_location
1568 webpage = self._download_webpage(redirect_url, video_id, u'Downloading redirect page')
# The title is the part of <title> before the first '/', stripped.
1570 title = self._html_search_regex(r'<title>(.*)</title>',
1571 webpage, u'title').split('/')[0].strip()
# The info request carries 'as3=1' and the video id as a form-encoded body.
1574 info_url = "http://vbox7.com/play/magare.do"
1575 data = compat_urllib_parse.urlencode({'as3':'1','vid':video_id})
1576 info_request = compat_urllib_request.Request(info_url, data)
1577 info_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
1578 info_response = self._download_webpage(info_request, video_id, u'Downloading info webpage')
1579 if info_response is None:
1580 raise ExtractorError(u'Unable to extract the media url')
# The response must consist of exactly two '&'-separated 'name=value' pairs
# (media URL first, thumbnail second); any other count fails the unpacking.
# NOTE(review): split('=')[1] keeps only the text up to the next '=', so a
# value that itself contains '=' would be truncated.
1581 (final_url, thumbnail_url) = map(lambda x: x.split('=')[1], info_response.split('&'))
1588 'thumbnail': thumbnail_url,
# Factory for the list of extractor instances; per its docstring the list
# order decides which extractor handles a URL, because the first match wins.
# NOTE(review): the embedded line numbers in this listing are not consecutive:
# the docstring's closing quotes, the 'return [' line and most of the list
# entries are not visible here, so only three entries can be seen below.
1592 def gen_extractors():
1593 """ Return a list of an instance of every supported extractor.
1594 The order does matter; the first extractor matched is the one handling the URL.
1597 YoutubePlaylistIE(),
1622 StanfordOpenClassroomIE(),
1632 WorldStarHipHopIE(),
def get_info_extractor(ie_name):
    """Return the extractor class registered in this module for ``ie_name``.

    The class is looked up in the module's global namespace under the name
    ``ie_name + 'IE'``; a name with no such global raises ``KeyError``.
    """
    class_name = ie_name + 'IE'
    module_namespace = globals()
    return module_namespace[class_name]