10 import xml.etree.ElementTree
19 from .extractor.common import InfoExtractor, SearchInfoExtractor
21 from .extractor.ard import ARDIE
22 from .extractor.arte import ArteTvIE
23 from .extractor.dailymotion import DailymotionIE
24 from .extractor.gametrailers import GametrailersIE
25 from .extractor.generic import GenericIE
26 from .extractor.metacafe import MetacafeIE
27 from .extractor.statigram import StatigramIE
28 from .extractor.photobucket import PhotobucketIE
29 from .extractor.vimeo import VimeoIE
30 from .extractor.yahoo import YahooIE, YahooSearchIE
31 from .extractor.youtube import YoutubeIE, YoutubePlaylistIE, YoutubeSearchIE, YoutubeUserIE, YoutubeChannelIE
32 from .extractor.zdf import ZDFIE
class BlipTVUserIE(InfoExtractor):
    """Information Extractor for blip.tv users."""

    # Accepts either a blip.tv user-page URL or the "bliptvuser:" shorthand;
    # group(1) captures the username.
    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
    IE_NAME = u'blip.tv:user'

    def _real_extract(self, url):
        # Match the URL to pull out the username.
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): this excerpt looks truncated -- the `if mobj is None:`
        # guard that should precede this raise is not visible here; verify
        # against the upstream file.
        raise ExtractorError(u'Invalid URL: %s' % url)
        username = mobj.group(1)

        # Mobile endpoint that returns the user's full episode list one page
        # at a time; %s is filled with the numeric users_id extracted below.
        page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'

        page = self._download_webpage(url, username, u'Downloading user page')
        # The numeric user id is embedded in the HTML as data-users-id="...".
        mobj = re.search(r'data-users-id="([^"]+)"', page)
        page_base = page_base % mobj.group(1)

        # Download video ids using BlipTV Ajax calls. Result size per
        # query is limited (currently to 12 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.
        # NOTE(review): the pagination loop header and the pagenum /
        # ids_in_page / video_ids initializations are elided in this excerpt.
        url = page_base + "&page=" + str(pagenum)
        page = self._download_webpage(url, username,
                                      u'Downloading video ids from page %d' % pagenum)

        # Extract video identifiers
        # Collect every href="/..." target, de-duplicating and HTML-unescaping.
        for mobj in re.finditer(r'href="/([^"]+)"', page):
            if mobj.group(1) not in ids_in_page:
                ids_in_page.append(unescapeHTML(mobj.group(1)))
        video_ids.extend(ids_in_page)

        # A little optimization - if current page is not
        # "full", ie. does not contain PAGE_SIZE video ids then
        # we can assume that this page is the last one - there
        # are no more ids on further pages - no need to query
        # again.
        # NOTE(review): the `break` body of this condition is elided in this
        # excerpt.
        if len(ids_in_page) < self._PAGE_SIZE:

        # Wrap every collected id as a url_result delegated to the BlipTV
        # extractor, returned as one playlist named after the user.
        urls = [u'http://blip.tv/%s' % video_id for video_id in video_ids]
        url_entries = [self.url_result(url, 'BlipTV') for url in urls]
        return [self.playlist_result(url_entries, playlist_title = username)]
class DepositFilesIE(InfoExtractor):
    """Information extractor for depositfiles.com"""

    # (?#locale) is an inline regex comment: the preceding "../" matches a
    # two-letter locale path segment such as "en/".
    _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'

    def _real_extract(self, url):
        file_id = url.split('/')[-1]
        # Rebuild url in english locale
        url = 'http://depositfiles.com/en/files/' + file_id

        # Retrieve file webpage with 'Free download' button pressed
        free_download_indication = { 'gateway_result' : '1' }
        request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
        # NOTE(review): the `try:` opening this handler is elided in this
        # excerpt; verify against the upstream file.
        self.report_download_webpage(file_id)
        webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to retrieve file webpage: %s' % compat_str(err))

        # Search for the real file URL
        mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
        if (mobj is None) or (mobj.group(1) is None):
            # Try to figure out reason of the error.
            mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
            if (mobj is not None) and (mobj.group(1) is not None):
                # Collapse whitespace in the site's restriction banner and
                # surface it to the user as the failure reason.
                restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
                raise ExtractorError(u'%s' % restriction_message)
            # NOTE(review): an `else:` likely preceded this raise upstream.
            raise ExtractorError(u'Unable to extract download URL from: %s' % url)

        file_url = mobj.group(1)
        # Extension without the leading dot.
        file_extension = os.path.splitext(file_url)[1][1:]

        # Search for file title
        file_title = self._search_regex(r'<b title="(.*?)">', webpage, u'title')

        # NOTE(review): the surrounding `return [{ ... }]` wrapper of this
        # info dict is elided in this excerpt.
        'id': file_id.decode('utf-8'),
        'url': file_url.decode('utf-8'),
        'ext': file_extension.decode('utf-8'),
class FacebookIE(InfoExtractor):
    """Information Extractor for Facebook"""

    # Named group ID captures the numeric video id from video.php / photo.php
    # style URLs.
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
    _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
    # Machine name used for .netrc credential lookup.
    _NETRC_MACHINE = 'facebook'
    IE_NAME = u'facebook'

    def report_login(self):
        """Report attempt to log in."""
        self.to_screen(u'Logging in')

    def _real_initialize(self):
        # Optional login step; credentials come from CLI options or .netrc.
        if self._downloader is None:
            # NOTE(review): the `return` body of this guard is elided in this
            # excerpt.
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            useremail = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            # NOTE(review): the `try:` opener and the `if info is not None:`
            # branch unpacking (useremail, password) are elided here.
            info = netrc.netrc().authenticators(self._NETRC_MACHINE)
            raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                # .netrc problems are non-fatal: warn and continue anonymous.
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))

        if useremail is None:
            # NOTE(review): the `return` body of this guard is elided, as is
            # the login_form construction that follows it.

        # Log in
        request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
        # NOTE(review): the `try:` / self.report_login() lines are elided.
        login_results = compat_urllib_request.urlopen(request).read()
        # A login <form> still present in the response means the login failed.
        # NOTE(review): login_results is bytes under Python 3 while this
        # pattern is str -- confirm against upstream how this is handled.
        if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
            self._downloader.report_warning(u'unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): the `if mobj is None:` guard is elided in this excerpt.
        raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('ID')

        # Normalize to the canonical desktop video URL.
        url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
        webpage = self._download_webpage(url, video_id)

        # The player parameters are embedded as a JSON array sandwiched
        # between these two JavaScript fragments.
        BEFORE = '{swf.addParam(param[0], param[1]);});\n'
        AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
        m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
        # NOTE(review): the `if not m:` guard is elided in this excerpt.
        raise ExtractorError(u'Cannot parse data')
        data = dict(json.loads(m.group(1)))
        # 'params' is URL-quoted JSON describing the available streams.
        params_raw = compat_urllib_parse.unquote(data['params'])
        params = json.loads(params_raw)
        video_data = params['video_data'][0]
        # Prefer HD, fall back to SD; the if/else scaffolding around these
        # assignments is elided in this excerpt.
        video_url = video_data.get('hd_src')
        video_url = video_data['sd_src']
        raise ExtractorError(u'Cannot find video URL')
        video_duration = int(video_data['video_duration'])
        thumbnail = video_data['thumbnail_src']

        # NOTE(review): the remaining arguments of this call are elided.
        video_title = self._html_search_regex('<h2 class="uiHeaderTitle">([^<]+)</h2>',

        # NOTE(review): the surrounding info-dict wrapper is elided.
        'title': video_title,
        'duration': video_duration,
        'thumbnail': thumbnail,
class BlipTVIE(InfoExtractor):
    """Information extractor for blip.tv"""

    # Matches regular page URLs, /play/ embed URLs, and api.swf fragments.
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv/((.+/)|(play/)|(api\.swf#))(.+)$'
    # Used to pull the file extension off a media URL.
    _URL_EXT = r'^.*\.([a-z0-9]+)$'

    def report_direct_download(self, title):
        """Report information extraction."""
        self.to_screen(u'%s: Direct download detected' % title)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): the `if mobj is None:` guard is elided in this excerpt.
        raise ExtractorError(u'Invalid URL: %s' % url)

        # See https://github.com/rg3/youtube-dl/issues/857
        # api.swf fragment URLs are rewritten to /play/ URLs first.
        api_mobj = re.match(r'http://a\.blip\.tv/api\.swf#(?P<video_id>[\d\w]+)', url)
        if api_mobj is not None:
            url = 'http://blip.tv/play/g_%s' % api_mobj.group('video_id')
        urlp = compat_urllib_parse_urlparse(url)
        if urlp.path.startswith('/play/'):
            # /play/ URLs redirect; the real file id sits in the fragment of
            # the redirect target. Recurse with the canonical /a/a-<id> URL.
            request = compat_urllib_request.Request(url)
            response = compat_urllib_request.urlopen(request)
            redirecturl = response.geturl()
            rurlp = compat_urllib_parse_urlparse(redirecturl)
            file_id = compat_parse_qs(rurlp.fragment)['file'][0].rpartition('/')[2]
            url = 'http://blip.tv/a/a-' + file_id
            return self._real_extract(url)

        # NOTE(review): the `cchar` selection ('?' vs '&') is elided in this
        # excerpt. Request the JSON skin of the page, spoofing iTunes since
        # blip.tv serves different content to it.
        json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
        request = compat_urllib_request.Request(json_url)
        request.add_header('User-Agent', 'iTunes/10.6.1')
        self.report_extraction(mobj.group(1))
        # NOTE(review): the `try:` opener and the direct-download info dict
        # built under this branch are elided in this excerpt.
        urlh = compat_urllib_request.urlopen(request)
        if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
            basename = url.split('/')[-1]
            title,ext = os.path.splitext(basename)
            title = title.decode('UTF-8')
            ext = ext.replace('.', '')
            self.report_direct_download(title)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
        if info is None: # Regular URL
            # NOTE(review): the inner `try:` opener is elided here.
            json_code_bytes = urlh.read()
            json_code = json_code_bytes.decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to read video info webpage: %s' % compat_str(err))

            json_data = json.loads(json_code)
            # Some responses wrap the payload in a 'Post' key.
            if 'Post' in json_data:
                data = json_data['Post']
            # NOTE(review): the `else: data = json_data` branch is elided.

            # blip.tv dates look like "05-31-13 09:21AM".
            upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
            video_url = data['media']['url']
            umobj = re.match(self._URL_EXT, video_url)
            # NOTE(review): the `if umobj is None:` guard is elided here.
            raise ValueError('Can not determine filename extension')

            # NOTE(review): the `info = { ... }` wrapper around these entries
            # is elided in this excerpt.
            'id': data['item_id'],
            'uploader': data['display_name'],
            'upload_date': upload_date,
            'title': data['title'],
            'format': data['media']['mimeType'],
            'thumbnail': data['thumbnailUrl'],
            'description': data['description'],
            'player_url': data['embedUrl'],
            # Downloader must keep spoofing iTunes for the media request too.
            'user_agent': 'iTunes/10.6.1',
            except (ValueError,KeyError) as err:
                raise ExtractorError(u'Unable to parse video information: %s' % repr(err))
class MyVideoIE(InfoExtractor):
    """Information Extractor for myvideo.de."""

    _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'

    # Original Code from: https://github.com/dersphere/plugin.video.myvideo_de.git
    # Released into the Public Domain by Tristan Fischer on 2013-05-19
    # https://github.com/rg3/youtube-dl/pull/842
    def __rc4crypt(self,data, key):
        """RC4-decrypt `data` (bytes) with `key` (bytes); returns str."""
        # Key-scheduling algorithm (KSA).
        # NOTE(review): the `x = 0` initializer is elided in this excerpt.
        box = list(range(256))
        for i in list(range(256)):
            x = (x + box[i] + compat_ord(key[i % len(key)])) % 256
            box[i], box[x] = box[x], box[i]
        # Pseudo-random generation (PRGA) stage; the loop header and the
        # x/y/out initializers are elided in this excerpt.
        y = (y + box[x]) % 256
        box[x], box[y] = box[y], box[x]
        out += chr(compat_ord(char) ^ box[(box[x] + box[y]) % 256])
        # NOTE(review): this line belongs to the separate `__md5(self, s)`
        # helper whose `def` line is elided in this excerpt.
        return hashlib.md5(s).hexdigest().encode()

    def _real_extract(self,url):
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): the `if mobj is None:` guard is elided here.
        raise ExtractorError(u'invalid URL: %s' % url)

        video_id = mobj.group(1)

        # Obfuscated, base64-wrapped key material used to derive the RC4 key;
        # the `GK = (` wrapper around these literals is elided in this excerpt.
        b'WXpnME1EZGhNRGhpTTJNM01XVmhOREU0WldNNVpHTTJOakpt'
        b'TW1FMU5tVTBNR05pWkRaa05XRXhNVFJoWVRVd1ptSXhaVEV3'
        b'TnpsbA0KTVRkbU1tSTRNdz09'

        # Get video webpage
        webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
        webpage = self._download_webpage(webpage_url, video_id)

        # Plain <source> tag means a non-encrypted flv is available directly.
        mobj = re.search('source src=\'(.+?)[.]([^.]+)\'', webpage)
        # NOTE(review): the `if mobj is not None:` branch opener is elided.
        self.report_extraction(video_id)
        video_url = mobj.group(1) + '.flv'

        video_title = self._html_search_regex('<title>([^<]+)</title>',
        # NOTE(review): the remaining arguments of the call above are elided.

        video_ext = self._search_regex('[.](.+?)$', video_url, u'extension')

        # NOTE(review): the surrounding return/info-dict wrapper is elided.
        'title': video_title,

        # Otherwise: encrypted flash-player path. Pull the flashvars blob.
        mobj = re.search('var flashvars={(.+?)}', webpage)
        # NOTE(review): the `if mobj is None:` guard is elided here.
        raise ExtractorError(u'Unable to extract video')

        # Parse flashvars key/value pairs; params/sec initialization and the
        # surrounding loop context are elided in this excerpt.
        for (a, b) in re.findall('(.+?):\'(.+?)\',?', sec):
            # _encxml holds the (quoted) metadata endpoint; everything else
            # is forwarded as a query parameter.
            if not a == '_encxml':
        encxml = compat_urllib_parse.unquote(b)
        if not params.get('domain'):
            params['domain'] = 'www.myvideo.de'
        xmldata_url = '%s?%s' % (encxml, compat_urllib_parse.urlencode(params))
        if 'flash_playertype=MTV' in xmldata_url:
            # The MTV player variant is avoided by forcing player type D.
            self._downloader.report_warning(u'avoiding MTV player')
            # NOTE(review): the `xmldata_url = (` wrapper and the trailing
            # `) % video_id` of this URL template are elided.
            'http://www.myvideo.de/dynamic/get_player_video_xml.php'
            '?flash_playertype=D&ID=%s&_countlimit=4&autorun=yes'

        # The response is "...=<hex>"; keep the hex payload only.
        enc_data = self._download_webpage(xmldata_url, video_id).split('=')[1]
        enc_data_b = binascii.unhexlify(enc_data)
        # RC4 key = md5(md5-derived GK material + video id); the `sk = self.__md5(`
        # wrapper around these operands is elided in this excerpt.
        base64.b64decode(base64.b64decode(GK)) +
        str(video_id).encode('utf-8')
        dec_data = self.__rc4crypt(enc_data_b, sk)

        # extracting infos
        self.report_extraction(video_id)

        # RTMP case: connectionurl present in the decrypted metadata.
        mobj = re.search('connectionurl=\'(.*?)\'', dec_data)
        # NOTE(review): the `if mobj:` branch opener is elided here.
        video_url = compat_urllib_parse.unquote(mobj.group(1))
        if 'myvideo2flash' in video_url:
            self._downloader.report_warning(u'forcing RTMPT ...')
            video_url = video_url.replace('rtmpe://', 'rtmpt://')

        # extract non rtmp videos
        mobj = re.search('path=\'(http.*?)\' source=\'(.*?)\'', dec_data)
        # NOTE(review): the `if mobj is None:` guard is elided here.
        raise ExtractorError(u'unable to extract url')
        video_url = compat_urllib_parse.unquote(mobj.group(1)) + compat_urllib_parse.unquote(mobj.group(2))

        video_file = self._search_regex('source=\'(.*?)\'', dec_data, u'video file')
        video_file = compat_urllib_parse.unquote(video_file)

        if not video_file.endswith('f4m'):
            # Plain file: build an rtmp play path "<ext>:<path>".
            ppath, prefix = video_file.split('.')
            video_playpath = '%s:%s' % (prefix, ppath)
        video_hls_playlist = ''
        # f4m manifest: derive the equivalent HLS playlist URL. The
        # `else:` opener and video_filepath extraction are elided here.
        video_hls_playlist = (
            video_filepath + video_file
        ).replace('.f4m', '.m3u8')

        video_swfobj = self._search_regex('swfobject.embedSWF\(\'(.+?)\'', webpage, u'swfobj')
        video_swfobj = compat_urllib_parse.unquote(video_swfobj)

        video_title = self._html_search_regex("<h1(?: class='globalHd')?>(.*?)</h1>",
        # NOTE(review): the remaining arguments of the call above, and the
        # `return [{` wrapper of the entries below, are elided.
        'title': video_title,
        'play_path': video_playpath,
        'video_file': video_file,
        'video_hls_playlist': video_hls_playlist,
        'player_url': video_swfobj,
class ComedyCentralIE(InfoExtractor):
    """Information extractor for The Daily Show and Colbert Report """

    # urls can be abbreviations like :thedailyshow or :colbert
    # urls for episodes like:
    # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
    # or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
    # or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
    # NOTE(review): the closing `"""` of this verbose pattern is elided in
    # this excerpt; named groups: shortname, showname, episode, clipID,
    # cntitle, date, tdstitle.
    _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
                      |(https?://)?(www\.)?
                      (?P<showname>thedailyshow|colbertnation)\.com/
                      (full-episodes/(?P<episode>.*)|
                       (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
                       |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))

    # Bitrates, best last.
    _available_formats = ['3500', '2200', '1700', '1200', '750', '400']

    # NOTE(review): the contents and closing braces of these two mapping
    # literals are elided in this excerpt.
    _video_extensions = {
    _video_dimensions = {

    # NOTE(review): an `@classmethod` decorator likely precedes this def
    # upstream (it takes `cls`); verify.
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is a verbose pattern, so re.VERBOSE is required here.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _print_formats(self, formats):
        print('Available formats:')
        # NOTE(review): the `for x in formats:` loop header is elided here.
        print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        # NOTE(review): the `if mobj is None:` guard is elided here.
        raise ExtractorError(u'Invalid URL: %s' % url)

        # Shorthand forms (":tds", ":colbert", ...) expand to the show's
        # full-episodes landing page, then get re-matched.
        if mobj.group('shortname'):
            if mobj.group('shortname') in ('tds', 'thedailyshow'):
                url = u'http://www.thedailyshow.com/full-episodes/'
            # NOTE(review): the `else:` opener is elided here.
                url = u'http://www.colbertnation.com/full-episodes/'
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            assert mobj is not None

        if mobj.group('clip'):
            # Clip URL: the episode title group depends on the show.
            if mobj.group('showname') == 'thedailyshow':
                epTitle = mobj.group('tdstitle')
            # NOTE(review): the `else:` opener is elided here.
                epTitle = mobj.group('cntitle')
        # NOTE(review): the `else:` / dlNewest branch scaffolding around the
        # next three lines is elided in this excerpt.
            dlNewest = not mobj.group('episode')
                epTitle = mobj.group('showname')
                epTitle = mobj.group('episode')

        self.report_extraction(epTitle)
        webpage,htmlHandle = self._download_webpage_handle(url, epTitle)
        # When asked for the newest episode, follow the redirect and re-match
        # to discover which concrete episode we landed on.
        # NOTE(review): the `if dlNewest:` opener is elided here.
            url = htmlHandle.geturl()
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            # NOTE(review): the `if mobj is None:` guard is elided here.
                raise ExtractorError(u'Invalid redirected URL: ' + url)
            if mobj.group('episode') == '':
                raise ExtractorError(u'Redirected URL is still not specific: ' + url)
            epTitle = mobj.group('episode')

        # mgid-style media URIs embedded in the player markup.
        mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)

        if len(mMovieParams) == 0:
            # The Colbert Report embeds the information in a without
            # a URL prefix; so extract the alternate reference
            # and then add the URL prefix manually.
            altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
            if len(altMovieParams) == 0:
                raise ExtractorError(u'unable to find Flash URL in webpage ' + url)
            # NOTE(review): the `else:` opener is elided here.
            mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]

        uri = mMovieParams[0][1]
        # The mrss feed lists one <item> per episode part.
        indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
        indexXml = self._download_webpage(indexUrl, epTitle,
                                          u'Downloading show index',
                                          u'unable to download episode index')

        # NOTE(review): the `results = []` initializer is elided here.
        idoc = xml.etree.ElementTree.fromstring(indexXml)
        itemEls = idoc.findall('.//item')
        for partNum,itemEl in enumerate(itemEls):
            # guid looks like "...:<showId>.com:<shortMediaId>".
            mediaId = itemEl.findall('./guid')[0].text
            shortMediaId = mediaId.split(':')[-1]
            showId = mediaId.split(':')[-2].replace('.com', '')
            officialTitle = itemEl.findall('./title')[0].text
            officialDate = unified_strdate(itemEl.findall('./pubDate')[0].text)

            # Per-part config XML lists one <rendition> per bitrate.
            configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
                         compat_urllib_parse.urlencode({'uri': mediaId}))
            configXml = self._download_webpage(configUrl, epTitle,
                                               u'Downloading configuration for %s' % shortMediaId)

            cdoc = xml.etree.ElementTree.fromstring(configXml)
            # NOTE(review): the `turls = []` initializer and the
            # `turls.append(finfo)` call are elided in this excerpt.
            for rendition in cdoc.findall('.//rendition'):
                finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)

            # NOTE(review): the `if len(turls) == 0:` guard is elided here.
                self._downloader.report_error(u'unable to download ' + mediaId + ': No videos found')

            if self._downloader.params.get('listformats', None):
                self._print_formats([i[0] for i in turls])
                # NOTE(review): the `return` after listing is elided here.

            # For now, just pick the highest bitrate
            format,rtmp_video_url = turls[-1]

            # Get the format arg from the arg stream
            req_format = self._downloader.params.get('format', None)

            # Select format if we can find one
            # NOTE(review): the `for f,v in turls:` / `if f == req_format:`
            # scaffolding around this line is elided in this excerpt.
                    format, rtmp_video_url = f, v

            # Rewrite the rtmp url to the equivalent progressive-http mirror.
            m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
            # NOTE(review): the `if not m:` guard is elided here.
                raise ExtractorError(u'Cannot transform RTMP url')
            base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
            video_url = base + m.group('finalid')

            effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
            # NOTE(review): the `info = { ... }` wrapper of these entries is
            # elided in this excerpt.
            'upload_date': officialDate,
            'description': officialTitle,
class EscapistIE(InfoExtractor):
    """Information extractor for The Escapist """

    # Named groups: showname (uploader slug), episode (video slug/id).
    _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
    IE_NAME = u'escapist'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        showName = mobj.group('showname')
        videoId = mobj.group('episode')

        self.report_extraction(videoId)
        webpage = self._download_webpage(url, videoId)

        # Description and thumbnail are optional (fatal=False).
        videoDesc = self._html_search_regex('<meta name="description" content="([^"]*)"',
            webpage, u'description', fatal=False)

        imgUrl = self._html_search_regex('<meta property="og:image" content="([^"]*)"',
            webpage, u'thumbnail', fatal=False)

        # og:video points at the Flash player; its config= query parameter
        # carries the (URL-quoted) configuration URL.
        playerUrl = self._html_search_regex('<meta property="og:video" content="([^"]*)"',
            webpage, u'player url')

        # BUG FIX: this regex previously reported its field as u'player url'
        # (copy-paste from the search above); it extracts the title. The page
        # title looks like "Show : Episode", keep only the episode part.
        title = self._html_search_regex('<meta name="title" content="([^"]*)"',
            webpage, u'title').split(' : ')[-1]

        configUrl = self._search_regex('config=(.*)$', playerUrl, u'config url')
        configUrl = compat_urllib_parse.unquote(configUrl)

        configJSON = self._download_webpage(configUrl, videoId,
            u'Downloading configuration',
            u'unable to download configuration')

        # Technically, it's JavaScript, not JSON
        configJSON = configJSON.replace("'", '"')

        try:
            config = json.loads(configJSON)
        except (ValueError,) as err:
            raise ExtractorError(u'Invalid JSON in configuration file: ' + compat_str(err))

        # playlist[1] is the actual video entry (playlist[0] is an ad/intro).
        playlist = config['playlist']
        videoUrl = playlist[1]['url']

        info = {
            'id': videoId,
            'url': videoUrl,
            'uploader': showName,
            'upload_date': None,
            'title': title,
            'ext': 'mp4',
            'thumbnail': imgUrl,
            'description': videoDesc,
            'player_url': playerUrl,
        }

        return [info]
class CollegeHumorIE(InfoExtractor):
    """Information extractor for collegehumor.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
    IE_NAME = u'collegehumor'

    def report_manifest(self, video_id):
        """Report information extraction."""
        self.to_screen(u'%s: Downloading XML manifest' % video_id)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): the `if mobj is None:` guard is elided in this excerpt.
        raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('videoid')

        # NOTE(review): the `info = {...}` initializer is elided here.
        self.report_extraction(video_id)
        # Per-video metadata XML endpoint.
        xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
        # NOTE(review): the `try:` opener is elided here.
        metaXml = compat_urllib_request.urlopen(xmlUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))

        mdoc = xml.etree.ElementTree.fromstring(metaXml)
        # NOTE(review): the `try:` opener around this metadata lookup is
        # elided here (IndexError handled below via the bare raise).
        videoNode = mdoc.findall('./video')[0]
        info['description'] = videoNode.findall('./description')[0].text
        info['title'] = videoNode.findall('./caption')[0].text
        info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
        manifest_url = videoNode.findall('./file')[0].text
        # NOTE(review): the `except IndexError:` opener is elided here.
        raise ExtractorError(u'Invalid metadata XML file')

        # hdcore parameter is required for the Adobe HTTP Dynamic Streaming
        # manifest to be served.
        manifest_url += '?hdcore=2.10.3'
        self.report_manifest(video_id)
        # NOTE(review): the `try:` opener is elided here.
        manifestXml = compat_urllib_request.urlopen(manifest_url).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))

        adoc = xml.etree.ElementTree.fromstring(manifestXml)
        # NOTE(review): the `try:` opener is elided here.
        media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
        node_id = media_node.attrib['url']
        video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
        except IndexError as err:
            raise ExtractorError(u'Invalid manifest file')

        # Reassemble the direct fragment URL from the manifest location.
        url_pr = compat_urllib_parse_urlparse(manifest_url)
        url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'
class XVideosIE(InfoExtractor):
    """Information extractor for xvideos.com"""

    # group(1) captures the numeric video id.
    _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): the `if mobj is None:` guard is elided in this excerpt.
        raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group(1)

        webpage = self._download_webpage(url, video_id)

        self.report_extraction(video_id)

        # Extract video URL (flv_url flashvar, URL-quoted).
        video_url = compat_urllib_parse.unquote(self._search_regex(r'flv_url=(.+?)&',
            webpage, u'video URL'))

        # Extract title; the remaining arguments of this call are elided in
        # this excerpt.
        video_title = self._html_search_regex(r'<title>(.*?)\s+-\s+XVID',

        # Extract video thumbnail
        video_thumbnail = self._search_regex(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)',
            webpage, u'thumbnail', fatal=False)

        # NOTE(review): the surrounding info-dict wrapper is elided.
        'title': video_title,
        'thumbnail': video_thumbnail,
class SoundcloudIE(InfoExtractor):
    """Information extractor for soundcloud.com
       To access the media, the uid of the song and a stream token
       must be extracted from the page source and the script must make
       a request to media.soundcloud.com/crossdomain.xml. Then
       the media can be grabbed by requesting from an url composed
       of the stream token and uid
    """

    # group(1) = uploader slug, group(2) = track slug.
    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'soundcloud'

    def report_resolve(self, video_id):
        """Report information extraction."""
        self.to_screen(u'%s: Resolving id' % video_id)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): the `if mobj is None:` guard is elided in this excerpt.
        raise ExtractorError(u'Invalid URL: %s' % url)

        # extract uploader (which is in the url)
        uploader = mobj.group(1)
        # extract simple title (uploader + slug of song title)
        slug_title = mobj.group(2)
        simple_title = uploader + u'-' + slug_title
        full_title = '%s/%s' % (uploader, slug_title)

        self.report_resolve(full_title)

        # The resolve endpoint maps a page URL to the track's API record.
        url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        info_json = self._download_webpage(resolv_url, full_title, u'Downloading info JSON')

        info = json.loads(info_json)
        video_id = info['id']
        self.report_extraction(full_title)

        # Streams endpoint returns one URL per transcoding.
        streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        stream_json = self._download_webpage(streams_url, full_title,
                                             u'Downloading stream definitions',
                                             u'unable to download stream definitions')

        streams = json.loads(stream_json)
        mediaURL = streams['http_mp3_128_url']
        upload_date = unified_strdate(info['created_at'])

        # NOTE(review): the `return [{` wrapper and the id/url/ext entries of
        # this info dict are elided in this excerpt.
        'uploader': info['user']['username'],
        'upload_date': upload_date,
        'title': info['title'],
        'description': info['description'],
class SoundcloudSetIE(InfoExtractor):
    """Information extractor for soundcloud.com sets
       To access the media, the uid of the song and a stream token
       must be extracted from the page source and the script must make
       a request to media.soundcloud.com/crossdomain.xml. Then
       the media can be grabbed by requesting from an url composed
       of the stream token and uid
    """

    # group(1) = uploader slug, group(2) = set slug.
    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/sets/([\w\d-]+)'
    IE_NAME = u'soundcloud:set'

    def report_resolve(self, video_id):
        """Report information extraction."""
        self.to_screen(u'%s: Resolving id' % video_id)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): the `if mobj is None:` guard is elided in this excerpt.
        raise ExtractorError(u'Invalid URL: %s' % url)

        # extract uploader (which is in the url)
        uploader = mobj.group(1)
        # extract simple title (uploader + slug of song title)
        slug_title = mobj.group(2)
        simple_title = uploader + u'-' + slug_title
        full_title = '%s/sets/%s' % (uploader, slug_title)

        self.report_resolve(full_title)

        # Resolve the set URL to its API record (contains a 'tracks' list).
        url = 'http://soundcloud.com/%s/sets/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        info_json = self._download_webpage(resolv_url, full_title)

        # NOTE(review): the `videos = []` initializer and the
        # `if 'errors' in info:` guard around this loop are elided.
        info = json.loads(info_json)
        for err in info['errors']:
            self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err['error_message']))

        self.report_extraction(full_title)
        for track in info['tracks']:
            video_id = track['id']

            streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
            stream_json = self._download_webpage(streams_url, video_id, u'Downloading track info JSON')

            self.report_extraction(video_id)
            streams = json.loads(stream_json)
            mediaURL = streams['http_mp3_128_url']

            # NOTE(review): the `videos.append({` wrapper and the id/url/ext
            # entries of this per-track dict are elided, as is the final
            # `return videos`.
            'uploader': track['user']['username'],
            'upload_date': unified_strdate(track['created_at']),
            'title': track['title'],
            'description': track['description'],
class InfoQIE(InfoExtractor):
    """Information extractor for infoq.com"""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): the `if mobj is None:` guard is elided in this excerpt.
        raise ExtractorError(u'Invalid URL: %s' % url)

        # InfoQ pages have no obvious numeric id, so the URL doubles as the id
        # for progress reporting; the real id is derived from the file name
        # below.
        webpage = self._download_webpage(url, video_id=url)
        self.report_extraction(url)

        # Extract video URL: jsclassref holds the base64-encoded rtmp path.
        mobj = re.search(r"jsclassref ?= ?'([^']*)'", webpage)
        # NOTE(review): the `if mobj is None:` guard is elided here.
        raise ExtractorError(u'Unable to extract video url')
        real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
        video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id

        # Extract title; the remaining arguments of this call are elided in
        # this excerpt.
        video_title = self._search_regex(r'contentTitle = "(.*?)";',

        # Extract description
        video_description = self._html_search_regex(r'<meta name="description" content="(.*)"(?:\s*/)?>',
            webpage, u'description', fatal=False)

        # Derive id and extension from the media file name.
        video_filename = video_url.split('/')[-1]
        video_id, extension = video_filename.split('.')

        # NOTE(review): the surrounding info-dict wrapper is elided.
        'title': video_title,
        'ext': extension, # Extension is always(?) mp4, but seems to be flv
        'description': video_description,
class MixcloudIE(InfoExtractor):
    """Information extractor for www.mixcloud.com"""

    _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
    _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'mixcloud'

    def report_download_json(self, file_id):
        """Report JSON download."""
        self.to_screen(u'Downloading json')

    def get_urls(self, jsonData, fmt, bitrate='best'):
        """Get urls from 'audio_formats' section in json"""
        # NOTE(review): the `try:` opener is elided in this excerpt.
        bitrate_list = jsonData[fmt]
        if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
            bitrate = max(bitrate_list) # select highest

        url_list = jsonData[fmt][bitrate]
        except TypeError: # we have no bitrate info.
            url_list = jsonData[fmt]
        # NOTE(review): the `return url_list` is elided here.

    def check_urls(self, url_list):
        """Returns 1st active url from list"""
        for url in url_list:
            # NOTE(review): the `try:` opener and the `return url` success
            # path are elided in this excerpt; dead URLs fall through to the
            # next candidate.
            compat_urllib_request.urlopen(url)
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
        # NOTE(review): the final `return None` is elided here.

    def _print_formats(self, formats):
        print('Available formats:')
        for fmt in formats.keys():
            for b in formats[fmt]:
                # NOTE(review): the `try:` opener is elided here.
                ext = formats[fmt][b][0]
                print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
                except TypeError: # we have no bitrate info
                    ext = formats[fmt][0]
                    print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): the `if mobj is None:` guard is elided in this excerpt.
        raise ExtractorError(u'Invalid URL: %s' % url)
        # extract uploader & filename from url
        uploader = mobj.group(1).decode('utf-8')
        file_id = uploader + "-" + mobj.group(2).decode('utf-8')

        # construct API request
        file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
        # retrieve .json file with links to files
        request = compat_urllib_request.Request(file_url)
        # NOTE(review): the `try:` opener is elided here.
        self.report_download_json(file_url)
        jsonData = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to retrieve file: %s' % compat_str(err))

        # parse JSON
        json_data = json.loads(jsonData)
        player_url = json_data['player_swf_url']
        formats = dict(json_data['audio_formats'])

        req_format = self._downloader.params.get('format', None)

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            # NOTE(review): the `return` after listing is elided here.

        if req_format is None or req_format == 'best':
            # Probe each advertised format until a live URL is found.
            for format_param in formats.keys():
                url_list = self.get_urls(formats, format_param)
                # check urls
                file_url = self.check_urls(url_list)
                if file_url is not None:
                    # NOTE(review): the `break` body and the `else:` branch
                    # opener for the explicit-format path are elided here.
            if req_format not in formats:
                raise ExtractorError(u'Format is not available')
            url_list = self.get_urls(formats, req_format)
            file_url = self.check_urls(url_list)
            format_param = req_format

        # NOTE(review): the `return [{` wrapper of these entries is elided.
        'id': file_id.decode('utf-8'),
        'url': file_url.decode('utf-8'),
        'uploader': uploader.decode('utf-8'),
        'upload_date': None,
        'title': json_data['name'],
        'ext': file_url.split('.')[-1].decode('utf-8'),
        'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
        'thumbnail': json_data['thumbnail_url'],
        'description': json_data['description'],
        'player_url': player_url.decode('utf-8'),
# Extractor for Stanford Open ClassRoom (openclassroom.stanford.edu).
# Dispatches on the matched URL: specific video (course+video groups),
# a course page (course only), or the site root (neither).
# NOTE(review): this excerpt appears subsampled — embedded source line numbers
# skip values, so guards ("if mobj is None:"), "try:" openers and "return"
# statements are not visible here; verify against the complete file.
1103 class StanfordOpenClassroomIE(InfoExtractor):
1104 """Information extractor for Stanford's Open ClassRoom"""
1106 _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
1107 IE_NAME = u'stanfordoc'
1109 def _real_extract(self, url):
1110 mobj = re.match(self._VALID_URL, url)
1112 raise ExtractorError(u'Invalid URL: %s' % url)
1114 if mobj.group('course') and mobj.group('video'): # A specific video
1115 course = mobj.group('course')
1116 video = mobj.group('video')
1118 'id': course + '_' + video,
1120 'upload_date': None,
1123 self.report_extraction(info['id'])
# Per-video metadata lives in an XML document next to the video files.
1124 baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
1125 xmlUrl = baseUrl + video + '.xml'
1127 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
1128 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1129 raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
1130 mdoc = xml.etree.ElementTree.fromstring(metaXml)
1132 info['title'] = mdoc.findall('./title')[0].text
1133 info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
1135 raise ExtractorError(u'Invalid metadata XML file')
1136 info['ext'] = info['url'].rpartition('.')[2]
1138 elif mobj.group('course'): # A course page
1139 course = mobj.group('course')
1144 'upload_date': None,
1147 coursepage = self._download_webpage(url, info['id'],
1148 note='Downloading course info page',
1149 errnote='Unable to download course info page')
1151 info['title'] = self._html_search_regex('<h1>([^<]+)</h1>', coursepage, 'title', default=info['id'])
1153 info['description'] = self._html_search_regex('<description>([^<]+)</description>',
1154 coursepage, u'description', fatal=False)
# Collect VideoPage links and recurse into each via self.extract().
1156 links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
1159 'type': 'reference',
1160 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
1164 for entry in info['list']:
1165 assert entry['type'] == 'reference'
1166 results += self.extract(entry['url'])
# Root page: enumerate all course pages and recurse into each.
1170 'id': 'Stanford OpenClassroom',
1173 'upload_date': None,
1176 self.report_download_webpage(info['id'])
1177 rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
1179 rootpage = compat_urllib_request.urlopen(rootURL).read()
1180 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1181 raise ExtractorError(u'Unable to download course info page: ' + compat_str(err))
1183 info['title'] = info['id']
1185 links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
1188 'type': 'reference',
1189 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
1194 for entry in info['list']:
1195 assert entry['type'] == 'reference'
1196 results += self.extract(entry['url'])
# Extractor for MTV.com video pages: scrapes <meta> tags for the song/artist
# and mtvn_uri, then fetches a mediaGen XML and picks the last <rendition>.
# NOTE(review): excerpt appears subsampled (embedded line numbers skip), so
# some statements ("if mobj is None:", "try:", the extraction of "performer"
# used at line 1251, and the final return) are not visible; verify in full file.
1199 class MTVIE(InfoExtractor):
1200 """Information extractor for MTV.com"""
1202 _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
1205 def _real_extract(self, url):
1206 mobj = re.match(self._VALID_URL, url)
1208 raise ExtractorError(u'Invalid URL: %s' % url)
# Scheme is optional in _VALID_URL; normalize to http:// before download.
1209 if not mobj.group('proto'):
1210 url = 'http://' + url
1211 video_id = mobj.group('videoid')
1213 webpage = self._download_webpage(url, video_id)
1215 song_name = self._html_search_regex(r'<meta name="mtv_vt" content="([^"]+)"/>',
1216 webpage, u'song name', fatal=False)
1218 video_title = self._html_search_regex(r'<meta name="mtv_an" content="([^"]+)"/>',
1221 mtvn_uri = self._html_search_regex(r'<meta name="mtvn_uri" content="([^"]+)"/>',
1222 webpage, u'mtvn_uri', fatal=False)
1224 content_id = self._search_regex(r'MTVN.Player.defaultPlaylistId = ([0-9]+);',
1225 webpage, u'content id', fatal=False)
1227 videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
1228 self.report_extraction(video_id)
1229 request = compat_urllib_request.Request(videogen_url)
1231 metadataXml = compat_urllib_request.urlopen(request).read()
1232 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1233 raise ExtractorError(u'Unable to download video metadata: %s' % compat_str(err))
1235 mdoc = xml.etree.ElementTree.fromstring(metadataXml)
1236 renditions = mdoc.findall('.//rendition')
1238 # For now, always pick the highest quality.
1239 rendition = renditions[-1]
# Format id is built as "<ext>-<width>x<height>_<bitrate>".
1242 _,_,ext = rendition.attrib['type'].partition('/')
1243 format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
1244 video_url = rendition.find('./src').text
1246 raise ExtractorError('Invalid rendition field.')
1251 'uploader': performer,
1252 'upload_date': None,
1253 'title': video_title,
# Extractor for v.youku.com. Downloads a JSON playlist config, decodes the
# obfuscated file id with a seeded shuffle of a character table, then builds
# one download URL per segment.
# NOTE(review): excerpt appears subsampled (embedded line numbers skip):
# the "def _gen_sid(self):" header itself, several format branches, and the
# final return are not visible here; verify against the complete file.
1261 class YoukuIE(InfoExtractor):
1262 _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'
# Session id: millisecond timestamp followed by two random numbers.
1265 nowTime = int(time.time() * 1000)
1266 random1 = random.randint(1000,1998)
1267 random2 = random.randint(1000,9999)
1269 return "%d%d%d" %(nowTime,random1,random2)
1271 def _get_file_ID_mix_string(self, seed):
# Deterministic seeded shuffle (LCG: seed = seed*211 + 30031 mod 65536)
# of the character table; same seed always yields the same ordering.
1273 source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
1275 for i in range(len(source)):
1276 seed = (seed * 211 + 30031 ) % 65536
1277 index = math.floor(seed / 65536 * len(source) )
1278 mixed.append(source[int(index)])
1279 source.remove(source[int(index)])
1280 #return ''.join(mixed)
1283 def _get_file_id(self, fileId, seed):
# The obfuscated id is '*'-separated indices into the shuffled table.
1284 mixed = self._get_file_ID_mix_string(seed)
1285 ids = fileId.split('*')
1289 realId.append(mixed[int(ch)])
1290 return ''.join(realId)
1292 def _real_extract(self, url):
1293 mobj = re.match(self._VALID_URL, url)
1295 raise ExtractorError(u'Invalid URL: %s' % url)
1296 video_id = mobj.group('ID')
1298 info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id
1300 jsondata = self._download_webpage(info_url, video_id)
1302 self.report_extraction(video_id)
1304 config = json.loads(jsondata)
1306 video_title = config['data'][0]['title']
1307 seed = config['data'][0]['seed']
1309 format = self._downloader.params.get('format', None)
1310 supported_format = list(config['data'][0]['streamfileids'].keys())
1312 if format is None or format == 'best':
1313 if 'hd2' in supported_format:
1318 elif format == 'worst':
1326 fileid = config['data'][0]['streamfileids'][format]
1327 keys = [s['k'] for s in config['data'][0]['segs'][format]]
1328 except (UnicodeDecodeError, ValueError, KeyError):
1329 raise ExtractorError(u'Unable to extract info section')
1332 sid = self._gen_sid()
1333 fileid = self._get_file_id(fileid, seed)
1335 #column 8,9 of fileid represent the segment number
1336 #fileid[7:9] should be changed
1337 for index, key in enumerate(keys):
# Splice the 2-hex-digit segment index into positions 8-9 of the file id.
1339 temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
1340 download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)
1343 'id': '%s_part%02d' % (video_id, index),
1344 'url': download_url,
1346 'upload_date': None,
1347 'title': video_title,
1350 files_info.append(info)
# Extractor for video.xnxx.com: pulls the flv URL, title and thumbnail out of
# the page with three class-level regexes.
# NOTE(review): excerpt appears subsampled (embedded line numbers skip), so
# the "if mobj is None:" guard and the final return are not visible here.
1355 class XNXXIE(InfoExtractor):
1356 """Information extractor for xnxx.com"""
1358 _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
1360 VIDEO_URL_RE = r'flv_url=(.*?)&'
1361 VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
1362 VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&'
1364 def _real_extract(self, url):
1365 mobj = re.match(self._VALID_URL, url)
1367 raise ExtractorError(u'Invalid URL: %s' % url)
1368 video_id = mobj.group(1)
1370 # Get webpage content
1371 webpage = self._download_webpage(url, video_id)
# The flv URL is percent-encoded inside the page; unquote before use.
1373 video_url = self._search_regex(self.VIDEO_URL_RE,
1374 webpage, u'video URL')
1375 video_url = compat_urllib_parse.unquote(video_url)
1377 video_title = self._html_search_regex(self.VIDEO_TITLE_RE,
1380 video_thumbnail = self._search_regex(self.VIDEO_THUMB_RE,
1381 webpage, u'thumbnail', fatal=False)
1387 'upload_date': None,
1388 'title': video_title,
1390 'thumbnail': video_thumbnail,
1391 'description': None,
# Extractor for plus.google.com posts: scrapes the post page for metadata,
# then a second "video page" for the actual googlevideo redirector links.
# NOTE(review): excerpt appears subsampled (embedded line numbers skip), so
# guards, a "try:" opener and the final return are not visible here.
1395 class GooglePlusIE(InfoExtractor):
1396 """Information extractor for plus.google.com."""
1398 _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
1399 IE_NAME = u'plus.google'
1401 def _real_extract(self, url):
1402 # Extract id from URL
1403 mobj = re.match(self._VALID_URL, url)
1405 raise ExtractorError(u'Invalid URL: %s' % url)
1407 post_url = mobj.group(0)
1408 video_id = mobj.group(1)
1410 video_extension = 'flv'
1412 # Step 1, Retrieve post webpage to extract further information
1413 webpage = self._download_webpage(post_url, video_id, u'Downloading entry webpage')
1415 self.report_extraction(video_id)
1417 # Extract update date
1418 upload_date = self._html_search_regex('title="Timestamp">(.*?)</a>',
1419 webpage, u'upload date', fatal=False)
1421 # Convert timestring to a format suitable for filename
# NOTE(review): strptime here assumes the scraped date is "%Y-%m-%d";
# if upload_date is None (fatal=False above) this would raise — the
# guarding line is presumably among the lines missing from this excerpt.
1422 upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
1423 upload_date = upload_date.strftime('%Y%m%d')
1426 uploader = self._html_search_regex(r'rel\="author".*?>(.*?)</a>',
1427 webpage, u'uploader', fatal=False)
1430 # Get the first line for title
1431 video_title = self._html_search_regex(r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]',
1432 webpage, 'title', default=u'NA')
1434 # Step 2, Stimulate clicking the image box to launch video
1435 video_page = self._search_regex('"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]',
1436 webpage, u'video page URL')
1437 webpage = self._download_webpage(video_page, video_id, u'Downloading video page')
1439 # Extract video links on video page
1440 """Extract video links of all sizes"""
1441 pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
1442 mobj = re.findall(pattern, webpage)
1444 raise ExtractorError(u'Unable to extract video links')
1446 # Sort in resolution
1447 links = sorted(mobj)
1449 # Choose the lowest of the sort, i.e. highest resolution
1450 video_url = links[-1]
1451 # Only get the url. The resolution part in the tuple has no use anymore
1452 video_url = video_url[-1]
1453 # Treat escaped \u0026 style hex
# Python 2 str has .decode; on Python 3 the AttributeError branch re-decodes
# via a bytes round-trip to apply unicode-escape.
1455 video_url = video_url.decode("unicode_escape")
1456 except AttributeError: # Python 3
1457 video_url = bytes(video_url, 'ascii').decode('unicode-escape')
1463 'uploader': uploader,
1464 'upload_date': upload_date,
1465 'title': video_title,
1466 'ext': video_extension,
# Extractor for nba.com videos: the media URL is constructed directly from the
# URL path (no page parsing needed for the stream itself).
# NOTE(review): excerpt appears subsampled (embedded line numbers skip); the
# "if mobj is None:" guard and the final return are not visible here.
1469 class NBAIE(InfoExtractor):
1470 _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*?)(?:/index\.html)?(?:\?.*)?$'
1473 def _real_extract(self, url):
1474 mobj = re.match(self._VALID_URL, url)
1476 raise ExtractorError(u'Invalid URL: %s' % url)
1478 video_id = mobj.group(1)
1480 webpage = self._download_webpage(url, video_id)
# video_id starts with "/" (captured including the slash), so this forms
# .../nba/big/<path>_nba_1280x720.mp4 on the CDN.
1482 video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'
1484 shortened_video_id = video_id.rpartition('/')[2]
1485 title = self._html_search_regex(r'<meta property="og:title" content="(.*?)"',
1486 webpage, 'title', default=shortened_video_id).replace('NBA.com: ', '')
1488 # It isn't there in the HTML it returns to us
1489 # uploader_date = self._html_search_regex(r'<b>Date:</b> (.*?)</div>', webpage, 'upload_date', fatal=False)
1491 description = self._html_search_regex(r'<meta name="description" (?:content|value)="(.*?)" />', webpage, 'description', fatal=False)
1494 'id': shortened_video_id,
1498 # 'uploader_date': uploader_date,
1499 'description': description,
# Extractor for justin.tv / twitch.tv: three URL shapes — a channel archive
# (paged JSON API), a chapter (XML + kraken JSON APIs), or a single broadcast.
# NOTE(review): excerpt appears subsampled (embedded line numbers skip), so
# guards, loop headers ("for offset in ..."), "else:" lines and returns are
# not visible here; verify against the complete file.
1503 class JustinTVIE(InfoExtractor):
1504 """Information extractor for justin.tv and twitch.tv"""
1505 # TODO: One broadcast may be split into multiple videos. The key
1506 # 'broadcast_id' is the same for all parts, and 'broadcast_part'
1507 # starts at 1 and increases. Can we treat all parts as one video?
1509 _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
1511 (?P<channelid>[^/]+)|
1512 (?:(?:[^/]+)/b/(?P<videoid>[^/]+))|
1513 (?:(?:[^/]+)/c/(?P<chapterid>[^/]+))
1517 _JUSTIN_PAGE_LIMIT = 100
1518 IE_NAME = u'justin.tv'
1520 def report_download_page(self, channel, offset):
1521 """Report attempt to download a single page of videos."""
1522 self.to_screen(u'%s: Downloading video information from %d to %d' %
1523 (channel, offset, offset + self._JUSTIN_PAGE_LIMIT))
1525 # Return count of items, list of *valid* items
1526 def _parse_page(self, url, video_id):
1527 webpage = self._download_webpage(url, video_id,
1528 u'Downloading video info JSON',
1529 u'unable to download video info JSON')
# The API returns a JSON list on success; anything else carries an error.
1531 response = json.loads(webpage)
1532 if type(response) != list:
1533 error_text = response.get('error', 'unknown error')
1534 raise ExtractorError(u'Justin.tv API: %s' % error_text)
1536 for clip in response:
1537 video_url = clip['video_file_url']
1539 video_extension = os.path.splitext(video_url)[1][1:]
# start_time is ISO-like; strip dashes from the date part -> YYYYMMDD.
1540 video_date = re.sub('-', '', clip['start_time'][:10])
1541 video_uploader_id = clip.get('user_id', clip.get('channel_id'))
1542 video_id = clip['id']
1543 video_title = clip.get('title', video_id)
1547 'title': video_title,
1548 'uploader': clip.get('channel_name', video_uploader_id),
1549 'uploader_id': video_uploader_id,
1550 'upload_date': video_date,
1551 'ext': video_extension,
1553 return (len(response), info)
1555 def _real_extract(self, url):
1556 mobj = re.match(self._VALID_URL, url)
1558 raise ExtractorError(u'invalid URL: %s' % url)
1560 api_base = 'http://api.justin.tv'
1562 if mobj.group('channelid'):
1564 video_id = mobj.group('channelid')
1565 api = api_base + '/channel/archives/%s.json' % video_id
1566 elif mobj.group('chapterid'):
1567 chapter_id = mobj.group('chapterid')
# Resolve the chapter to its parent archive id from the chapter page.
1569 webpage = self._download_webpage(url, chapter_id)
1570 m = re.search(r'PP\.archive_id = "([0-9]+)";', webpage)
1572 raise ExtractorError(u'Cannot find archive of a chapter')
1573 archive_id = m.group(1)
1575 api = api_base + '/broadcast/by_chapter/%s.xml' % chapter_id
1576 chapter_info_xml = self._download_webpage(api, chapter_id,
1577 note=u'Downloading chapter information',
1578 errnote=u'Chapter information download failed')
1579 doc = xml.etree.ElementTree.fromstring(chapter_info_xml)
1580 for a in doc.findall('.//archive'):
1581 if archive_id == a.find('./id').text:
1584 raise ExtractorError(u'Could not find chapter in chapter information')
1586 video_url = a.find('./video_file_url').text
1587 video_ext = video_url.rpartition('.')[2] or u'flv'
# Title/thumbnail/uploader come from the newer twitch kraken API.
1589 chapter_api_url = u'https://api.twitch.tv/kraken/videos/c' + chapter_id
1590 chapter_info_json = self._download_webpage(chapter_api_url, u'c' + chapter_id,
1591 note='Downloading chapter metadata',
1592 errnote='Download of chapter metadata failed')
1593 chapter_info = json.loads(chapter_info_json)
1595 bracket_start = int(doc.find('.//bracket_start').text)
1596 bracket_end = int(doc.find('.//bracket_end').text)
1598 # TODO determine start (and probably fix up file)
1599 # youtube-dl -v http://www.twitch.tv/firmbelief/c/1757457
1600 #video_url += u'?start=' + TODO:start_timestamp
1601 # bracket_start is 13290, but we want 51670615
1602 self._downloader.report_warning(u'Chapter detected, but we can just download the whole file. '
1603 u'Chapter starts at %s and ends at %s' % (formatSeconds(bracket_start), formatSeconds(bracket_end)))
1606 'id': u'c' + chapter_id,
1609 'title': chapter_info['title'],
1610 'thumbnail': chapter_info['preview'],
1611 'description': chapter_info['description'],
1612 'uploader': chapter_info['channel']['display_name'],
1613 'uploader_id': chapter_info['channel']['name'],
1617 video_id = mobj.group('videoid')
1618 api = api_base + '/broadcast/by_archive/%s.json' % video_id
1620 self.report_extraction(video_id)
# Page through the API; a short page (< limit items) means we're done.
1624 limit = self._JUSTIN_PAGE_LIMIT
1627 self.report_download_page(video_id, offset)
1628 page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
1629 page_count, page_info = self._parse_page(page_url, video_id)
1630 info.extend(page_info)
1631 if not paged or page_count != limit:
# Extractor for funnyordie.com: video URL, title and description are scraped
# straight from the page markup.
# NOTE(review): excerpt appears subsampled (embedded line numbers skip); the
# "if mobj is None:" guard and the final return are not visible here.
1636 class FunnyOrDieIE(InfoExtractor):
1637 _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'
1639 def _real_extract(self, url):
1640 mobj = re.match(self._VALID_URL, url)
1642 raise ExtractorError(u'invalid URL: %s' % url)
1644 video_id = mobj.group('id')
1645 webpage = self._download_webpage(url, video_id)
1647 video_url = self._html_search_regex(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"',
1648 webpage, u'video URL', flags=re.DOTALL)
# Two title patterns are tried in order: player header first, then <title>.
1650 title = self._html_search_regex((r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>",
1651 r'<title>(?P<title>[^<]+?)</title>'), webpage, 'title', flags=re.DOTALL)
1653 video_description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
1654 webpage, u'description', fatal=False, flags=re.DOTALL)
1661 'description': video_description,
# Extractor for store.steampowered.com game/video pages. Handles the age gate
# by re-requesting through the agecheck URL, then zips together the movie
# URLs, titles and thumbnails found in the page into a playlist.
# NOTE(review): excerpt appears subsampled (embedded line numbers skip); some
# intermediate lines (e.g. the videos list initialisation) are not visible.
1665 class SteamIE(InfoExtractor):
1666 _VALID_URL = r"""http://store\.steampowered\.com/
1668 (?P<urltype>video|app)/ #If the page is only for videos or for a game
1670 (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
1672 _VIDEO_PAGE_TEMPLATE = 'http://store.steampowered.com/video/%s/'
1673 _AGECHECK_TEMPLATE = 'http://store.steampowered.com/agecheck/video/%s/?snr=1_agecheck_agecheck__age-gate&ageDay=1&ageMonth=January&ageYear=1970'
# suitable() is overridden because _VALID_URL is written with (?x)-style
# whitespace and must be matched with re.VERBOSE.
1676 def suitable(cls, url):
1677 """Receives a URL and returns True if suitable for this IE."""
1678 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
1680 def _real_extract(self, url):
1681 m = re.match(self._VALID_URL, url, re.VERBOSE)
1682 gameID = m.group('gameID')
1684 videourl = self._VIDEO_PAGE_TEMPLATE % gameID
1685 webpage = self._download_webpage(videourl, gameID)
1687 if re.search('<h2>Please enter your birth date to continue:</h2>', webpage) is not None:
1688 videourl = self._AGECHECK_TEMPLATE % gameID
1689 self.report_age_confirmation()
1690 webpage = self._download_webpage(videourl, gameID)
1692 self.report_extraction(gameID)
1693 game_title = self._html_search_regex(r'<h2 class="pageheader">(.*?)</h2>',
1694 webpage, 'game title')
1696 urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
1697 mweb = re.finditer(urlRE, webpage)
1698 namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
1699 titles = re.finditer(namesRE, webpage)
1700 thumbsRE = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'
1701 thumbs = re.finditer(thumbsRE, webpage)
# The three iterators are assumed to line up positionally per video.
1703 for vid,vtitle,thumb in zip(mweb,titles,thumbs):
1704 video_id = vid.group('videoID')
1705 title = vtitle.group('videoName')
1706 video_url = vid.group('videoURL')
1707 video_thumb = thumb.group('thumbnail')
1709 raise ExtractorError(u'Cannot find video url for %s' % video_id)
1714 'title': unescapeHTML(title),
1715 'thumbnail': video_thumb
1718 return [self.playlist_result(videos, gameID, game_title)]
# Extractor for ustream.tv recorded videos: the media URL is derived directly
# from the video id; title/uploader/thumbnail are scraped from the page.
# NOTE(review): excerpt appears subsampled (embedded line numbers skip); the
# final return is not visible here.
1720 class UstreamIE(InfoExtractor):
1721 _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
1722 IE_NAME = u'ustream'
1724 def _real_extract(self, url):
1725 m = re.match(self._VALID_URL, url)
1726 video_id = m.group('videoID')
1728 video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
1729 webpage = self._download_webpage(url, video_id)
1731 self.report_extraction(video_id)
1733 video_title = self._html_search_regex(r'data-title="(?P<title>.+)"',
1736 uploader = self._html_search_regex(r'data-content-type="channel".*?>(?P<uploader>.*?)</a>',
1737 webpage, u'uploader', fatal=False, flags=re.DOTALL)
1739 thumbnail = self._html_search_regex(r'<link rel="image_src" href="(?P<thumb>.*?)"',
1740 webpage, u'thumbnail', fatal=False)
1746 'title': video_title,
1747 'uploader': uploader,
1748 'thumbnail': thumbnail,
# Extractor for worldstarhiphop.com / worldstarcandy.com: pulls the file URL
# out of a flash-player addVariable call, then fixes up the title for
# "candy" pages which use a different title element.
# NOTE(review): excerpt appears subsampled (embedded line numbers skip);
# the ext selection branch after the 'mp4' check and the final return are
# not visible here.
1752 class WorldStarHipHopIE(InfoExtractor):
1753 _VALID_URL = r'https?://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
1754 IE_NAME = u'WorldStarHipHop'
1756 def _real_extract(self, url):
1757 m = re.match(self._VALID_URL, url)
1758 video_id = m.group('id')
1760 webpage_src = self._download_webpage(url, video_id)
1762 video_url = self._search_regex(r'so\.addVariable\("file","(.*?)"\)',
1763 webpage_src, u'video URL')
1765 if 'mp4' in video_url:
1770 video_title = self._html_search_regex(r"<title>(.*)</title>",
1771 webpage_src, u'title')
1773 # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video.
1774 thumbnail = self._html_search_regex(r'rel="image_src" href="(.*)" />',
1775 webpage_src, u'thumbnail', fatal=False)
# Candy pages keep the real title in a candytitles span instead of <title>.
1778 _title = r"""candytitles.*>(.*)</span>"""
1779 mobj = re.search(_title, webpage_src)
1780 if mobj is not None:
1781 video_title = mobj.group(1)
1786 'title' : video_title,
1787 'thumbnail' : thumbnail,
# Extractor for rbmaradio.com shows: metadata is embedded in the page as a
# JSON assignment to window.gon.show; the stream URL is the akamai_url field.
# NOTE(review): excerpt appears subsampled (embedded line numbers skip); the
# "try:" opener before json.loads and the final return are not visible here.
1792 class RBMARadioIE(InfoExtractor):
1793 _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'
1795 def _real_extract(self, url):
1796 m = re.match(self._VALID_URL, url)
1797 video_id = m.group('videoID')
1799 webpage = self._download_webpage(url, video_id)
1801 json_data = self._search_regex(r'window\.gon.*?gon\.show=(.+?);$',
1802 webpage, u'json data', flags=re.MULTILINE)
1805 data = json.loads(json_data)
1806 except ValueError as e:
1807 raise ExtractorError(u'Invalid JSON: ' + str(e))
# Force the 256kbps variant; extension is taken from the URL path.
1809 video_url = data['akamai_url'] + '&cbr=256'
1810 url_parts = compat_urllib_parse_urlparse(video_url)
1811 video_ext = url_parts.path.rpartition('.')[2]
1816 'title': data['title'],
1817 'description': data.get('teaser_text'),
1818 'location': data.get('country_of_origin'),
1819 'uploader': data.get('host', {}).get('name'),
1820 'uploader_id': data.get('host', {}).get('slug'),
1821 'thumbnail': data.get('image', {}).get('large_url_2x'),
1822 'duration': data.get('duration'),
# Extractor for youporn.com: parses a JSON blob from the page for metadata,
# scrapes the download list for per-resolution links, and applies the user's
# requested format (best/worst/all/specific).
# NOTE(review): excerpt appears subsampled (embedded line numbers skip), so
# several lines (loop headers over x/links, "try:" openers, size/bitrate
# parsing, returns) are not visible here; verify against the complete file.
1827 class YouPornIE(InfoExtractor):
1828 """Information extractor for youporn.com."""
1829 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'
1831 def _print_formats(self, formats):
1832 """Print all available formats"""
1833 print(u'Available formats:')
1834 print(u'ext\t\tformat')
1835 print(u'---------------------------------')
1836 for format in formats:
1837 print(u'%s\t\t%s' % (format['ext'], format['format']))
# Select the single entry matching req_format from the formats list.
1839 def _specific(self, req_format, formats):
1841 if(x["format"]==req_format):
1845 def _real_extract(self, url):
1846 mobj = re.match(self._VALID_URL, url)
1848 raise ExtractorError(u'Invalid URL: %s' % url)
1849 video_id = mobj.group('videoid')
# The site gates content behind an age check; the cookie bypasses it.
1851 req = compat_urllib_request.Request(url)
1852 req.add_header('Cookie', 'age_verified=1')
1853 webpage = self._download_webpage(req, video_id)
1855 # Get JSON parameters
1856 json_params = self._search_regex(r'var currentVideo = new Video\((.*)\);', webpage, u'JSON parameters')
1858 params = json.loads(json_params)
1860 raise ExtractorError(u'Invalid JSON')
1862 self.report_extraction(video_id)
1864 video_title = params['title']
1865 upload_date = unified_strdate(params['release_date_f'])
1866 video_description = params['description']
1867 video_uploader = params['submitted_by']
1868 thumbnail = params['thumbnails'][0]['image']
1870 raise ExtractorError('Missing JSON parameter: ' + sys.exc_info()[1])
1872 # Get all of the formats available
1873 DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
1874 download_list_html = self._search_regex(DOWNLOAD_LIST_RE,
1875 webpage, u'download list').strip()
1877 # Get all of the links from the page
1878 LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
1879 links = re.findall(LINK_RE, download_list_html)
1880 if(len(links) == 0):
1881 raise ExtractorError(u'ERROR: no known formats available for video')
1883 self.to_screen(u'Links found: %d' % len(links))
1888 # A link looks like this:
1889 # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
1890 # A path looks like this:
1891 # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
# Format id like "480p-370k" is recovered from the 5th path segment.
1892 video_url = unescapeHTML( link )
1893 path = compat_urllib_parse_urlparse( video_url ).path
1894 extension = os.path.splitext( path )[1][1:]
1895 format = path.split('/')[4].split('_')[:2]
1898 format = "-".join( format )
1899 # title = u'%s-%s-%s' % (video_title, size, bitrate)
1904 'uploader': video_uploader,
1905 'upload_date': upload_date,
1906 'title': video_title,
1909 'thumbnail': thumbnail,
1910 'description': video_description
1913 if self._downloader.params.get('listformats', None):
1914 self._print_formats(formats)
1917 req_format = self._downloader.params.get('format', None)
1918 self.to_screen(u'Format: %s' % req_format)
# Formats are assumed ordered best-first: [0] is best, [-1] is worst.
1920 if req_format is None or req_format == 'best':
1922 elif req_format == 'worst':
1923 return [formats[-1]]
1924 elif req_format in ('-1', 'all'):
1927 format = self._specific( req_format, formats )
1929 raise ExtractorError(u'Requested format not available')
# Extractor for pornotube.com: flv URL and upload date are scraped from the
# page; title and ids come from the URL itself.
# NOTE(review): excerpt appears subsampled (embedded line numbers skip); the
# "if mobj is None:" guard and the final return are not visible here.
1934 class PornotubeIE(InfoExtractor):
1935 """Information extractor for pornotube.com."""
1936 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'
1938 def _real_extract(self, url):
1939 mobj = re.match(self._VALID_URL, url)
1941 raise ExtractorError(u'Invalid URL: %s' % url)
1943 video_id = mobj.group('videoid')
1944 video_title = mobj.group('title')
1946 # Get webpage content
1947 webpage = self._download_webpage(url, video_id)
1950 VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
1951 video_url = self._search_regex(VIDEO_URL_RE, webpage, u'video url')
1952 video_url = compat_urllib_parse.unquote(video_url)
1954 #Get the uploaded date
1955 VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
1956 upload_date = self._html_search_regex(VIDEO_UPLOADED_RE, webpage, u'upload date', fatal=False)
# Date is optional (fatal=False); normalize only if it was found.
1957 if upload_date: upload_date = unified_strdate(upload_date)
1959 info = {'id': video_id,
1962 'upload_date': upload_date,
1963 'title': video_title,
# Extractor for youjizz.com: resolves the embed page referenced by the video
# page, then pulls the media URL from the embedded flash-player setup.
# NOTE(review): excerpt appears subsampled (embedded line numbers skip); the
# "if result is None:" style guard and the final return are not visible here.
1969 class YouJizzIE(InfoExtractor):
1970 """Information extractor for youjizz.com."""
1971 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'
1973 def _real_extract(self, url):
1974 mobj = re.match(self._VALID_URL, url)
1976 raise ExtractorError(u'Invalid URL: %s' % url)
1978 video_id = mobj.group('videoid')
1980 # Get webpage content
1981 webpage = self._download_webpage(url, video_id)
1983 # Get the video title
1984 video_title = self._html_search_regex(r'<title>(?P<title>.*)</title>',
1985 webpage, u'title').strip()
1987 # Get the embed page
1988 result = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
1990 raise ExtractorError(u'ERROR: unable to extract embed page')
# video_id is rebound to the numeric embed id from here on.
1992 embed_page_url = result.group(0).strip()
1993 video_id = result.group('videoid')
1995 webpage = self._download_webpage(embed_page_url, video_id)
1998 video_url = self._search_regex(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);',
1999 webpage, u'video URL')
2001 info = {'id': video_id,
2003 'title': video_title,
2006 'player_url': embed_page_url}
# Extractor for 8tracks.com mixes: reads the PAGE.mix JSON from the page, then
# walks the play/next API one track at a time until at_last_track is set.
# NOTE(review): excerpt appears subsampled (embedded line numbers skip);
# mix_id assignment, the res list initialisation, the break and the final
# return are not visible here; verify against the complete file.
2010 class EightTracksIE(InfoExtractor):
2012 _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'
2014 def _real_extract(self, url):
2015 mobj = re.match(self._VALID_URL, url)
2017 raise ExtractorError(u'Invalid URL: %s' % url)
2018 playlist_id = mobj.group('id')
2020 webpage = self._download_webpage(url, playlist_id)
2022 json_like = self._search_regex(r"PAGE.mix = (.*?);\n", webpage, u'trax information', flags=re.DOTALL)
2023 data = json.loads(json_like)
# A random session token is required by the play API.
2025 session = str(random.randint(0, 1000000000))
2027 track_count = data['tracks_count']
2028 first_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
2029 next_url = first_url
2031 for i in itertools.count():
2032 api_json = self._download_webpage(next_url, playlist_id,
2033 note=u'Downloading song information %s/%s' % (str(i+1), track_count),
2034 errnote=u'Failed to download song information')
2035 api_data = json.loads(api_json)
2036 track_data = api_data[u'set']['track']
2038 'id': track_data['id'],
2039 'url': track_data['track_file_stream_url'],
2040 'title': track_data['performer'] + u' - ' + track_data['name'],
2041 'raw_title': track_data['name'],
2042 'uploader_id': data['user']['login'],
# Stop once the API flags the last track; otherwise chain to /next.
2046 if api_data['set']['at_last_track']:
2048 next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
# Extractor for keek.com: media and thumbnail URLs are derived directly from
# the video id on the CDN; title/uploader are scraped from the page.
# NOTE(review): excerpt appears subsampled (embedded line numbers skip); the
# final return is not visible here.
2051 class KeekIE(InfoExtractor):
2052 _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
2055 def _real_extract(self, url):
2056 m = re.match(self._VALID_URL, url)
2057 video_id = m.group('videoID')
2059 video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
2060 thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id
2061 webpage = self._download_webpage(url, video_id)
2063 video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
2066 uploader = self._html_search_regex(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>',
2067 webpage, u'uploader', fatal=False)
2073 'title': video_title,
2074 'thumbnail': thumbnail,
2075 'uploader': uploader
# Extractor for ted.com: either a single talk (returns one info dict) or a
# playlist page (returns a playlist of TED url_results).
# NOTE(review): excerpt appears subsampled (embedded line numbers skip);
# the "else:" for the playlist branch and the _talk_info return are not
# visible here; verify against the complete file.
2079 class TEDIE(InfoExtractor):
2080 _VALID_URL=r'''http://www\.ted\.com/
2082 ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
2084 ((?P<type_talk>talks)) # We have a simple talk
2086 (/lang/(.*?))? # The url may contain the language
2087 /(?P<name>\w+) # Here goes the name and then ".html"
# Overridden because _VALID_URL uses verbose-mode whitespace/comments.
2091 def suitable(cls, url):
2092 """Receives a URL and returns True if suitable for this IE."""
2093 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
2095 def _real_extract(self, url):
2096 m=re.match(self._VALID_URL, url, re.VERBOSE)
2097 if m.group('type_talk'):
2098 return [self._talk_info(url)]
2100 playlist_id=m.group('playlist_id')
2101 name=m.group('name')
2102 self.to_screen(u'Getting info of playlist %s: "%s"' % (playlist_id,name))
2103 return [self._playlist_videos_info(url,name,playlist_id)]
2105 def _playlist_videos_info(self,url,name,playlist_id=0):
2106 '''Returns the videos of the playlist'''
2108 <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
2109 ([.\s]*?)data-playlist_item_id="(\d+)"
2110 ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
2112 video_name_RE=r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
2113 webpage=self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
2114 m_videos=re.finditer(video_RE,webpage,re.VERBOSE)
2115 m_names=re.finditer(video_name_RE,webpage)
2117 playlist_title = self._html_search_regex(r'div class="headline">\s*?<h1>\s*?<span>(.*?)</span>',
2118 webpage, 'playlist title')
# Each playlist entry is deferred to the TED extractor via url_result.
2120 playlist_entries = []
2121 for m_video, m_name in zip(m_videos,m_names):
2122 video_id=m_video.group('video_id')
2123 talk_url='http://www.ted.com%s' % m_name.group('talk_url')
2124 playlist_entries.append(self.url_result(talk_url, 'TED'))
2125 return self.playlist_result(playlist_entries, playlist_id = playlist_id, playlist_title = playlist_title)
2127 def _talk_info(self, url, video_id=0):
2128 """Return the video for the talk in the url"""
2129 m = re.match(self._VALID_URL, url,re.VERBOSE)
2130 video_name = m.group('name')
2131 webpage = self._download_webpage(url, video_id, 'Downloading \"%s\" page' % video_name)
2132 self.report_extraction(video_name)
2133 # If the url includes the language we get the title translated
2134 title = self._html_search_regex(r'<span id="altHeadline" >(?P<title>.*)</span>',
2136 json_data = self._search_regex(r'<script.*?>var talkDetails = ({.*?})</script>',
2137 webpage, 'json data')
2138 info = json.loads(json_data)
2139 desc = self._html_search_regex(r'<div class="talk-intro">.*?<p.*?>(.*?)</p>',
2140 webpage, 'description', flags = re.DOTALL)
2142 thumbnail = self._search_regex(r'</span>[\s.]*</div>[\s.]*<img src="(.*?)"',
2143 webpage, 'thumbnail')
# The last htmlStreams entry is used as the stream URL.
2146 'url': info['htmlStreams'][-1]['file'],
2149 'thumbnail': thumbnail,
2150 'description': desc,
2154 class MySpassIE(InfoExtractor):
# Extractor for myspass.de: derives the video id from the URL path, fetches
# an XML metadata document for it, and reads url/title/format/description/
# thumbnail fields out of that XML.
2155 _VALID_URL = r'http://www.myspass.de/.*'
2157 def _real_extract(self, url):
2158 META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'
2160 # video id is the last path element of the URL
2161 # usually there is a trailing slash, so also try the second but last
2162 url_path = compat_urllib_parse_urlparse(url).path
2163 url_parent_path, video_id = os.path.split(url_path)
# NOTE(review): the condition guarding this fallback (original line 2164,
# presumably an emptiness check on video_id) is missing from this chunk.
2165 _, video_id = os.path.split(url_parent_path)
# Fetch and parse the metadata XML for the resolved video id.
2168 metadata_url = META_DATA_URL_TEMPLATE % video_id
2169 metadata_text = self._download_webpage(metadata_url, video_id)
2170 metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))
2172 # extract values from metadata
2173 url_flv_el = metadata.find('url_flv')
2174 if url_flv_el is None:
2175 raise ExtractorError(u'Unable to extract download url')
2176 video_url = url_flv_el.text
# File extension is taken from the download URL's suffix (dot stripped).
2177 extension = os.path.splitext(video_url)[1][1:]
2178 title_el = metadata.find('title')
2179 if title_el is None:
2180 raise ExtractorError(u'Unable to extract title')
2181 title = title_el.text
2182 format_id_el = metadata.find('format_id')
2183 if format_id_el is None:
# NOTE(review): the body of the `is None` branch (original lines 2184-2185)
# is missing from this chunk; the line below is the else path.
2186 format = format_id_el.text
2187 description_el = metadata.find('description')
2188 if description_el is not None:
2189 description = description_el.text
# NOTE(review): else-branches supplying defaults (original lines 2190-2191)
# are missing here.
2192 imagePreview_el = metadata.find('imagePreview')
2193 if imagePreview_el is not None:
2194 thumbnail = imagePreview_el.text
# NOTE(review): the info-dict opening and its first entries (original lines
# 2195-2202) plus the return statement are missing from this chunk.
2203 'thumbnail': thumbnail,
2204 'description': description
2208 class SpiegelIE(InfoExtractor):
# Extractor for spiegel.de videos: reads the numeric video id from the URL,
# downloads a per-video XML manifest and takes the last format entry from it.
2209 _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'
2211 def _real_extract(self, url):
2212 m = re.match(self._VALID_URL, url)
2213 video_id = m.group('videoID')
2215 webpage = self._download_webpage(url, video_id)
2217 video_title = self._html_search_regex(r'<div class="module-title">(.*?)</div>',
# NOTE(review): the trailing argument line of the call above (original line
# 2218, presumably `webpage, u'title')`) is missing from this chunk.
# The flash XML manifest lists the available files for this video id.
2220 xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
2221 xml_code = self._download_webpage(xml_url, video_id,
2222 note=u'Downloading XML', errnote=u'Failed to download XML')
2224 idoc = xml.etree.ElementTree.fromstring(xml_code)
# The last child of the document is used as the chosen format/type entry.
2225 last_type = idoc[-1]
2226 filename = last_type.findall('./filename')[0].text
2227 duration = float(last_type.findall('./duration')[0].text)
2229 video_url = 'http://video2.spiegel.de/flash/' + filename
# Extension is whatever follows the final dot of the manifest filename.
2230 video_ext = filename.rpartition('.')[2]
# NOTE(review): the info-dict opening/first entries (original lines
# 2231-2234) and the return (2237-2239) are missing from this chunk.
2235 'title': video_title,
2236 'duration': duration,
2240 class LiveLeakIE(InfoExtractor):
# Extractor for liveleak.com: scrapes the player's `file: "..."` URL and
# OpenGraph title/description metadata from the view page.
2242 _VALID_URL = r'^(?:http?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
2243 IE_NAME = u'liveleak'
2245 def _real_extract(self, url):
2246 mobj = re.match(self._VALID_URL, url)
# NOTE(review): the `if mobj is None:` guard (original line 2247) is
# missing from this chunk — in the full file the raise below is conditional.
2248 raise ExtractorError(u'Invalid URL: %s' % url)
2250 video_id = mobj.group('video_id')
2252 webpage = self._download_webpage(url, video_id)
2254 video_url = self._search_regex(r'file: "(.*?)",',
2255 webpage, u'video URL')
# Strip the site prefix LiveLeak prepends to og:title values.
2257 video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
2258 webpage, u'title').replace('LiveLeak.com -', '').strip()
2260 video_description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
2261 webpage, u'description', fatal=False)
2263 video_uploader = self._html_search_regex(r'By:.*?(\w+)</a>',
2264 webpage, u'uploader', fatal=False)
# NOTE(review): the info-dict opening and id/url/ext entries (original
# lines 2265-2269) and the return are missing from this chunk.
2270 'title': video_title,
2271 'description': video_description,
2272 'uploader': video_uploader
2279 class TumblrIE(InfoExtractor):
# Extractor for Tumblr video posts: rebuilds the canonical post URL from the
# blog name and post id, then pulls the escaped video_file URL out of the page.
2280 _VALID_URL = r'http://(?P<blog_name>.*?)\.tumblr\.com/((post)|(video))/(?P<id>\d*)/(.*?)'
2282 def _real_extract(self, url):
2283 m_url = re.match(self._VALID_URL, url)
2284 video_id = m_url.group('id')
2285 blog = m_url.group('blog_name')
2287 url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id)
2288 webpage = self._download_webpage(url, video_id)
# The page embeds the player markup JS-escaped, hence the \x22 (= '"')
# sequences in this pattern.
2290 re_video = r'src=\\x22(?P<video_url>http://%s\.tumblr\.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P<ext>.*?)\\x22' % (blog, video_id)
2291 video = re.search(re_video, webpage)
# NOTE(review): the `if video is None:` guard (original line 2292) is
# missing from this chunk — the raise below is conditional in the full file.
2293 raise ExtractorError(u'Unable to extract video')
2294 video_url = video.group('video_url')
2295 ext = video.group('ext')
2297 video_thumbnail = self._search_regex(r'posters(.*?)\[\\x22(?P<thumb>.*?)\\x22',
2298 webpage, u'thumbnail', fatal=False) # We pick the first poster
# Thumbnail URL comes out JS-escaped too; drop the backslashes.
2299 if video_thumbnail: video_thumbnail = video_thumbnail.replace('\\', '')
2301 # The only place where you can get a title, it's not complete,
2302 # but searching in other places doesn't work for all videos
2303 video_title = self._html_search_regex(r'<title>(?P<title>.*?)</title>',
2304 webpage, u'title', flags=re.DOTALL)
2306 return [{'id': video_id,
# NOTE(review): the 'url'/'ext' entries of this dict (original line 2307 and
# following) are partly missing from this chunk.
2308 'title': video_title,
2309 'thumbnail': video_thumbnail,
2313 class BandcampIE(InfoExtractor):
# Extractor for free Bandcamp tracks: follows the freeDownloadPage link,
# reads the track's JSON blob there, and rebuilds the statdownload URL that
# yields the final (non-expired) mp3-320 link.
2314 _VALID_URL = r'http://.*?\.bandcamp\.com/track/(?P<title>.*)'
2316 def _real_extract(self, url):
2317 mobj = re.match(self._VALID_URL, url)
2318 title = mobj.group('title')
2319 webpage = self._download_webpage(url, title)
2320 # We get the link to the free download page
2321 m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage)
2322 if m_download is None:
2323 raise ExtractorError(u'No free songs found')
2325 download_link = m_download.group(1)
# Track id is embedded in the inline TralbumData javascript object.
2326 id = re.search(r'var TralbumData = {(.*?)id: (?P<id>\d*?)$',
2327 webpage, re.MULTILINE|re.DOTALL).group('id')
2329 download_webpage = self._download_webpage(download_link, id,
2330 'Downloading free downloads page')
2331 # We get the dictionary of the track from some javascrip code
2332 info = re.search(r'items: (.*?),$',
2333 download_webpage, re.MULTILINE).group(1)
2334 info = json.loads(info)[0]
2335 # We pick mp3-320 for now, until format selection can be easily implemented.
2336 mp3_info = info[u'downloads'][u'mp3-320']
2337 # If we try to use this url it says the link has expired
2338 initial_url = mp3_info[u'url']
# Pick apart the expired URL to recover server/fsig/ts for the rebuild.
2339 re_url = r'(?P<server>http://(.*?)\.bandcamp\.com)/download/track\?enc=mp3-320&fsig=(?P<fsig>.*?)&id=(?P<id>.*?)&ts=(?P<ts>.*)$'
2340 m_url = re.match(re_url, initial_url)
2341 #We build the url we will use to get the final track url
2342 # This url is build in Bandcamp in the script download_bunde_*.js
2343 request_url = '%s/statdownload/track?enc=mp3-320&fsig=%s&id=%s&ts=%s&.rand=665028774616&.vrs=1' % (m_url.group('server'), m_url.group('fsig'), id, m_url.group('ts'))
2344 final_url_webpage = self._download_webpage(request_url, id, 'Requesting download url')
2345 # If we could correctly generate the .rand field the url would be
2346 #in the "download_url" key
2347 final_url = re.search(r'"retry_url":"(.*?)"', final_url_webpage).group(1)
2349 track_info = {'id':id,
2350 'title' : info[u'title'],
# NOTE(review): the 'ext'/'url' entries (original lines 2351-2352) and the
# return statement after this dict are missing from this chunk.
2353 'thumbnail' : info[u'thumb_url'],
2354 'uploader' : info[u'artist']
2359 class RedTubeIE(InfoExtractor):
2360 """Information Extractor for redtube"""
2361 _VALID_URL = r'(?:http://)?(?:www\.)?redtube\.com/(?P<id>[0-9]+)'
2363 def _real_extract(self,url):
2364 mobj = re.match(self._VALID_URL, url)
# NOTE(review): the `if mobj is None:` guard (original line 2365) is
# missing from this chunk — the raise below is conditional in the full file.
2366 raise ExtractorError(u'Invalid URL: %s' % url)
2368 video_id = mobj.group('id')
2369 video_extension = 'mp4'
2370 webpage = self._download_webpage(url, video_id)
2372 self.report_extraction(video_id)
# The mp4 source URL sits in a <source> tag on the watch page.
2374 video_url = self._html_search_regex(r'<source src="(.+?)" type="video/mp4">',
2375 webpage, u'video URL')
2377 video_title = self._html_search_regex('<h1 class="videoTitle slidePanelMovable">(.+?)</h1>',
# NOTE(review): the call's trailing argument line and the info-dict opening
# (original lines 2378-2382) are missing from this chunk.
2383 'ext': video_extension,
2384 'title': video_title,
2387 class InaIE(InfoExtractor):
2388 """Information Extractor for Ina.fr"""
2389 _VALID_URL = r'(?:http://)?(?:www\.)?ina\.fr/video/(?P<id>I[0-9]+)/.*'
2391 def _real_extract(self,url):
2392 mobj = re.match(self._VALID_URL, url)
# NOTE(review): a guard line (original line 2393) is missing from this
# chunk between the match and the group access.
2394 video_id = mobj.group('id')
# Metadata comes from the player's MRSS feed, not the HTML page itself.
2395 mrss_url='http://player.ina.fr/notices/%s.mrss' % video_id
2396 video_extension = 'mp4'
2397 webpage = self._download_webpage(mrss_url, video_id)
2399 self.report_extraction(video_id)
2401 video_url = self._html_search_regex(r'<media:player url="(?P<mp4url>http://mp4.ina.fr/[^"]+\.mp4)',
2402 webpage, u'video URL')
# Title is wrapped in CDATA inside the feed's <title> element.
2404 video_title = self._search_regex(r'<title><!\[CDATA\[(?P<titre>.*?)]]></title>',
# NOTE(review): the trailing argument line and the info-dict opening
# (original lines 2405-2409) plus the return are missing from this chunk.
2410 'ext': video_extension,
2411 'title': video_title,
2414 class HowcastIE(InfoExtractor):
2415 """Information Extractor for Howcast.com"""
2416 _VALID_URL = r'(?:https?://)?(?:www\.)?howcast\.com/videos/(?P<id>\d+)'
2418 def _real_extract(self, url):
2419 mobj = re.match(self._VALID_URL, url)
# NOTE(review): a guard line (original line 2420) is missing from this
# chunk between the match and the group access.
2421 video_id = mobj.group('id')
# Re-canonicalize the URL from the numeric id before downloading.
2422 webpage_url = 'http://www.howcast.com/videos/' + video_id
2423 webpage = self._download_webpage(webpage_url, video_id)
2425 self.report_extraction(video_id)
# The mobile mp4 URL appears in a JS `file:` assignment on the page.
2427 video_url = self._search_regex(r'\'?file\'?: "(http://mobile-media\.howcast\.com/[0-9]+\.mp4)',
2428 webpage, u'video URL')
2430 video_title = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') property=\'og:title\'',
# NOTE(review): the trailing argument line of the call above (original
# lines 2431-2432) is missing from this chunk.
2433 video_description = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') name=\'description\'',
2434 webpage, u'description', fatal=False)
2436 thumbnail = self._html_search_regex(r'<meta content=\'(.+?)\' property=\'og:image\'',
2437 webpage, u'thumbnail', fatal=False)
# NOTE(review): the info-dict opening with id/url/ext entries (original
# lines 2438-2442) and the return are missing from this chunk.
2443 'title': video_title,
2444 'description': video_description,
2445 'thumbnail': thumbnail,
2448 class VineIE(InfoExtractor):
2449 """Information Extractor for Vine.co"""
2450 _VALID_URL = r'(?:https?://)?(?:www\.)?vine\.co/v/(?P<id>\w+)'
2452 def _real_extract(self, url):
2453 mobj = re.match(self._VALID_URL, url)
# NOTE(review): a guard line (original line 2454) is missing from this
# chunk between the match and the group access.
2455 video_id = mobj.group('id')
2456 webpage_url = 'https://vine.co/v/' + video_id
2457 webpage = self._download_webpage(webpage_url, video_id)
2459 self.report_extraction(video_id)
# The raw stream URL is exposed via the twitter:player:stream meta tag.
2461 video_url = self._html_search_regex(r'<meta property="twitter:player:stream" content="(.+?)"',
2462 webpage, u'video URL')
2464 video_title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',
# NOTE(review): the trailing argument line of the call above (original
# lines 2465-2466) is missing from this chunk.
2467 thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)(\?.*?)?"',
2468 webpage, u'thumbnail', fatal=False)
2470 uploader = self._html_search_regex(r'<div class="user">.*?<h2>(.+?)</h2>',
2471 webpage, u'uploader', fatal=False, flags=re.DOTALL)
# NOTE(review): the info-dict opening with id/url/ext entries (original
# lines 2472-2476) and the return are missing from this chunk.
2477 'title': video_title,
2478 'thumbnail': thumbnail,
2479 'uploader': uploader,
2482 class FlickrIE(InfoExtractor):
2483 """Information Extractor for Flickr videos"""
# Resolves a Flickr video in three hops: the photo page yields a secret,
# a first XML document yields a node id, and a playlist XML yields the
# final STREAM app/fullpath pair that forms the video URL.
2484 _VALID_URL = r'(?:https?://)?(?:www\.)?flickr\.com/photos/(?P<uploader_id>[\w\-_@]+)/(?P<id>\d+).*'
2486 def _real_extract(self, url):
2487 mobj = re.match(self._VALID_URL, url)
# NOTE(review): a guard line (original line 2488) is missing from this
# chunk between the match and the group accesses.
2489 video_id = mobj.group('id')
2490 video_uploader_id = mobj.group('uploader_id')
2491 webpage_url = 'http://www.flickr.com/photos/' + video_uploader_id + '/' + video_id
2492 webpage = self._download_webpage(webpage_url, video_id)
2494 secret = self._search_regex(r"photo_secret: '(\w+)'", webpage, u'secret')
2496 first_url = 'https://secure.flickr.com/apps/video/video_mtl_xml.gne?v=x&photo_id=' + video_id + '&secret=' + secret + '&bitrate=700&target=_self'
2497 first_xml = self._download_webpage(first_url, video_id, 'Downloading first data webpage')
2499 node_id = self._html_search_regex(r'<Item id="id">(\d+-\d+)</Item>',
2500 first_xml, u'node_id')
2502 second_url = 'https://secure.flickr.com/video_playlist.gne?node_id=' + node_id + '&tech=flash&mode=playlist&bitrate=700&secret=' + secret + '&rd=video.yahoo.com&noad=1'
2503 second_xml = self._download_webpage(second_url, video_id, 'Downloading second data webpage')
2505 self.report_extraction(video_id)
2507 mobj = re.search(r'<STREAM APP="(.+?)" FULLPATH="(.+?)"', second_xml)
# NOTE(review): the `if mobj is None:` guard (original line 2508) is
# missing from this chunk — the raise below is conditional in the full file.
2509 raise ExtractorError(u'Unable to extract video url')
# Final URL = APP prefix + HTML-unescaped FULLPATH.
2510 video_url = mobj.group(1) + unescapeHTML(mobj.group(2))
2512 video_title = self._html_search_regex(r'<meta property="og:title" content=(?:"([^"]+)"|\'([^\']+)\')',
2513 webpage, u'video title')
2515 video_description = self._html_search_regex(r'<meta property="og:description" content=(?:"([^"]+)"|\'([^\']+)\')',
2516 webpage, u'description', fatal=False)
2518 thumbnail = self._html_search_regex(r'<meta property="og:image" content=(?:"([^"]+)"|\'([^\']+)\')',
2519 webpage, u'thumbnail', fatal=False)
# NOTE(review): the info-dict opening with id/url/ext entries (original
# lines 2520-2524) and the return are missing from this chunk.
2525 'title': video_title,
2526 'description': video_description,
2527 'thumbnail': thumbnail,
2528 'uploader_id': video_uploader_id,
2531 class TeamcocoIE(InfoExtractor):
# Extractor for teamcoco.com: finds the numeric data-id on the article page,
# then reads the high-quality file URL from the site's CVP XML endpoint.
2532 _VALID_URL = r'http://teamcoco\.com/video/(?P<url_title>.*)'
2534 def _real_extract(self, url):
2535 mobj = re.match(self._VALID_URL, url)
# NOTE(review): the `if mobj is None:` guard (original line 2536) is
# missing from this chunk — the raise below is conditional in the full file.
2537 raise ExtractorError(u'Invalid URL: %s' % url)
2538 url_title = mobj.group('url_title')
2539 webpage = self._download_webpage(url, url_title)
2541 video_id = self._html_search_regex(r'<article class="video" data-id="(\d+?)"',
2542 webpage, u'video id')
2544 self.report_extraction(video_id)
2546 video_title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',
# NOTE(review): the trailing argument line of the call above (original
# lines 2547-2548) is missing from this chunk.
2549 thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)"',
2550 webpage, u'thumbnail', fatal=False)
2552 video_description = self._html_search_regex(r'<meta property="og:description" content="(.*?)"',
2553 webpage, u'description', fatal=False)
2555 data_url = 'http://teamcoco.com/cvp/2.0/%s.xml' % video_id
2556 data = self._download_webpage(data_url, video_id, 'Downloading data webpage')
# Picks the <file type="high"> entry from the CVP XML.
2558 video_url = self._html_search_regex(r'<file type="high".*?>(.*?)</file>',
# NOTE(review): the trailing argument line and the info-dict opening with
# id/url/ext entries (original lines 2559-2564) are missing from this chunk.
2565 'title': video_title,
2566 'thumbnail': thumbnail,
2567 'description': video_description,
2570 class XHamsterIE(InfoExtractor):
2571 """Information Extractor for xHamster"""
2572 _VALID_URL = r'(?:http://)?(?:www.)?xhamster\.com/movies/(?P<id>[0-9]+)/.*\.html'
2574 def _real_extract(self,url):
2575 mobj = re.match(self._VALID_URL, url)
# NOTE(review): a guard line (original line 2576) is missing from this
# chunk between the match and the group access.
2577 video_id = mobj.group('id')
2578 mrss_url = 'http://xhamster.com/movies/%s/.html' % video_id
2579 webpage = self._download_webpage(mrss_url, video_id)
# The player config carries a 'srv' host and a 'file' path/URL.
2581 mobj = re.search(r'\'srv\': \'(?P<server>[^\']*)\',\s*\'file\': \'(?P<file>[^\']+)\',', webpage)
# NOTE(review): the `if mobj is None:` guard (original line 2582) is
# missing from this chunk — the raise below is conditional in the full file.
2583 raise ExtractorError(u'Unable to extract media URL')
# Empty server means 'file' is already a full (percent-encoded) URL;
# otherwise the URL is server + '/key=' + file.
2584 if len(mobj.group('server')) == 0:
2585 video_url = compat_urllib_parse.unquote(mobj.group('file'))
# NOTE(review): the `else:` line (original line 2586) is missing here.
2587 video_url = mobj.group('server')+'/key='+mobj.group('file')
2588 video_extension = video_url.split('.')[-1]
2590 video_title = self._html_search_regex(r'<title>(?P<title>.+?) - xHamster\.com</title>',
# NOTE(review): the trailing argument line of the call above (original
# lines 2591-2592) is missing from this chunk.
2593 # Can't see the description anywhere in the UI
2594 # video_description = self._html_search_regex(r'<span>Description: </span>(?P<description>[^<]+)',
2595 # webpage, u'description', fatal=False)
2596 # if video_description: video_description = unescapeHTML(video_description)
# Upload date is scraped from a tooltip hint and flattened to YYYYMMDD.
2598 mobj = re.search(r'hint=\'(?P<upload_date_Y>[0-9]{4})-(?P<upload_date_m>[0-9]{2})-(?P<upload_date_d>[0-9]{2}) [0-9]{2}:[0-9]{2}:[0-9]{2} [A-Z]{3,4}\'', webpage)
# NOTE(review): the `if mobj:`/`else:` structure around the three lines
# below (original lines 2599/2601) is missing from this chunk.
2600 video_upload_date = mobj.group('upload_date_Y')+mobj.group('upload_date_m')+mobj.group('upload_date_d')
2602 video_upload_date = None
2603 self._downloader.report_warning(u'Unable to extract upload date')
2605 video_uploader_id = self._html_search_regex(r'<a href=\'/user/[^>]+>(?P<uploader_id>[^<]+)',
2606 webpage, u'uploader id', default=u'anonymous')
2608 video_thumbnail = self._search_regex(r'\'image\':\'(?P<thumbnail>[^\']+)\'',
2609 webpage, u'thumbnail', fatal=False)
# NOTE(review): the info-dict opening with id/url entries (original lines
# 2610-2613) and the return are missing from this chunk.
2614 'ext': video_extension,
2615 'title': video_title,
2616 # 'description': video_description,
2617 'upload_date': video_upload_date,
2618 'uploader_id': video_uploader_id,
2619 'thumbnail': video_thumbnail
2622 class HypemIE(InfoExtractor):
2623 """Information Extractor for hypem"""
2624 _VALID_URL = r'(?:http://)?(?:www\.)?hypem\.com/track/([^/]+)/([^/]+)'
2626 def _real_extract(self, url):
2627 mobj = re.match(self._VALID_URL, url)
# NOTE(review): the `if mobj is None:` guard (original line 2628) is
# missing from this chunk — the raise below is conditional in the full file.
2629 raise ExtractorError(u'Invalid URL: %s' % url)
2630 track_id = mobj.group(1)
# Append ax/ts query params; the response's Set-Cookie is needed for the
# later /serve/source request.
2632 data = { 'ax': 1, 'ts': time.time() }
2633 data_encoded = compat_urllib_parse.urlencode(data)
2634 complete_url = url + "?" + data_encoded
2635 request = compat_urllib_request.Request(complete_url)
2636 response, urlh = self._download_webpage_handle(request, track_id, u'Downloading webpage with the url')
2637 cookie = urlh.headers.get('Set-Cookie', '')
2639 self.report_extraction(track_id)
# Track metadata is embedded as JSON in the displayList-data script tag.
2641 html_tracks = self._html_search_regex(r'<script type="application/json" id="displayList-data">(.*?)</script>',
2642 response, u'tracks', flags=re.MULTILINE|re.DOTALL).strip()
# NOTE(review): the try/except structure around the parse below (original
# lines 2643/2646) is missing from this chunk.
2644 track_list = json.loads(html_tracks)
2645 track = track_list[u'tracks'][0]
2647 raise ExtractorError(u'Hypemachine contained invalid JSON.')
# NOTE(review): the line extracting `key` from the track (original line
# 2649-ish) is missing from this chunk; it is used in serve_url below.
2650 track_id = track[u"id"]
2651 artist = track[u"artist"]
2652 title = track[u"song"]
2654 serve_url = "http://hypem.com/serve/source/%s/%s" % (compat_str(track_id), compat_str(key))
2655 request = compat_urllib_request.Request(serve_url, "" , {'Content-Type': 'application/json'})
2656 request.add_header('cookie', cookie)
2657 song_data_json = self._download_webpage(request, track_id, u'Downloading metadata')
# NOTE(review): the try/except structure around this parse (original lines
# 2658/2660) is missing from this chunk.
2659 song_data = json.loads(song_data_json)
2661 raise ExtractorError(u'Hypemachine contained invalid JSON.')
2662 final_url = song_data[u"url"]
# NOTE(review): the returned info dict (original lines 2663-2671) is
# missing from this chunk.
2672 class Vbox7IE(InfoExtractor):
2673 """Information Extractor for Vbox7"""
2674 _VALID_URL = r'(?:http://)?(?:www\.)?vbox7\.com/play:([^/]+)'
2676 def _real_extract(self,url):
2677 mobj = re.match(self._VALID_URL, url)
# NOTE(review): the `if mobj is None:` guard (original line 2678) is
# missing from this chunk — the raise below is conditional in the full file.
2679 raise ExtractorError(u'Invalid URL: %s' % url)
2680 video_id = mobj.group(1)
# Follow the JS-based redirect the play page performs via window.location.
2682 redirect_page, urlh = self._download_webpage_handle(url, video_id)
2683 new_location = self._search_regex(r'window\.location = \'(.*)\';', redirect_page, u'redirect location')
2684 redirect_url = urlh.geturl() + new_location
2685 webpage = self._download_webpage(redirect_url, video_id, u'Downloading redirect page')
# Title is the first '/'-separated component of the <title> tag.
2687 title = self._html_search_regex(r'<title>(.*)</title>',
2688 webpage, u'title').split('/')[0].strip()
# POST to magare.do returns '&'-joined key=value pairs with the media and
# thumbnail URLs.
2691 info_url = "http://vbox7.com/play/magare.do"
2692 data = compat_urllib_parse.urlencode({'as3':'1','vid':video_id})
2693 info_request = compat_urllib_request.Request(info_url, data)
2694 info_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
2695 info_response = self._download_webpage(info_request, video_id, u'Downloading info webpage')
2696 if info_response is None:
2697 raise ExtractorError(u'Unable to extract the media url')
2698 (final_url, thumbnail_url) = map(lambda x: x.split('=')[1], info_response.split('&'))
# NOTE(review): the returned info dict's opening and url/ext/title entries
# (original lines 2699-2704) and the return are missing from this chunk.
2705 'thumbnail': thumbnail_url,
2709 def gen_extractors():
2710 """ Return a list of an instance of every supported extractor.
2711 The order does matter; the first extractor matched is the one handling the URL.
# NOTE(review): the returned list literal is almost entirely missing from
# this chunk (original lines 2712-2778 are elided except the three entries
# below); only these three instantiations are visible.
2714 YoutubePlaylistIE(),
2739 StanfordOpenClassroomIE(),
2749 WorldStarHipHopIE(),
def get_info_extractor(ie_name):
    """Return the info extractor class registered under *ie_name*.

    The class is looked up in this module's globals under the name
    ``<ie_name>IE`` (e.g. ``'Youtube'`` -> ``YoutubeIE``).  A ``KeyError``
    propagates if no such extractor class exists.
    """
    # globals() is this module's namespace, where all *IE classes are defined.
    return globals()[ie_name + 'IE']