2 # -*- coding: utf-8 -*-
4 from __future__ import absolute_import
15 import xml.etree.ElementTree
26 from .extractor.common import InfoExtractor, SearchInfoExtractor
27 from .extractor.dailymotion import DailymotionIE
28 from .extractor.metacafe import MetacafeIE
29 from .extractor.statigram import StatigramIE
30 from .extractor.photobucket import PhotobucketIE
31 from .extractor.vimeo import VimeoIE
32 from .extractor.yahoo import YahooIE
33 from .extractor.youtube import YoutubeIE, YoutubePlaylistIE, YoutubeUserIE, YoutubeChannelIE
43 class ArteTvIE(InfoExtractor):
# Extractor for videos.arte.tv (French/German). Handles both live streams
# and the "+7" catch-up pages by scraping intermediate JS/XML documents.
# NOTE(review): this chunk is a corrupted extraction — interior lines are
# missing (e.g. the `try:` statements that the `except` clauses below belong
# to), indentation is lost, and upstream line numbers are baked into every
# line. Restore the code from version control before editing behavior.
44 """arte.tv information extractor."""
46 _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
47 _LIVE_URL = r'index-[0-9]+\.html$'
# Fetch a page body; wraps network/URL errors in ExtractorError.
51 def fetch_webpage(self, url):
52 request = compat_urllib_request.Request(url)
54 self.report_download_webpage(url)
55 webpage = compat_urllib_request.urlopen(request).read()
# The opening `try:` for these handlers is missing from this extraction.
56 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
57 raise ExtractorError(u'Unable to retrieve video webpage: %s' % compat_str(err))
58 except ValueError as err:
59 raise ExtractorError(u'Invalid URL: %s' % url)
# Download `url`, apply `regex`, and collect the groups described by
# `matchTuples` (group-index, key, error-message) into an info dict.
62 def grep_webpage(self, url, regex, regexFlags, matchTuples):
63 page = self.fetch_webpage(url)
64 mobj = re.search(regex, page, regexFlags)
68 raise ExtractorError(u'Invalid URL: %s' % url)
70 for (i, key, err) in matchTuples:
71 if mobj.group(i) is None:
72 raise ExtractorError(err)
# `info` is presumably initialized as {} on a missing line above — TODO confirm.
74 info[key] = mobj.group(i)
# Live-stream path: locate the videothek JS, then the geo-specific SWF data.
78 def extractLiveStream(self, url):
79 video_lang = url.split('/')[-4]
80 info = self.grep_webpage(
82 r'src="(.*?/videothek_js.*?\.js)',
85 (1, 'url', u'Invalid URL: %s' % url)
88 http_host = url.split('/')[2]
89 next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
90 info = self.grep_webpage(
92 r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
93 '(http://.*?\.swf).*?' +
97 (1, 'path', u'could not extract video path: %s' % url),
98 (2, 'player', u'could not extract video player: %s' % url),
99 (3, 'url', u'could not extract video url: %s' % url)
102 video_url = u'%s/%s' % (info.get('url'), info.get('path'))
# "+7" catch-up path: videorefFileUrl -> language-specific <video> ref ->
# final XML carrying id/title/date and the HD url.
104 def extractPlus7Stream(self, url):
105 video_lang = url.split('/')[-3]
106 info = self.grep_webpage(
108 r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
111 (1, 'url', u'Invalid URL: %s' % url)
114 next_url = compat_urllib_parse.unquote(info.get('url'))
115 info = self.grep_webpage(
117 r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
120 (1, 'url', u'Could not find <video> tag: %s' % url)
123 next_url = compat_urllib_parse.unquote(info.get('url'))
125 info = self.grep_webpage(
127 r'<video id="(.*?)".*?>.*?' +
128 '<name>(.*?)</name>.*?' +
129 '<dateVideo>(.*?)</dateVideo>.*?' +
130 '<url quality="hd">(.*?)</url>',
133 (1, 'id', u'could not extract video id: %s' % url),
134 (2, 'title', u'could not extract video title: %s' % url),
135 (3, 'date', u'could not extract video date: %s' % url),
136 (4, 'url', u'could not extract video url: %s' % url)
141 'id': info.get('id'),
142 'url': compat_urllib_parse.unquote(info.get('url')),
143 'uploader': u'arte.tv',
144 'upload_date': unified_strdate(info.get('date')),
145 'title': info.get('title').decode('utf-8'),
# Dispatch: live URLs (ending in index-NNN.html) vs regular "+7" pages.
151 def _real_extract(self, url):
152 video_id = url.split('/')[-1]
153 self.report_extraction(video_id)
155 if re.search(self._LIVE_URL, video_id) is not None:
156 self.extractLiveStream(url)
159 info = self.extractPlus7Stream(url)
164 class GenericIE(InfoExtractor):
# Last-resort extractor: follows URL-shortener redirects using HEAD
# requests, then probes the page for common embed patterns (JW Player
# flashvars, JWPlayer JS loader, twitter:player:stream, OpenGraph og:video).
# NOTE(review): corrupted extraction — interior lines are missing, original
# indentation is lost, and upstream line numbers are baked into each line.
165 """Generic last-resort information extractor."""
170 def report_download_webpage(self, video_id):
171 """Report webpage download."""
# Warn only in real runs, not under the test harness.
172 if not self._downloader.params.get('test', False):
173 self._downloader.report_warning(u'Falling back on generic information extractor.')
174 super(GenericIE, self).report_download_webpage(video_id)
176 def report_following_redirect(self, new_url):
177 """Report information extraction."""
178 self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)
180 def _test_redirect(self, url):
181 """Check if it is a redirect, like url shorteners, in case return the new url."""
# Issue HEAD requests so redirects are detected without downloading bodies.
182 class HeadRequest(compat_urllib_request.Request):
183 def get_method(self):
186 class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
188 Subclass the HTTPRedirectHandler to make it use our
189 HeadRequest also on the redirected URL
191 def redirect_request(self, req, fp, code, msg, headers, newurl):
192 if code in (301, 302, 303, 307):
193 newurl = newurl.replace(' ', '%20')
194 newheaders = dict((k,v) for k,v in req.headers.items()
195 if k.lower() not in ("content-length", "content-type"))
196 return HeadRequest(newurl,
198 origin_req_host=req.get_origin_req_host(),
201 raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)
203 class HTTPMethodFallback(compat_urllib_request.BaseHandler):
205 Fallback to GET if HEAD is not allowed (405 HTTP error)
207 def http_error_405(self, req, fp, code, msg, headers):
211 newheaders = dict((k,v) for k,v in req.headers.items()
212 if k.lower() not in ("content-length", "content-type"))
213 return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
215 origin_req_host=req.get_origin_req_host(),
# Build a minimal opener with only the handlers needed for HEAD probing.
219 opener = compat_urllib_request.OpenerDirector()
220 for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
221 HTTPMethodFallback, HEADRedirectHandler,
222 compat_urllib_request.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
223 opener.add_handler(handler())
225 response = opener.open(HeadRequest(url))
227 raise ExtractorError(u'Invalid URL protocol')
228 new_url = response.geturl()
233 self.report_following_redirect(new_url)
236 def _real_extract(self, url):
237 new_url = self._test_redirect(url)
238 if new_url: return [self.url_result(new_url)]
240 video_id = url.split('/')[-1]
242 webpage = self._download_webpage(url, video_id)
# The opening `try:` for this handler is missing from this extraction.
243 except ValueError as err:
244 # since this is the last-resort InfoExtractor, if
245 # this error is thrown, it'll be thrown here
246 raise ExtractorError(u'Invalid URL: %s' % url)
248 self.report_extraction(video_id)
249 # Start with something easy: JW Player in SWFObject
250 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
252 # Broaden the search a little bit
253 mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
255 # Broaden the search a little bit: JWPlayer JS loader
256 mobj = re.search(r'[^A-Za-z0-9]?file:\s*["\'](http[^\'"&]*)', webpage)
258 # Try to find twitter cards info
259 mobj = re.search(r'<meta (?:property|name)="twitter:player:stream" (?:content|value)="(.+?)"', webpage)
261 # We look for Open Graph info:
262 # We have to match any number spaces between elements, some sites try to align them (eg.: statigr.am)
263 m_video_type = re.search(r'<meta.*?property="og:video:type".*?content="video/(.*?)"', webpage)
264 # We only look in og:video if the MIME type is a video, don't try if it's a Flash player:
265 if m_video_type is not None:
266 mobj = re.search(r'<meta.*?property="og:video".*?content="(.*?)"', webpage)
268 raise ExtractorError(u'Invalid URL: %s' % url)
270 # It's possible that one of the regexes
271 # matched, but returned an empty group:
272 if mobj.group(1) is None:
273 raise ExtractorError(u'Invalid URL: %s' % url)
275 video_url = compat_urllib_parse.unquote(mobj.group(1))
276 video_id = os.path.basename(video_url)
278 # here's a fun little line of code for you:
279 video_extension = os.path.splitext(video_id)[1][1:]
280 video_id = os.path.splitext(video_id)[0]
282 # it's tempting to parse this further, but you would
283 # have to take into account all the variations like
284 # Video Title - Site Name
285 # Site Name | Video Title
286 # Video Title - Tagline | Site Name
287 # and so on and so forth; it's just not practical
288 video_title = self._html_search_regex(r'<title>(.*)</title>',
289 webpage, u'video title')
291 # video uploader is domain name
292 video_uploader = self._search_regex(r'(?:https?://)?([^/]*)/.*',
293 url, u'video uploader')
298 'uploader': video_uploader,
300 'title': video_title,
301 'ext': video_extension,
305 class YoutubeSearchIE(SearchInfoExtractor):
# Resolves "ytsearchN:query" via the YouTube GData JSON-C API,
# 50 results per page, returning a playlist of watch URLs.
# NOTE(review): corrupted extraction — interior lines are missing,
# indentation is lost, upstream line numbers baked into each line.
306 """Information Extractor for YouTube search queries."""
307 _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
309 IE_NAME = u'youtube:search'
310 _SEARCH_KEY = 'ytsearch'
312 def report_download_page(self, query, pagenum):
313 """Report attempt to download search page with given number."""
314 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
316 def _get_n_results(self, query, n):
317 """Get a specified number of results for a query"""
# `limit`, `pagenum` and `video_ids` are presumably initialized on
# missing lines above — TODO confirm against upstream.
323 while (50 * pagenum) < limit:
324 self.report_download_page(query, pagenum+1)
# start-index is 1-based in the GData API.
325 result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
326 request = compat_urllib_request.Request(result_url)
328 data = compat_urllib_request.urlopen(request).read().decode('utf-8')
329 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
330 raise ExtractorError(u'Unable to download API page: %s' % compat_str(err))
331 api_response = json.loads(data)['data']
333 if not 'items' in api_response:
334 raise ExtractorError(u'[youtube] No video results')
336 new_ids = list(video['id'] for video in api_response['items'])
# Clamp the target to the total the API reports as available.
339 limit = min(n, api_response['totalItems'])
342 if len(video_ids) > n:
343 video_ids = video_ids[:n]
344 videos = [self.url_result('http://www.youtube.com/watch?v=%s' % id, 'Youtube') for id in video_ids]
345 return self.playlist_result(videos, query)
348 class GoogleSearchIE(SearchInfoExtractor):
# Resolves "gvsearchN:query" by scraping Google Video search result pages.
# NOTE(review): corrupted extraction — interior lines are missing
# (e.g. the construction of the `res` playlist dict and of `e`).
349 """Information Extractor for Google Video search queries."""
# Presence of this marker in the HTML signals a "next page" link exists.
350 _MORE_PAGES_INDICATOR = r'id="pnnext" class="pn"'
352 IE_NAME = u'video.google:search'
353 _SEARCH_KEY = 'gvsearch'
355 def _get_n_results(self, query, n):
356 """Get a specified number of results for a query"""
# `res` is presumably the playlist dict built on missing lines above.
364 for pagenum in itertools.count(1):
365 result_url = u'http://www.google.com/search?tbm=vid&q=%s&start=%s&hl=en' % (compat_urllib_parse.quote_plus(query), pagenum*10)
366 webpage = self._download_webpage(result_url, u'gvsearch:' + query,
367 note='Downloading result page ' + str(pagenum))
369 for mobj in re.finditer(r'<h3 class="r"><a href="([^"]+)"', webpage):
374 res['entries'].append(e)
# Stop once enough results were gathered or no further pages exist.
376 if (pagenum * 10 > n) or not re.search(self._MORE_PAGES_INDICATOR, webpage):
379 class YahooSearchIE(SearchInfoExtractor):
# Resolves "yvsearchN:query" via Yahoo's screen-search JSON endpoint,
# 30 results per page.
# NOTE(review): corrupted extraction — interior lines are missing; `m`
# referenced at the loop exit below is presumably assigned from `info`
# (pagination metadata) on a missing line — TODO confirm upstream.
380 """Information Extractor for Yahoo! Video search queries."""
383 IE_NAME = u'screen.yahoo:search'
384 _SEARCH_KEY = 'yvsearch'
386 def _get_n_results(self, query, n):
387 """Get a specified number of results for a query"""
394 for pagenum in itertools.count(0):
395 result_url = u'http://video.search.yahoo.com/search/?p=%s&fr=screen&o=js&gs=0&b=%d' % (compat_urllib_parse.quote_plus(query), pagenum * 30)
396 webpage = self._download_webpage(result_url, query,
397 note='Downloading results page '+str(pagenum+1))
# The endpoint returns JSON, not HTML, despite the variable name.
398 info = json.loads(webpage)
400 results = info[u'results']
402 for (i, r) in enumerate(results):
# Stop collecting once n results have been reached.
403 if (pagenum * 30) +i >= n:
405 mobj = re.search(r'(?P<url>screen\.yahoo\.com/.*?-\d*?\.html)"', r)
406 e = self.url_result('http://' + mobj.group('url'), 'Yahoo')
407 res['entries'].append(e)
# Exit when n results were gathered or the API reports the last page.
408 if (pagenum * 30 +i >= n) or (m[u'last'] >= (m[u'total'] -1 )):
414 class BlipTVUserIE(InfoExtractor):
# Lists all videos of a blip.tv user by paging through the mobile AJAX
# episode-list API until a short (non-full) page is returned.
# NOTE(review): corrupted extraction — interior lines are missing
# (e.g. the `pagenum` loop header and `ids_in_page`/`video_ids` init).
415 """Information Extractor for blip.tv users."""
417 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
419 IE_NAME = u'blip.tv:user'
421 def _real_extract(self, url):
423 mobj = re.match(self._VALID_URL, url)
425 raise ExtractorError(u'Invalid URL: %s' % url)
427 username = mobj.group(1)
429 page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'
# The numeric users_id is scraped from the user page's data attribute.
431 page = self._download_webpage(url, username, u'Downloading user page')
432 mobj = re.search(r'data-users-id="([^"]+)"', page)
433 page_base = page_base % mobj.group(1)
436 # Download video ids using BlipTV Ajax calls. Result size per
437 # query is limited (currently to 12 videos) so we need to query
438 # page by page until there are no video ids - it means we got
445 url = page_base + "&page=" + str(pagenum)
446 page = self._download_webpage(url, username,
447 u'Downloading video ids from page %d' % pagenum)
449 # Extract video identifiers
452 for mobj in re.finditer(r'href="/([^"]+)"', page):
453 if mobj.group(1) not in ids_in_page:
454 ids_in_page.append(unescapeHTML(mobj.group(1)))
456 video_ids.extend(ids_in_page)
458 # A little optimization - if current page is not
459 # "full", ie. does not contain PAGE_SIZE video ids then
460 # we can assume that this page is the last one - there
461 # are no more ids on further pages - no need to query
464 if len(ids_in_page) < self._PAGE_SIZE:
469 urls = [u'http://blip.tv/%s' % video_id for video_id in video_ids]
470 url_entries = [self.url_result(url, 'BlipTV') for url in urls]
471 return [self.playlist_result(url_entries, playlist_title = username)]
474 class DepositFilesIE(InfoExtractor):
# Extracts the direct download link from depositfiles.com by POSTing the
# "Free download" form and scraping the resulting fileshare URL.
# NOTE(review): corrupted extraction — interior lines are missing
# (e.g. the `try:` for the except clause below, the final return dict).
475 """Information extractor for depositfiles.com"""
477 _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
479 def _real_extract(self, url):
480 file_id = url.split('/')[-1]
481 # Rebuild url in english locale
482 url = 'http://depositfiles.com/en/files/' + file_id
484 # Retrieve file webpage with 'Free download' button pressed
485 free_download_indication = { 'gateway_result' : '1' }
486 request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
488 self.report_download_webpage(file_id)
489 webpage = compat_urllib_request.urlopen(request).read()
490 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
491 raise ExtractorError(u'Unable to retrieve file webpage: %s' % compat_str(err))
493 # Search for the real file URL
494 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
495 if (mobj is None) or (mobj.group(1) is None):
496 # Try to figure out reason of the error.
497 mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
498 if (mobj is not None) and (mobj.group(1) is not None):
# Surface the site's own restriction message (rate limit etc.).
499 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
500 raise ExtractorError(u'%s' % restriction_message)
502 raise ExtractorError(u'Unable to extract download URL from: %s' % url)
504 file_url = mobj.group(1)
505 file_extension = os.path.splitext(file_url)[1][1:]
507 # Search for file title
508 file_title = self._search_regex(r'<b title="(.*?)">', webpage, u'title')
511 'id': file_id.decode('utf-8'),
512 'url': file_url.decode('utf-8'),
516 'ext': file_extension.decode('utf-8'),
520 class FacebookIE(InfoExtractor):
# Facebook video extractor with optional login via --username/--password
# or .netrc; extraction parses the swfobject player parameters.
# NOTE(review): corrupted extraction — interior lines are missing
# (e.g. `try:` statements, `login_form` construction, return dict braces).
521 """Information Extractor for Facebook"""
523 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
524 _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
525 _NETRC_MACHINE = 'facebook'
526 IE_NAME = u'facebook'
528 def report_login(self):
529 """Report attempt to log in."""
530 self.to_screen(u'Logging in')
532 def _real_initialize(self):
533 if self._downloader is None:
538 downloader_params = self._downloader.params
540 # Attempt to use provided username and password or .netrc data
541 if downloader_params.get('username', None) is not None:
542 useremail = downloader_params['username']
543 password = downloader_params['password']
544 elif downloader_params.get('usenetrc', False):
546 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
551 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
552 except (IOError, netrc.NetrcParseError) as err:
553 self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
# No credentials available: skip login entirely.
556 if useremail is None:
# Login is best-effort: failures below only emit warnings, extraction
# still proceeds without an authenticated session.
565 request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
568 login_results = compat_urllib_request.urlopen(request).read()
# If the login form is still present in the response, login failed.
569 if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
570 self._downloader.report_warning(u'unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
572 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
573 self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
576 def _real_extract(self, url):
577 mobj = re.match(self._VALID_URL, url)
579 raise ExtractorError(u'Invalid URL: %s' % url)
580 video_id = mobj.group('ID')
582 url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
583 webpage = self._download_webpage(url, video_id)
# The player parameters are embedded as URL-encoded JSON between two
# fixed swfobject script fragments.
585 BEFORE = '{swf.addParam(param[0], param[1]);});\n'
586 AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
587 m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
589 raise ExtractorError(u'Cannot parse data')
590 data = dict(json.loads(m.group(1)))
591 params_raw = compat_urllib_parse.unquote(data['params'])
592 params = json.loads(params_raw)
593 video_data = params['video_data'][0]
# Prefer the HD source, fall back to SD, otherwise fail.
594 video_url = video_data.get('hd_src')
596 video_url = video_data['sd_src']
598 raise ExtractorError(u'Cannot find video URL')
599 video_duration = int(video_data['video_duration'])
600 thumbnail = video_data['thumbnail_src']
602 video_title = self._html_search_regex('<h2 class="uiHeaderTitle">([^<]+)</h2>',
607 'title': video_title,
610 'duration': video_duration,
611 'thumbnail': thumbnail,
616 class BlipTVIE(InfoExtractor):
# blip.tv extractor: canonicalizes api.swf# and /play/ URLs, then reads
# metadata from the site's JSON API (or detects a direct video download
# via the Content-Type header).
# NOTE(review): corrupted extraction — interior lines are missing
# (e.g. `try:` statements, the `cchar` selection, the direct-download
# info dict and the final return).
617 """Information extractor for blip.tv"""
619 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv/((.+/)|(play/)|(api\.swf#))(.+)$'
# Regex used to pull the filename extension off the media URL.
620 _URL_EXT = r'^.*\.([a-z0-9]+)$'
623 def report_direct_download(self, title):
624 """Report information extraction."""
625 self.to_screen(u'%s: Direct download detected' % title)
627 def _real_extract(self, url):
628 mobj = re.match(self._VALID_URL, url)
630 raise ExtractorError(u'Invalid URL: %s' % url)
632 # See https://github.com/rg3/youtube-dl/issues/857
633 api_mobj = re.match(r'http://a\.blip\.tv/api\.swf#(?P<video_id>[\d\w]+)', url)
634 if api_mobj is not None:
635 url = 'http://blip.tv/play/g_%s' % api_mobj.group('video_id')
636 urlp = compat_urllib_parse_urlparse(url)
637 if urlp.path.startswith('/play/'):
# /play/ URLs redirect to a page whose fragment carries the file URL;
# recurse on the canonical 'a-<id>' page instead.
638 request = compat_urllib_request.Request(url)
639 response = compat_urllib_request.urlopen(request)
640 redirecturl = response.geturl()
641 rurlp = compat_urllib_parse_urlparse(redirecturl)
642 file_id = compat_parse_qs(rurlp.fragment)['file'][0].rpartition('/')[2]
643 url = 'http://blip.tv/a/a-' + file_id
644 return self._real_extract(url)
# `cchar` ('?' or '&') is presumably chosen on a missing line above.
651 json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
652 request = compat_urllib_request.Request(json_url)
# The API serves different (usable) data to the iTunes user agent.
653 request.add_header('User-Agent', 'iTunes/10.6.1')
654 self.report_extraction(mobj.group(1))
657 urlh = compat_urllib_request.urlopen(request)
658 if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
659 basename = url.split('/')[-1]
660 title,ext = os.path.splitext(basename)
661 title = title.decode('UTF-8')
662 ext = ext.replace('.', '')
663 self.report_direct_download(title)
673 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
674 raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
675 if info is None: # Regular URL
677 json_code_bytes = urlh.read()
678 json_code = json_code_bytes.decode('utf-8')
679 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
680 raise ExtractorError(u'Unable to read video info webpage: %s' % compat_str(err))
683 json_data = json.loads(json_code)
# Responses sometimes wrap the payload in a 'Post' key.
684 if 'Post' in json_data:
685 data = json_data['Post']
689 upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
690 video_url = data['media']['url']
691 umobj = re.match(self._URL_EXT, video_url)
693 raise ValueError('Can not determine filename extension')
697 'id': data['item_id'],
699 'uploader': data['display_name'],
700 'upload_date': upload_date,
701 'title': data['title'],
703 'format': data['media']['mimeType'],
704 'thumbnail': data['thumbnailUrl'],
705 'description': data['description'],
706 'player_url': data['embedUrl'],
707 'user_agent': 'iTunes/10.6.1',
709 except (ValueError,KeyError) as err:
710 raise ExtractorError(u'Unable to parse video information: %s' % repr(err))
715 class MyVideoIE(InfoExtractor):
# myvideo.de extractor. Newer pages expose a direct <source>; otherwise the
# player config is RC4-encrypted hex that is decrypted with a key derived
# (via md5) from the base64 GK constant and the video id.
# NOTE(review): corrupted extraction — interior lines are missing (e.g. the
# md5 helper's `def` line above line 741, the PRGA loop header in
# __rc4crypt, `params` init, the GK assignment wrapper, return dicts).
716 """Information Extractor for myvideo.de."""
718 _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
721 # Original Code from: https://github.com/dersphere/plugin.video.myvideo_de.git
722 # Released into the Public Domain by Tristan Fischer on 2013-05-19
723 # https://github.com/rg3/youtube-dl/pull/842
# Standard RC4: key-scheduling pass over `box`, then PRGA xor over `data`.
724 def __rc4crypt(self,data, key):
726 box = list(range(256))
727 for i in list(range(256)):
728 x = (x + box[i] + compat_ord(key[i % len(key)])) % 256
729 box[i], box[x] = box[x], box[i]
735 y = (y + box[x]) % 256
736 box[x], box[y] = box[y], box[x]
737 out += chr(compat_ord(char) ^ box[(box[x] + box[y]) % 256])
# Tail of a missing md5-hexdigest helper (its `def` line is not in this view).
741 return hashlib.md5(s).hexdigest().encode()
743 def _real_extract(self,url):
744 mobj = re.match(self._VALID_URL, url)
746 raise ExtractorError(u'invalid URL: %s' % url)
748 video_id = mobj.group(1)
# Doubly base64-encoded GK constant used to derive the RC4 key below.
751 b'WXpnME1EZGhNRGhpTTJNM01XVmhOREU0WldNNVpHTTJOakpt'
752 b'TW1FMU5tVTBNR05pWkRaa05XRXhNVFJoWVRVd1ptSXhaVEV3'
753 b'TnpsbA0KTVRkbU1tSTRNdz09'
757 webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
758 webpage = self._download_webpage(webpage_url, video_id)
# Fast path: page already exposes a direct <source src='...'>.
760 mobj = re.search('source src=\'(.+?)[.]([^.]+)\'', webpage)
762 self.report_extraction(video_id)
763 video_url = mobj.group(1) + '.flv'
765 video_title = self._html_search_regex('<title>([^<]+)</title>',
768 video_ext = self._search_regex('[.](.+?)$', video_url, u'extension')
775 'title': video_title,
# Slow path: parse flashvars, fetch and RC4-decrypt the player XML data.
780 mobj = re.search('var flashvars={(.+?)}', webpage)
782 raise ExtractorError(u'Unable to extract video')
787 for (a, b) in re.findall('(.+?):\'(.+?)\',?', sec):
788 if not a == '_encxml':
791 encxml = compat_urllib_parse.unquote(b)
792 if not params.get('domain'):
793 params['domain'] = 'www.myvideo.de'
794 xmldata_url = '%s?%s' % (encxml, compat_urllib_parse.urlencode(params))
# The MTV player variant is unusable; rewrite to the D player endpoint.
795 if 'flash_playertype=MTV' in xmldata_url:
796 self._downloader.report_warning(u'avoiding MTV player')
798 'http://www.myvideo.de/dynamic/get_player_video_xml.php'
799 '?flash_playertype=D&ID=%s&_countlimit=4&autorun=yes'
803 enc_data = self._download_webpage(xmldata_url, video_id).split('=')[1]
804 enc_data_b = binascii.unhexlify(enc_data)
806 base64.b64decode(base64.b64decode(GK)) +
808 str(video_id).encode('utf-8')
811 dec_data = self.__rc4crypt(enc_data_b, sk)
814 self.report_extraction(video_id)
# RTMP case: take the connection URL; force RTMPT for myvideo2flash hosts.
817 mobj = re.search('connectionurl=\'(.*?)\'', dec_data)
819 video_url = compat_urllib_parse.unquote(mobj.group(1))
820 if 'myvideo2flash' in video_url:
821 self._downloader.report_warning(u'forcing RTMPT ...')
822 video_url = video_url.replace('rtmpe://', 'rtmpt://')
825 # extract non rtmp videos
826 mobj = re.search('path=\'(http.*?)\' source=\'(.*?)\'', dec_data)
828 raise ExtractorError(u'unable to extract url')
829 video_url = compat_urllib_parse.unquote(mobj.group(1)) + compat_urllib_parse.unquote(mobj.group(2))
831 video_file = self._search_regex('source=\'(.*?)\'', dec_data, u'video file')
832 video_file = compat_urllib_parse.unquote(video_file)
# Non-f4m sources become RTMP play paths; f4m manifests map to HLS m3u8.
834 if not video_file.endswith('f4m'):
835 ppath, prefix = video_file.split('.')
836 video_playpath = '%s:%s' % (prefix, ppath)
837 video_hls_playlist = ''
840 video_hls_playlist = (
841 video_filepath + video_file
842 ).replace('.f4m', '.m3u8')
844 video_swfobj = self._search_regex('swfobject.embedSWF\(\'(.+?)\'', webpage, u'swfobj')
845 video_swfobj = compat_urllib_parse.unquote(video_swfobj)
847 video_title = self._html_search_regex("<h1(?: class='globalHd')?>(.*?)</h1>",
856 'title': video_title,
858 'play_path': video_playpath,
859 'video_file': video_file,
860 'video_hls_playlist': video_hls_playlist,
861 'player_url': video_swfobj,
865 class ComedyCentralIE(InfoExtractor):
# Daily Show / Colbert Report extractor: resolves show/clip URLs to
# mtvnservices media URIs, then downloads the MRSS index and per-item
# config XML, picks an RTMP rendition, and rewrites it to a progressive
# HTTP URL on the llnwd.net mirror.
# NOTE(review): corrupted extraction — interior lines are missing
# (e.g. `try:` statements, `turls` accumulation, the per-part info dict).
866 """Information extractor for The Daily Show and Colbert Report """
868 # urls can be abbreviations like :thedailyshow or :colbert
869 # urls for episodes like:
870 # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
871 # or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
872 # or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
873 _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
874 |(https?://)?(www\.)?
875 (?P<showname>thedailyshow|colbertnation)\.com/
876 (full-episodes/(?P<episode>.*)|
878 (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
879 |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
882 _available_formats = ['3500', '2200', '1700', '1200', '750', '400']
884 _video_extensions = {
892 _video_dimensions = {
# _VALID_URL is a verbose regex, so suitable() must pass re.VERBOSE itself.
902 def suitable(cls, url):
903 """Receives a URL and returns True if suitable for this IE."""
904 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
906 def _print_formats(self, formats):
907 print('Available formats:')
909 print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))
912 def _real_extract(self, url):
913 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
915 raise ExtractorError(u'Invalid URL: %s' % url)
# Shortname forms (":tds", ":colbert", ...) map to the newest full episode.
917 if mobj.group('shortname'):
918 if mobj.group('shortname') in ('tds', 'thedailyshow'):
919 url = u'http://www.thedailyshow.com/full-episodes/'
921 url = u'http://www.colbertnation.com/full-episodes/'
922 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
923 assert mobj is not None
925 if mobj.group('clip'):
926 if mobj.group('showname') == 'thedailyshow':
927 epTitle = mobj.group('tdstitle')
929 epTitle = mobj.group('cntitle')
932 dlNewest = not mobj.group('episode')
934 epTitle = mobj.group('showname')
936 epTitle = mobj.group('episode')
938 self.report_extraction(epTitle)
939 webpage,htmlHandle = self._download_webpage_handle(url, epTitle)
# "Newest" pages redirect to a concrete episode; re-match the final URL.
941 url = htmlHandle.geturl()
942 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
944 raise ExtractorError(u'Invalid redirected URL: ' + url)
945 if mobj.group('episode') == '':
946 raise ExtractorError(u'Redirected URL is still not specific: ' + url)
947 epTitle = mobj.group('episode')
949 mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)
951 if len(mMovieParams) == 0:
952 # The Colbert Report embeds the information in a without
953 # a URL prefix; so extract the alternate reference
954 # and then add the URL prefix manually.
956 altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
957 if len(altMovieParams) == 0:
958 raise ExtractorError(u'unable to find Flash URL in webpage ' + url)
960 mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]
962 uri = mMovieParams[0][1]
963 indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
964 indexXml = self._download_webpage(indexUrl, epTitle,
965 u'Downloading show index',
966 u'unable to download episode index')
# Episodes are split into multiple parts, one MRSS <item> each.
970 idoc = xml.etree.ElementTree.fromstring(indexXml)
971 itemEls = idoc.findall('.//item')
972 for partNum,itemEl in enumerate(itemEls):
973 mediaId = itemEl.findall('./guid')[0].text
974 shortMediaId = mediaId.split(':')[-1]
975 showId = mediaId.split(':')[-2].replace('.com', '')
976 officialTitle = itemEl.findall('./title')[0].text
977 officialDate = unified_strdate(itemEl.findall('./pubDate')[0].text)
979 configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
980 compat_urllib_parse.urlencode({'uri': mediaId}))
981 configXml = self._download_webpage(configUrl, epTitle,
982 u'Downloading configuration for %s' % shortMediaId)
984 cdoc = xml.etree.ElementTree.fromstring(configXml)
986 for rendition in cdoc.findall('.//rendition'):
987 finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
991 self._downloader.report_error(u'unable to download ' + mediaId + ': No videos found')
994 if self._downloader.params.get('listformats', None):
995 self._print_formats([i[0] for i in turls])
998 # For now, just pick the highest bitrate
999 format,rtmp_video_url = turls[-1]
1001 # Get the format arg from the arg stream
1002 req_format = self._downloader.params.get('format', None)
1004 # Select format if we can find one
1007 format, rtmp_video_url = f, v
# Rewrite the RTMP path onto the known llnwd.net progressive-HTTP mirror.
1010 m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
1012 raise ExtractorError(u'Cannot transform RTMP url')
1013 base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
1014 video_url = base + m.group('finalid')
1016 effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
1021 'upload_date': officialDate,
1026 'description': officialTitle,
1028 results.append(info)
1033 class EscapistIE(InfoExtractor):
# escapistmagazine.com extractor: reads meta/OpenGraph tags from the page,
# then fetches the player's JS "config" (massaged into JSON) to find the
# media URL in its playlist.
# NOTE(review): corrupted extraction — interior lines are missing
# (e.g. the `try:` for json.loads and the final return dict braces).
1034 """Information extractor for The Escapist """
1036 _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
1037 IE_NAME = u'escapist'
1039 def _real_extract(self, url):
1040 mobj = re.match(self._VALID_URL, url)
1042 raise ExtractorError(u'Invalid URL: %s' % url)
1043 showName = mobj.group('showname')
1044 videoId = mobj.group('episode')
1046 self.report_extraction(videoId)
1047 webpage = self._download_webpage(url, videoId)
1049 videoDesc = self._html_search_regex('<meta name="description" content="([^"]*)"',
1050 webpage, u'description', fatal=False)
1052 imgUrl = self._html_search_regex('<meta property="og:image" content="([^"]*)"',
1053 webpage, u'thumbnail', fatal=False)
1055 playerUrl = self._html_search_regex('<meta property="og:video" content="([^"]*)"',
1056 webpage, u'player url')
# NOTE(review): the field name passed here is u'player url' although the
# value extracted is the page title — looks like a copy/paste slip upstream.
1058 title = self._html_search_regex('<meta name="title" content="([^"]*)"',
1059 webpage, u'player url').split(' : ')[-1]
1061 configUrl = self._search_regex('config=(.*)$', playerUrl, u'config url')
1062 configUrl = compat_urllib_parse.unquote(configUrl)
1064 configJSON = self._download_webpage(configUrl, videoId,
1065 u'Downloading configuration',
1066 u'unable to download configuration')
1068 # Technically, it's JavaScript, not JSON
1069 configJSON = configJSON.replace("'", '"')
1072 config = json.loads(configJSON)
1073 except (ValueError,) as err:
1074 raise ExtractorError(u'Invalid JSON in configuration file: ' + compat_str(err))
1076 playlist = config['playlist']
# The media entry sits at index 1 of the playlist.
1077 videoUrl = playlist[1]['url']
1082 'uploader': showName,
1083 'upload_date': None,
1086 'thumbnail': imgUrl,
1087 'description': videoDesc,
1088 'player_url': playerUrl,
1093 class CollegeHumorIE(InfoExtractor):
# collegehumor.com extractor: moogaloop XML metadata first, then the Adobe
# HDS (f4m) manifest, from which the final Seg1-Frag1 fragment URL is built.
# NOTE(review): corrupted extraction — interior lines are missing
# (e.g. `info` dict initialization, `try:` statements, the final return).
1094 """Information extractor for collegehumor.com"""
1097 _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
1098 IE_NAME = u'collegehumor'
1100 def report_manifest(self, video_id):
1101 """Report information extraction."""
1102 self.to_screen(u'%s: Downloading XML manifest' % video_id)
1104 def _real_extract(self, url):
1105 mobj = re.match(self._VALID_URL, url)
1107 raise ExtractorError(u'Invalid URL: %s' % url)
1108 video_id = mobj.group('videoid')
# `info` is presumably initialized as a dict on missing lines above.
1113 'upload_date': None,
1116 self.report_extraction(video_id)
1117 xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
1119 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
1120 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1121 raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
1123 mdoc = xml.etree.ElementTree.fromstring(metaXml)
1125 videoNode = mdoc.findall('./video')[0]
1126 info['description'] = videoNode.findall('./description')[0].text
1127 info['title'] = videoNode.findall('./caption')[0].text
1128 info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
1129 manifest_url = videoNode.findall('./file')[0].text
1131 raise ExtractorError(u'Invalid metadata XML file')
# hdcore parameter is required for the HDS manifest to be served.
1133 manifest_url += '?hdcore=2.10.3'
1134 self.report_manifest(video_id)
1136 manifestXml = compat_urllib_request.urlopen(manifest_url).read()
1137 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1138 raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
# Pull the media node (namespaced f4m elements) out of the manifest.
1140 adoc = xml.etree.ElementTree.fromstring(manifestXml)
1142 media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
1143 node_id = media_node.attrib['url']
1144 video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
1145 except IndexError as err:
1146 raise ExtractorError(u'Invalid manifest file')
1148 url_pr = compat_urllib_parse_urlparse(manifest_url)
1149 url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'
1156 class XVideosIE(InfoExtractor):
1157 """Information extractor for xvideos.com"""
# NOTE(review): sampled extract -- original line numbers show gaps; the
# `if mobj is None:` guard, title-regex continuation line and the `return`
# of the info dict are among the dropped lines.
1159 _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
1160 IE_NAME = u'xvideos'
1162 def _real_extract(self, url):
1163 mobj = re.match(self._VALID_URL, url)
1165 raise ExtractorError(u'Invalid URL: %s' % url)
1166 video_id = mobj.group(1)
1168 webpage = self._download_webpage(url, video_id)
1170 self.report_extraction(video_id)
# The flash URL is percent-encoded inside a `flv_url=...&` query fragment.
1173 video_url = compat_urllib_parse.unquote(self._search_regex(r'flv_url=(.+?)&',
1174 webpage, u'video URL'))
# Title is scraped from <title> up to the site-name suffix (continuation line dropped).
1177 video_title = self._html_search_regex(r'<title>(.*?)\s+-\s+XVID',
1180 # Extract video thumbnail
1181 video_thumbnail = self._search_regex(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)',
1182 webpage, u'thumbnail', fatal=False)
1188 'upload_date': None,
1189 'title': video_title,
1191 'thumbnail': video_thumbnail,
1192 'description': None,
1198 class SoundcloudIE(InfoExtractor):
1199 """Information extractor for soundcloud.com
1200 To access the media, the uid of the song and a stream token
1201 must be extracted from the page source and the script must make
1202 a request to media.soundcloud.com/crossdomain.xml. Then
1203 the media can be grabbed by requesting from an url composed
1204 of the stream token and uid
# NOTE(review): sampled extract -- the docstring terminator and several other
# lines (guard clauses, the info dict opener, `return`) were dropped by the
# extraction; the embedded numbering shows the gaps.
1207 _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
1208 IE_NAME = u'soundcloud'
1210 def report_resolve(self, video_id):
1211 """Report the permalink-to-id resolution step."""
1212 self.to_screen(u'%s: Resolving id' % video_id)
1214 def _real_extract(self, url):
1215 mobj = re.match(self._VALID_URL, url)
1217 raise ExtractorError(u'Invalid URL: %s' % url)
1219 # extract uploader (which is in the url)
1220 uploader = mobj.group(1)
1221 # extract simple title (uploader + slug of song title)
1222 slug_title = mobj.group(2)
1223 simple_title = uploader + u'-' + slug_title
1224 full_title = '%s/%s' % (uploader, slug_title)
1226 self.report_resolve(full_title)
# Resolve the human-readable permalink into a numeric track id via the API.
# NOTE(review): client_id is hard-coded here and repeated below -- candidate
# for a single module-level constant.
1228 url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
1229 resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
1230 info_json = self._download_webpage(resolv_url, full_title, u'Downloading info JSON')
1232 info = json.loads(info_json)
1233 video_id = info['id']
1234 self.report_extraction(full_title)
# Fetch the per-track stream map and pick the 128 kbps MP3 HTTP stream.
1236 streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
1237 stream_json = self._download_webpage(streams_url, full_title,
1238 u'Downloading stream definitions',
1239 u'unable to download stream definitions')
1241 streams = json.loads(stream_json)
1242 mediaURL = streams['http_mp3_128_url']
1243 upload_date = unified_strdate(info['created_at'])
1248 'uploader': info['user']['username'],
1249 'upload_date': upload_date,
1250 'title': info['title'],
1252 'description': info['description'],
1255 class SoundcloudSetIE(InfoExtractor):
1256 """Information extractor for soundcloud.com sets
1257 To access the media, the uid of the song and a stream token
1258 must be extracted from the page source and the script must make
1259 a request to media.soundcloud.com/crossdomain.xml. Then
1260 the media can be grabbed by requesting from an url composed
1261 of the stream token and uid
# NOTE(review): sampled extract -- docstring terminator, guard lines, the
# per-track info dict opener and the final return/collection lines are not
# visible (original numbering shows gaps).
1264 _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/sets/([\w\d-]+)'
1265 IE_NAME = u'soundcloud:set'
1267 def report_resolve(self, video_id):
1268 """Report the permalink-to-id resolution step."""
1269 self.to_screen(u'%s: Resolving id' % video_id)
1271 def _real_extract(self, url):
1272 mobj = re.match(self._VALID_URL, url)
1274 raise ExtractorError(u'Invalid URL: %s' % url)
1276 # extract uploader (which is in the url)
1277 uploader = mobj.group(1)
1278 # extract simple title (uploader + slug of song title)
1279 slug_title = mobj.group(2)
1280 simple_title = uploader + u'-' + slug_title
1281 full_title = '%s/sets/%s' % (uploader, slug_title)
1283 self.report_resolve(full_title)
# Resolve the set permalink into the set's JSON description via the API.
1285 url = 'http://soundcloud.com/%s/sets/%s' % (uploader, slug_title)
1286 resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
1287 info_json = self._download_webpage(resolv_url, full_title)
1290 info = json.loads(info_json)
# API-level errors are reported per entry; the early-return after this loop
# is among the dropped lines.
1291 if 'errors' in info:
1292 for err in info['errors']:
1293 self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err['error_message']))
1296 self.report_extraction(full_title)
# One stream lookup per track in the set; same hard-coded client_id as SoundcloudIE.
1297 for track in info['tracks']:
1298 video_id = track['id']
1300 streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
1301 stream_json = self._download_webpage(streams_url, video_id, u'Downloading track info JSON')
1303 self.report_extraction(video_id)
1304 streams = json.loads(stream_json)
1305 mediaURL = streams['http_mp3_128_url']
1310 'uploader': track['user']['username'],
1311 'upload_date': unified_strdate(track['created_at']),
1312 'title': track['title'],
1314 'description': track['description'],
1319 class InfoQIE(InfoExtractor):
1320 """Information extractor for infoq.com"""
# NOTE(review): sampled extract -- guard lines, regex continuation lines and
# the info dict opener / `return` are missing (numbering shows gaps).
1321 _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
1323 def _real_extract(self, url):
1324 mobj = re.match(self._VALID_URL, url)
1326 raise ExtractorError(u'Invalid URL: %s' % url)
# The page URL doubles as the video id for progress reporting.
1328 webpage = self._download_webpage(url, video_id=url)
1329 self.report_extraction(url)
# The real media id is base64-encoded in a JS variable on the page.
1332 mobj = re.search(r"jsclassref ?= ?'([^']*)'", webpage)
1334 raise ExtractorError(u'Unable to extract video url')
1335 real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
1336 video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id
1339 video_title = self._search_regex(r'contentTitle = "(.*?)";',
1342 # Extract description
1343 video_description = self._html_search_regex(r'<meta name="description" content="(.*)"(?:\s*/)?>',
1344 webpage, u'description', fatal=False)
# Derive id and extension from the last path component of the RTMP URL.
1346 video_filename = video_url.split('/')[-1]
1347 video_id, extension = video_filename.split('.')
1353 'upload_date': None,
1354 'title': video_title,
1355 'ext': extension, # Extension is always(?) mp4, but seems to be flv
1357 'description': video_description,
1362 class MixcloudIE(InfoExtractor):
1363 """Information extractor for www.mixcloud.com"""
# NOTE(review): sampled extract -- `try:` headers, `return` statements and
# loop/branch lines are missing (the fused numbering shows the gaps).
1365 _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
1366 _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
1367 IE_NAME = u'mixcloud'
1369 def report_download_json(self, file_id):
1370 """Report JSON download."""
1371 self.to_screen(u'Downloading json')
1373 def get_urls(self, jsonData, fmt, bitrate='best'):
1374 """Get urls from 'audio_formats' section in json"""
# 'best' (or an unknown bitrate) falls back to the highest available one;
# a TypeError while indexing means the format carries no bitrate sub-map.
1377 bitrate_list = jsonData[fmt]
1378 if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
1379 bitrate = max(bitrate_list) # select highest
1381 url_list = jsonData[fmt][bitrate]
1382 except TypeError: # we have no bitrate info.
1383 url_list = jsonData[fmt]
1386 def check_urls(self, url_list):
1387 """Returns 1st active url from list"""
# Probes each candidate URL with a real request; network errors skip to the next.
1388 for url in url_list:
1390 compat_urllib_request.urlopen(url)
1392 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1397 def _print_formats(self, formats):
# Human-readable listing for --list-formats; tolerates missing bitrate info.
1398 print('Available formats:')
1399 for fmt in formats.keys():
1400 for b in formats[fmt]:
1402 ext = formats[fmt][b][0]
1403 print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
1404 except TypeError: # we have no bitrate info
1405 ext = formats[fmt][0]
1406 print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
1409 def _real_extract(self, url):
1410 mobj = re.match(self._VALID_URL, url)
1412 raise ExtractorError(u'Invalid URL: %s' % url)
1413 # extract uploader & filename from url
# NOTE(review): calling .decode('utf-8') on re match groups (str objects)
# raises AttributeError under Python 3 -- confirm target interpreter.
1414 uploader = mobj.group(1).decode('utf-8')
1415 file_id = uploader + "-" + mobj.group(2).decode('utf-8')
1417 # construct API request
1418 file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
1419 # retrieve .json file with links to files
1420 request = compat_urllib_request.Request(file_url)
1422 self.report_download_json(file_url)
1423 jsonData = compat_urllib_request.urlopen(request).read()
1424 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1425 raise ExtractorError(u'Unable to retrieve file: %s' % compat_str(err))
1428 json_data = json.loads(jsonData)
1429 player_url = json_data['player_swf_url']
1430 formats = dict(json_data['audio_formats'])
1432 req_format = self._downloader.params.get('format', None)
1435 if self._downloader.params.get('listformats', None):
1436 self._print_formats(formats)
# 'best' scans all formats for the first URL that responds; otherwise the
# requested format must exist in the API response.
1439 if req_format is None or req_format == 'best':
1440 for format_param in formats.keys():
1441 url_list = self.get_urls(formats, format_param)
1443 file_url = self.check_urls(url_list)
1444 if file_url is not None:
1447 if req_format not in formats:
1448 raise ExtractorError(u'Format is not available')
1450 url_list = self.get_urls(formats, req_format)
1451 file_url = self.check_urls(url_list)
1452 format_param = req_format
1455 'id': file_id.decode('utf-8'),
1456 'url': file_url.decode('utf-8'),
1457 'uploader': uploader.decode('utf-8'),
1458 'upload_date': None,
1459 'title': json_data['name'],
1460 'ext': file_url.split('.')[-1].decode('utf-8'),
1461 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
1462 'thumbnail': json_data['thumbnail_url'],
1463 'description': json_data['description'],
1464 'player_url': player_url.decode('utf-8'),
1467 class StanfordOpenClassroomIE(InfoExtractor):
1468 """Information extractor for Stanford's Open ClassRoom"""
# NOTE(review): sampled extract -- `try:` lines, dict openers, `return`s and
# some loop headers are missing (fused numbering shows gaps). Three cases are
# handled: a single video, a course page, and the site root (course listing).
1470 _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
1471 IE_NAME = u'stanfordoc'
1473 def _real_extract(self, url):
1474 mobj = re.match(self._VALID_URL, url)
1476 raise ExtractorError(u'Invalid URL: %s' % url)
1478 if mobj.group('course') and mobj.group('video'): # A specific video
1479 course = mobj.group('course')
1480 video = mobj.group('video')
1482 'id': course + '_' + video,
1484 'upload_date': None,
# Per-video metadata comes from an XML file next to the video.
1487 self.report_extraction(info['id'])
1488 baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
1489 xmlUrl = baseUrl + video + '.xml'
1491 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
1492 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1493 raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
1494 mdoc = xml.etree.ElementTree.fromstring(metaXml)
1496 info['title'] = mdoc.findall('./title')[0].text
1497 info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
1499 raise ExtractorError(u'Invalid metadata XML file')
1500 info['ext'] = info['url'].rpartition('.')[2]
1502 elif mobj.group('course'): # A course page
1503 course = mobj.group('course')
1508 'upload_date': None,
1511 coursepage = self._download_webpage(url, info['id'],
1512 note='Downloading course info page',
1513 errnote='Unable to download course info page')
1515 info['title'] = self._html_search_regex('<h1>([^<]+)</h1>', coursepage, 'title', default=info['id'])
1517 info['description'] = self._html_search_regex('<description>([^<]+)</description>',
1518 coursepage, u'description', fatal=False)
# Each VideoPage link becomes a 'reference' entry, recursively extracted below.
1520 links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
1523 'type': 'reference',
1524 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
1528 for entry in info['list']:
1529 assert entry['type'] == 'reference'
1530 results += self.extract(entry['url'])
# Root case: enumerate all course pages and recurse into each.
1534 'id': 'Stanford OpenClassroom',
1537 'upload_date': None,
1540 self.report_download_webpage(info['id'])
1541 rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
1543 rootpage = compat_urllib_request.urlopen(rootURL).read()
1544 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1545 raise ExtractorError(u'Unable to download course info page: ' + compat_str(err))
1547 info['title'] = info['id']
1549 links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
1552 'type': 'reference',
1553 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
1558 for entry in info['list']:
1559 assert entry['type'] == 'reference'
1560 results += self.extract(entry['url'])
1563 class MTVIE(InfoExtractor):
1564 """Information extractor for MTV.com"""
# NOTE(review): sampled extract -- guard lines, a `try:` header, the
# performer extraction and the info dict opener/`return` are missing
# (fused numbering shows gaps).
1566 _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
1569 def _real_extract(self, url):
1570 mobj = re.match(self._VALID_URL, url)
1572 raise ExtractorError(u'Invalid URL: %s' % url)
# Normalize protocol-less URLs before downloading.
1573 if not mobj.group('proto'):
1574 url = 'http://' + url
1575 video_id = mobj.group('videoid')
1577 webpage = self._download_webpage(url, video_id)
# Metadata lives in mtv_* / mtvn_* meta tags on the page.
1579 song_name = self._html_search_regex(r'<meta name="mtv_vt" content="([^"]+)"/>',
1580 webpage, u'song name', fatal=False)
1582 video_title = self._html_search_regex(r'<meta name="mtv_an" content="([^"]+)"/>',
1585 mtvn_uri = self._html_search_regex(r'<meta name="mtvn_uri" content="([^"]+)"/>',
1586 webpage, u'mtvn_uri', fatal=False)
1588 content_id = self._search_regex(r'MTVN.Player.defaultPlaylistId = ([0-9]+);',
1589 webpage, u'content id', fatal=False)
# The mediaGen endpoint returns an XML list of renditions for this video.
1591 videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
1592 self.report_extraction(video_id)
1593 request = compat_urllib_request.Request(videogen_url)
1595 metadataXml = compat_urllib_request.urlopen(request).read()
1596 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1597 raise ExtractorError(u'Unable to download video metadata: %s' % compat_str(err))
1599 mdoc = xml.etree.ElementTree.fromstring(metadataXml)
1600 renditions = mdoc.findall('.//rendition')
1602 # For now, always pick the highest quality.
1603 rendition = renditions[-1]
# Build a format label like "mp4-640x360_800" from the rendition attributes.
1606 _,_,ext = rendition.attrib['type'].partition('/')
1607 format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
1608 video_url = rendition.find('./src').text
1610 raise ExtractorError('Invalid rendition field.')
1615 'uploader': performer,
1616 'upload_date': None,
1617 'title': video_title,
1625 class YoukuIE(InfoExtractor):
# NOTE(review): sampled extract -- the `def _gen_sid(self):` header, several
# loop/branch lines, the `try:` headers and the final return of the playlist
# are missing (fused numbering shows gaps).
1626 _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'
# Session id: current millis + two random blocks (method header line dropped).
1629 nowTime = int(time.time() * 1000)
1630 random1 = random.randint(1000,1998)
1631 random2 = random.randint(1000,9999)
1633 return "%d%d%d" %(nowTime,random1,random2)
1635 def _get_file_ID_mix_string(self, seed):
# Deterministic pseudo-random shuffle of the alphabet, driven by `seed` --
# reproduces the scrambling Youku's player applies to file ids.
1637 source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
1639 for i in range(len(source)):
1640 seed = (seed * 211 + 30031 ) % 65536
1641 index = math.floor(seed / 65536 * len(source) )
1642 mixed.append(source[int(index)])
1643 source.remove(source[int(index)])
1644 #return ''.join(mixed)
1647 def _get_file_id(self, fileId, seed):
# Decode the '*'-separated index list through the seeded mix string.
1648 mixed = self._get_file_ID_mix_string(seed)
1649 ids = fileId.split('*')
1653 realId.append(mixed[int(ch)])
1654 return ''.join(realId)
1656 def _real_extract(self, url):
1657 mobj = re.match(self._VALID_URL, url)
1659 raise ExtractorError(u'Invalid URL: %s' % url)
1660 video_id = mobj.group('ID')
1662 info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id
1664 jsondata = self._download_webpage(info_url, video_id)
1666 self.report_extraction(video_id)
1668 config = json.loads(jsondata)
1670 video_title = config['data'][0]['title']
1671 seed = config['data'][0]['seed']
# Format selection: 'best' prefers hd2 when offered; 'worst' branch body and
# the default branch are among the dropped lines.
1673 format = self._downloader.params.get('format', None)
1674 supported_format = list(config['data'][0]['streamfileids'].keys())
1676 if format is None or format == 'best':
1677 if 'hd2' in supported_format:
1682 elif format == 'worst':
1690 fileid = config['data'][0]['streamfileids'][format]
1691 keys = [s['k'] for s in config['data'][0]['segs'][format]]
1692 except (UnicodeDecodeError, ValueError, KeyError):
1693 raise ExtractorError(u'Unable to extract info section')
1696 sid = self._gen_sid()
1697 fileid = self._get_file_id(fileid, seed)
1699 #column 8,9 of fileid represent the segment number
1700 #fileid[7:9] should be changed
# Each segment gets its own URL; the segment index is spliced into the file id.
1701 for index, key in enumerate(keys):
1703 temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
1704 download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)
1707 'id': '%s_part%02d' % (video_id, index),
1708 'url': download_url,
1710 'upload_date': None,
1711 'title': video_title,
1714 files_info.append(info)
1719 class XNXXIE(InfoExtractor):
1720 """Information extractor for xnxx.com"""
# NOTE(review): sampled extract -- the guard clause, a regex continuation
# line and the info dict opener/`return` are missing (numbering shows gaps).
1722 _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
# Page-scraping patterns: flash URL, <title> prefix, and big-thumbnail URL.
1724 VIDEO_URL_RE = r'flv_url=(.*?)&'
1725 VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
1726 VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&'
1728 def _real_extract(self, url):
1729 mobj = re.match(self._VALID_URL, url)
1731 raise ExtractorError(u'Invalid URL: %s' % url)
1732 video_id = mobj.group(1)
1734 # Get webpage content
1735 webpage = self._download_webpage(url, video_id)
# The flash URL is percent-encoded inside the page source.
1737 video_url = self._search_regex(self.VIDEO_URL_RE,
1738 webpage, u'video URL')
1739 video_url = compat_urllib_parse.unquote(video_url)
1741 video_title = self._html_search_regex(self.VIDEO_TITLE_RE,
1744 video_thumbnail = self._search_regex(self.VIDEO_THUMB_RE,
1745 webpage, u'thumbnail', fatal=False)
1751 'upload_date': None,
1752 'title': video_title,
1754 'thumbnail': video_thumbnail,
1755 'description': None,
1759 class GooglePlusIE(InfoExtractor):
1760 """Information extractor for plus.google.com."""
# NOTE(review): sampled extract -- the guard clause, `if mobj is None` after
# findall, and the info dict opener/`return` are missing (numbering shows gaps).
1762 _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
1763 IE_NAME = u'plus.google'
1765 def _real_extract(self, url):
1766 # Extract id from URL
1767 mobj = re.match(self._VALID_URL, url)
1769 raise ExtractorError(u'Invalid URL: %s' % url)
1771 post_url = mobj.group(0)
1772 video_id = mobj.group(1)
1774 video_extension = 'flv'
1776 # Step 1, Retrieve post webpage to extract further information
1777 webpage = self._download_webpage(post_url, video_id, u'Downloading entry webpage')
1779 self.report_extraction(video_id)
1781 # Extract update date
1782 upload_date = self._html_search_regex('title="Timestamp">(.*?)</a>',
1783 webpage, u'upload date', fatal=False)
1785 # Convert timestring to a format suitable for filename
# Reformat the scraped ISO-style date (YYYY-MM-DD) to YYYYMMDD.
1786 upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
1787 upload_date = upload_date.strftime('%Y%m%d')
1790 uploader = self._html_search_regex(r'rel\="author".*?>(.*?)</a>',
1791 webpage, u'uploader', fatal=False)
1794 # Get the first line for title
1795 video_title = self._html_search_regex(r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]',
1796 webpage, 'title', default=u'NA')
1798 # Step 2, Stimulate clicking the image box to launch video
1799 video_page = self._search_regex('"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]',
1800 webpage, u'video page URL')
1801 webpage = self._download_webpage(video_page, video_id, u'Downloading video page')
1803 # Extract video links on video page
1804 """Extract video links of all sizes"""
1805 pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
1806 mobj = re.findall(pattern, webpage)
1808 raise ExtractorError(u'Unable to extract video links')
1810 # Sort in resolution
1811 links = sorted(mobj)
1813 # Choose the lowest of the sort, i.e. highest resolution
1814 video_url = links[-1]
1815 # Only get the url. The resolution part in the tuple has no use anymore
1816 video_url = video_url[-1]
1817 # Treat escaped \u0026 style hex
# Python 2 str has .decode; Python 3 str does not, hence the fallback below.
1819 video_url = video_url.decode("unicode_escape")
1820 except AttributeError: # Python 3
1821 video_url = bytes(video_url, 'ascii').decode('unicode-escape')
1827 'uploader': uploader,
1828 'upload_date': upload_date,
1829 'title': video_title,
1830 'ext': video_extension,
1833 class NBAIE(InfoExtractor):
# NOTE(review): sampled extract -- class docstring/IE_NAME (if any), guard
# clause and the info dict opener/`return` are missing (numbering shows gaps).
1834 _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*?)(?:/index\.html)?(?:\?.*)?$'
1837 def _real_extract(self, url):
1838 mobj = re.match(self._VALID_URL, url)
1840 raise ExtractorError(u'Invalid URL: %s' % url)
1842 video_id = mobj.group(1)
1844 webpage = self._download_webpage(url, video_id)
# The CDN URL is built directly from the path captured in the page URL.
1846 video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'
1848 shortened_video_id = video_id.rpartition('/')[2]
# Strip the site-name prefix that og:title carries.
1849 title = self._html_search_regex(r'<meta property="og:title" content="(.*?)"',
1850 webpage, 'title', default=shortened_video_id).replace('NBA.com: ', '')
1852 # It isn't there in the HTML it returns to us
1853 # uploader_date = self._html_search_regex(r'<b>Date:</b> (.*?)</div>', webpage, 'upload_date', fatal=False)
1855 description = self._html_search_regex(r'<meta name="description" (?:content|value)="(.*?)" />', webpage, 'description', fatal=False)
1858 'id': shortened_video_id,
1862 # 'uploader_date': uploader_date,
1863 'description': description,
1867 class JustinTVIE(InfoExtractor):
1868 """Information extractor for justin.tv and twitch.tv"""
1869 # TODO: One broadcast may be split into multiple videos. The key
1870 # 'broadcast_id' is the same for all parts, and 'broadcast_part'
1871 # starts at 1 and increases. Can we treat all parts as one video?
# NOTE(review): sampled extract -- parts of the verbose regex, dict openers,
# `return`s, the `paged`/`offset` setup and the pagination `while` header are
# missing (fused numbering shows gaps).
1873 _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
1875 (?P<channelid>[^/]+)|
1876 (?:(?:[^/]+)/b/(?P<videoid>[^/]+))|
1877 (?:(?:[^/]+)/c/(?P<chapterid>[^/]+))
1881 _JUSTIN_PAGE_LIMIT = 100
1882 IE_NAME = u'justin.tv'
1884 def report_download_page(self, channel, offset):
1885 """Report attempt to download a single page of videos."""
1886 self.to_screen(u'%s: Downloading video information from %d to %d' %
1887 (channel, offset, offset + self._JUSTIN_PAGE_LIMIT))
1889 # Return count of items, list of *valid* items
1890 def _parse_page(self, url, video_id):
# Downloads one API page and converts each clip dict into an info dict.
1891 webpage = self._download_webpage(url, video_id,
1892 u'Downloading video info JSON',
1893 u'unable to download video info JSON')
1895 response = json.loads(webpage)
# A non-list response is an API error envelope.
1896 if type(response) != list:
1897 error_text = response.get('error', 'unknown error')
1898 raise ExtractorError(u'Justin.tv API: %s' % error_text)
1900 for clip in response:
1901 video_url = clip['video_file_url']
1903 video_extension = os.path.splitext(video_url)[1][1:]
# start_time is ISO-style; drop the dashes to get YYYYMMDD.
1904 video_date = re.sub('-', '', clip['start_time'][:10])
1905 video_uploader_id = clip.get('user_id', clip.get('channel_id'))
1906 video_id = clip['id']
1907 video_title = clip.get('title', video_id)
1911 'title': video_title,
1912 'uploader': clip.get('channel_name', video_uploader_id),
1913 'uploader_id': video_uploader_id,
1914 'upload_date': video_date,
1915 'ext': video_extension,
1917 return (len(response), info)
1919 def _real_extract(self, url):
1920 mobj = re.match(self._VALID_URL, url)
1922 raise ExtractorError(u'invalid URL: %s' % url)
1924 api_base = 'http://api.justin.tv'
# Three URL shapes: whole channel (paged), single chapter, single broadcast.
1926 if mobj.group('channelid'):
1928 video_id = mobj.group('channelid')
1929 api = api_base + '/channel/archives/%s.json' % video_id
1930 elif mobj.group('chapterid'):
1931 chapter_id = mobj.group('chapterid')
1933 webpage = self._download_webpage(url, chapter_id)
1934 m = re.search(r'PP\.archive_id = "([0-9]+)";', webpage)
1936 raise ExtractorError(u'Cannot find archive of a chapter')
1937 archive_id = m.group(1)
# Locate the archive entry for this chapter in the by_chapter XML.
1939 api = api_base + '/broadcast/by_chapter/%s.xml' % chapter_id
1940 chapter_info_xml = self._download_webpage(api, chapter_id,
1941 note=u'Downloading chapter information',
1942 errnote=u'Chapter information download failed')
1943 doc = xml.etree.ElementTree.fromstring(chapter_info_xml)
1944 for a in doc.findall('.//archive'):
1945 if archive_id == a.find('./id').text:
1948 raise ExtractorError(u'Could not find chapter in chapter information')
1950 video_url = a.find('./video_file_url').text
1951 video_ext = video_url.rpartition('.')[2] or u'flv'
# Chapter title/thumbnail/etc. come from the newer Kraken API.
1953 chapter_api_url = u'https://api.twitch.tv/kraken/videos/c' + chapter_id
1954 chapter_info_json = self._download_webpage(chapter_api_url, u'c' + chapter_id,
1955 note='Downloading chapter metadata',
1956 errnote='Download of chapter metadata failed')
1957 chapter_info = json.loads(chapter_info_json)
1959 bracket_start = int(doc.find('.//bracket_start').text)
1960 bracket_end = int(doc.find('.//bracket_end').text)
1962 # TODO determine start (and probably fix up file)
1963 # youtube-dl -v http://www.twitch.tv/firmbelief/c/1757457
1964 #video_url += u'?start=' + TODO:start_timestamp
1965 # bracket_start is 13290, but we want 51670615
1966 self._downloader.report_warning(u'Chapter detected, but we can just download the whole file. '
1967 u'Chapter starts at %s and ends at %s' % (formatSeconds(bracket_start), formatSeconds(bracket_end)))
1970 'id': u'c' + chapter_id,
1973 'title': chapter_info['title'],
1974 'thumbnail': chapter_info['preview'],
1975 'description': chapter_info['description'],
1976 'uploader': chapter_info['channel']['display_name'],
1977 'uploader_id': chapter_info['channel']['name'],
1981 video_id = mobj.group('videoid')
1982 api = api_base + '/broadcast/by_archive/%s.json' % video_id
1984 self.report_extraction(video_id)
# Paged fetch loop: stop when a page comes back short (loop header dropped).
1988 limit = self._JUSTIN_PAGE_LIMIT
1991 self.report_download_page(video_id, offset)
1992 page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
1993 page_count, page_info = self._parse_page(page_url, video_id)
1994 info.extend(page_info)
1995 if not paged or page_count != limit:
2000 class FunnyOrDieIE(InfoExtractor):
# NOTE(review): sampled extract -- guard clause and the info dict opener /
# `return` are missing (fused numbering shows gaps).
2001 _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'
2003 def _real_extract(self, url):
2004 mobj = re.match(self._VALID_URL, url)
2006 raise ExtractorError(u'invalid URL: %s' % url)
2008 video_id = mobj.group('id')
2009 webpage = self._download_webpage(url, video_id)
# Media URL is the second <source> inside the <video> tag.
2011 video_url = self._html_search_regex(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"',
2012 webpage, u'video URL', flags=re.DOTALL)
# Title: prefer the player h1, fall back to <title>.
2014 title = self._html_search_regex((r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>",
2015 r'<title>(?P<title>[^<]+?)</title>'), webpage, 'title', flags=re.DOTALL)
2017 video_description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
2018 webpage, u'description', fatal=False, flags=re.DOTALL)
2025 'description': video_description,
2029 class SteamIE(InfoExtractor):
# NOTE(review): sampled extract -- parts of the verbose regex, the videos
# list initialization and some loop body lines are missing (numbering shows gaps).
2030 _VALID_URL = r"""http://store\.steampowered\.com/
2032 (?P<urltype>video|app)/ #If the page is only for videos or for a game
2034 (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
2036 _VIDEO_PAGE_TEMPLATE = 'http://store.steampowered.com/video/%s/'
2037 _AGECHECK_TEMPLATE = 'http://store.steampowered.com/agecheck/video/%s/?snr=1_agecheck_agecheck__age-gate&ageDay=1&ageMonth=January&ageYear=1970'
2040 def suitable(cls, url):
2041 """Receives a URL and returns True if suitable for this IE."""
# Overridden because _VALID_URL is a verbose (re.VERBOSE) pattern.
2042 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
2044 def _real_extract(self, url):
2045 m = re.match(self._VALID_URL, url, re.VERBOSE)
2046 gameID = m.group('gameID')
2048 videourl = self._VIDEO_PAGE_TEMPLATE % gameID
2049 webpage = self._download_webpage(videourl, gameID)
# Age-gated pages are refetched through the agecheck URL with a fixed DOB.
2051 if re.search('<h2>Please enter your birth date to continue:</h2>', webpage) is not None:
2052 videourl = self._AGECHECK_TEMPLATE % gameID
2053 self.report_age_confirmation()
2054 webpage = self._download_webpage(videourl, gameID)
2056 self.report_extraction(gameID)
2057 game_title = self._html_search_regex(r'<h2 class="pageheader">(.*?)</h2>',
2058 webpage, 'game title')
# Three parallel scans: movie entries, their display titles, their thumbs.
2060 urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
2061 mweb = re.finditer(urlRE, webpage)
2062 namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
2063 titles = re.finditer(namesRE, webpage)
2064 thumbsRE = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'
2065 thumbs = re.finditer(thumbsRE, webpage)
# zip() silently truncates to the shortest iterator if the scans disagree.
2067 for vid,vtitle,thumb in zip(mweb,titles,thumbs):
2068 video_id = vid.group('videoID')
2069 title = vtitle.group('videoName')
2070 video_url = vid.group('videoURL')
2071 video_thumb = thumb.group('thumbnail')
2073 raise ExtractorError(u'Cannot find video url for %s' % video_id)
2078 'title': unescapeHTML(title),
2079 'thumbnail': video_thumb
2082 return [self.playlist_result(videos, gameID, game_title)]
2084 class UstreamIE(InfoExtractor):
# NOTE(review): sampled extract -- the info dict opener and `return` are
# among the dropped lines (fused numbering shows gaps).
2085 _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
2086 IE_NAME = u'ustream'
2088 def _real_extract(self, url):
2089 m = re.match(self._VALID_URL, url)
2090 video_id = m.group('videoID')
# Media URL is derived directly from the recorded-video id on the CDN.
2092 video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
2093 webpage = self._download_webpage(url, video_id)
2095 self.report_extraction(video_id)
2097 video_title = self._html_search_regex(r'data-title="(?P<title>.+)"',
2100 uploader = self._html_search_regex(r'data-content-type="channel".*?>(?P<uploader>.*?)</a>',
2101 webpage, u'uploader', fatal=False, flags=re.DOTALL)
2103 thumbnail = self._html_search_regex(r'<link rel="image_src" href="(?P<thumb>.*?)"',
2104 webpage, u'thumbnail', fatal=False)
2110 'title': video_title,
2111 'uploader': uploader,
2112 'thumbnail': thumbnail,
2116 class WorldStarHipHopIE(InfoExtractor):
# NOTE(review): sampled extract -- the mp4/flv ext branches and the info
# dict opener/`return` are among the dropped lines.
2117 _VALID_URL = r'https?://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
2118 IE_NAME = u'WorldStarHipHop'
2120 def _real_extract(self, url):
2121 m = re.match(self._VALID_URL, url)
2122 video_id = m.group('id')
2124 webpage_src = self._download_webpage(url, video_id)
# Flash-player setup call carries the media file URL.
2126 video_url = self._search_regex(r'so\.addVariable\("file","(.*?)"\)',
2127 webpage_src, u'video URL')
# Extension branch bodies were dropped; only the mp4 test survives here.
2129 if 'mp4' in video_url:
2134 video_title = self._html_search_regex(r"<title>(.*)</title>",
2135 webpage_src, u'title')
2137 # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video.
2138 thumbnail = self._html_search_regex(r'rel="image_src" href="(.*)" />',
2139 webpage_src, u'thumbnail', fatal=False)
# Candy pages keep the real title in a different span; override if found.
2142 _title = r"""candytitles.*>(.*)</span>"""
2143 mobj = re.search(_title, webpage_src)
2144 if mobj is not None:
2145 video_title = mobj.group(1)
2150 'title' : video_title,
2151 'thumbnail' : thumbnail,
2156 class RBMARadioIE(InfoExtractor):
# NOTE(review): sampled extract -- the `try:` header and the info dict
# opener/`return` are among the dropped lines.
2157 _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'
2159 def _real_extract(self, url):
2160 m = re.match(self._VALID_URL, url)
2161 video_id = m.group('videoID')
2163 webpage = self._download_webpage(url, video_id)
# Show metadata is embedded as a JS assignment (`gon.show=...;`).
2165 json_data = self._search_regex(r'window\.gon.*?gon\.show=(.+?);$',
2166 webpage, u'json data', flags=re.MULTILINE)
2169 data = json.loads(json_data)
2170 except ValueError as e:
2171 raise ExtractorError(u'Invalid JSON: ' + str(e))
# Force 256 kbps on the Akamai URL; extension comes from the URL path.
2173 video_url = data['akamai_url'] + '&cbr=256'
2174 url_parts = compat_urllib_parse_urlparse(video_url)
2175 video_ext = url_parts.path.rpartition('.')[2]
2180 'title': data['title'],
2181 'description': data.get('teaser_text'),
2182 'location': data.get('country_of_origin'),
2183 'uploader': data.get('host', {}).get('name'),
2184 'uploader_id': data.get('host', {}).get('slug'),
2185 'thumbnail': data.get('image', {}).get('large_url_2x'),
2186 'duration': data.get('duration'),
2191 class YouPornIE(InfoExtractor):
2192 """Information extractor for youporn.com."""
2193 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'
2195 def _print_formats(self, formats):
2196 """Print all available formats"""
2197 print(u'Available formats:')
2198 print(u'ext\t\tformat')
2199 print(u'---------------------------------')
2200 for format in formats:
2201 print(u'%s\t\t%s' % (format['ext'], format['format']))
2203 def _specific(self, req_format, formats):
2205 if(x["format"]==req_format):
2209 def _real_extract(self, url):
2210 mobj = re.match(self._VALID_URL, url)
2212 raise ExtractorError(u'Invalid URL: %s' % url)
2213 video_id = mobj.group('videoid')
2215 req = compat_urllib_request.Request(url)
2216 req.add_header('Cookie', 'age_verified=1')
2217 webpage = self._download_webpage(req, video_id)
2219 # Get JSON parameters
2220 json_params = self._search_regex(r'var currentVideo = new Video\((.*)\);', webpage, u'JSON parameters')
2222 params = json.loads(json_params)
2224 raise ExtractorError(u'Invalid JSON')
2226 self.report_extraction(video_id)
2228 video_title = params['title']
2229 upload_date = unified_strdate(params['release_date_f'])
2230 video_description = params['description']
2231 video_uploader = params['submitted_by']
2232 thumbnail = params['thumbnails'][0]['image']
2234 raise ExtractorError('Missing JSON parameter: ' + sys.exc_info()[1])
2236 # Get all of the formats available
2237 DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
2238 download_list_html = self._search_regex(DOWNLOAD_LIST_RE,
2239 webpage, u'download list').strip()
2241 # Get all of the links from the page
2242 LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
2243 links = re.findall(LINK_RE, download_list_html)
2244 if(len(links) == 0):
2245 raise ExtractorError(u'ERROR: no known formats available for video')
2247 self.to_screen(u'Links found: %d' % len(links))
2252 # A link looks like this:
2253 # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
2254 # A path looks like this:
2255 # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
2256 video_url = unescapeHTML( link )
2257 path = compat_urllib_parse_urlparse( video_url ).path
2258 extension = os.path.splitext( path )[1][1:]
2259 format = path.split('/')[4].split('_')[:2]
2262 format = "-".join( format )
2263 # title = u'%s-%s-%s' % (video_title, size, bitrate)
2268 'uploader': video_uploader,
2269 'upload_date': upload_date,
2270 'title': video_title,
2273 'thumbnail': thumbnail,
2274 'description': video_description
2277 if self._downloader.params.get('listformats', None):
2278 self._print_formats(formats)
2281 req_format = self._downloader.params.get('format', None)
2282 self.to_screen(u'Format: %s' % req_format)
2284 if req_format is None or req_format == 'best':
2286 elif req_format == 'worst':
2287 return [formats[-1]]
2288 elif req_format in ('-1', 'all'):
2291 format = self._specific( req_format, formats )
2293 raise ExtractorError(u'Requested format not available')
2298 class PornotubeIE(InfoExtractor):
2299 """Information extractor for pornotube.com."""
2300 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'
2302 def _real_extract(self, url):
2303 mobj = re.match(self._VALID_URL, url)
2305 raise ExtractorError(u'Invalid URL: %s' % url)
2307 video_id = mobj.group('videoid')
2308 video_title = mobj.group('title')
2310 # Get webpage content
2311 webpage = self._download_webpage(url, video_id)
2314 VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
2315 video_url = self._search_regex(VIDEO_URL_RE, webpage, u'video url')
2316 video_url = compat_urllib_parse.unquote(video_url)
2318 #Get the uploaded date
2319 VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
2320 upload_date = self._html_search_regex(VIDEO_UPLOADED_RE, webpage, u'upload date', fatal=False)
2321 if upload_date: upload_date = unified_strdate(upload_date)
2323 info = {'id': video_id,
2326 'upload_date': upload_date,
2327 'title': video_title,
2333 class YouJizzIE(InfoExtractor):
2334 """Information extractor for youjizz.com."""
2335 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'
2337 def _real_extract(self, url):
2338 mobj = re.match(self._VALID_URL, url)
2340 raise ExtractorError(u'Invalid URL: %s' % url)
2342 video_id = mobj.group('videoid')
2344 # Get webpage content
2345 webpage = self._download_webpage(url, video_id)
2347 # Get the video title
2348 video_title = self._html_search_regex(r'<title>(?P<title>.*)</title>',
2349 webpage, u'title').strip()
2351 # Get the embed page
2352 result = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
2354 raise ExtractorError(u'ERROR: unable to extract embed page')
2356 embed_page_url = result.group(0).strip()
2357 video_id = result.group('videoid')
2359 webpage = self._download_webpage(embed_page_url, video_id)
2362 video_url = self._search_regex(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);',
2363 webpage, u'video URL')
2365 info = {'id': video_id,
2367 'title': video_title,
2370 'player_url': embed_page_url}
2374 class EightTracksIE(InfoExtractor):
2376 _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'
2378 def _real_extract(self, url):
2379 mobj = re.match(self._VALID_URL, url)
2381 raise ExtractorError(u'Invalid URL: %s' % url)
2382 playlist_id = mobj.group('id')
2384 webpage = self._download_webpage(url, playlist_id)
2386 json_like = self._search_regex(r"PAGE.mix = (.*?);\n", webpage, u'trax information', flags=re.DOTALL)
2387 data = json.loads(json_like)
2389 session = str(random.randint(0, 1000000000))
2391 track_count = data['tracks_count']
2392 first_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
2393 next_url = first_url
2395 for i in itertools.count():
2396 api_json = self._download_webpage(next_url, playlist_id,
2397 note=u'Downloading song information %s/%s' % (str(i+1), track_count),
2398 errnote=u'Failed to download song information')
2399 api_data = json.loads(api_json)
2400 track_data = api_data[u'set']['track']
2402 'id': track_data['id'],
2403 'url': track_data['track_file_stream_url'],
2404 'title': track_data['performer'] + u' - ' + track_data['name'],
2405 'raw_title': track_data['name'],
2406 'uploader_id': data['user']['login'],
2410 if api_data['set']['at_last_track']:
2412 next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
2415 class KeekIE(InfoExtractor):
2416 _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
2419 def _real_extract(self, url):
2420 m = re.match(self._VALID_URL, url)
2421 video_id = m.group('videoID')
2423 video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
2424 thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id
2425 webpage = self._download_webpage(url, video_id)
2427 video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
2430 uploader = self._html_search_regex(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>',
2431 webpage, u'uploader', fatal=False)
2437 'title': video_title,
2438 'thumbnail': thumbnail,
2439 'uploader': uploader
2443 class TEDIE(InfoExtractor):
2444 _VALID_URL=r'''http://www\.ted\.com/
2446 ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
2448 ((?P<type_talk>talks)) # We have a simple talk
2450 (/lang/(.*?))? # The url may contain the language
2451 /(?P<name>\w+) # Here goes the name and then ".html"
2455 def suitable(cls, url):
2456 """Receives a URL and returns True if suitable for this IE."""
2457 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
2459 def _real_extract(self, url):
2460 m=re.match(self._VALID_URL, url, re.VERBOSE)
2461 if m.group('type_talk'):
2462 return [self._talk_info(url)]
2464 playlist_id=m.group('playlist_id')
2465 name=m.group('name')
2466 self.to_screen(u'Getting info of playlist %s: "%s"' % (playlist_id,name))
2467 return [self._playlist_videos_info(url,name,playlist_id)]
2469 def _playlist_videos_info(self,url,name,playlist_id=0):
2470 '''Returns the videos of the playlist'''
2472 <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
2473 ([.\s]*?)data-playlist_item_id="(\d+)"
2474 ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
2476 video_name_RE=r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
2477 webpage=self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
2478 m_videos=re.finditer(video_RE,webpage,re.VERBOSE)
2479 m_names=re.finditer(video_name_RE,webpage)
2481 playlist_title = self._html_search_regex(r'div class="headline">\s*?<h1>\s*?<span>(.*?)</span>',
2482 webpage, 'playlist title')
2484 playlist_entries = []
2485 for m_video, m_name in zip(m_videos,m_names):
2486 video_id=m_video.group('video_id')
2487 talk_url='http://www.ted.com%s' % m_name.group('talk_url')
2488 playlist_entries.append(self.url_result(talk_url, 'TED'))
2489 return self.playlist_result(playlist_entries, playlist_id = playlist_id, playlist_title = playlist_title)
2491 def _talk_info(self, url, video_id=0):
2492 """Return the video for the talk in the url"""
2493 m = re.match(self._VALID_URL, url,re.VERBOSE)
2494 video_name = m.group('name')
2495 webpage = self._download_webpage(url, video_id, 'Downloading \"%s\" page' % video_name)
2496 self.report_extraction(video_name)
2497 # If the url includes the language we get the title translated
2498 title = self._html_search_regex(r'<span id="altHeadline" >(?P<title>.*)</span>',
2500 json_data = self._search_regex(r'<script.*?>var talkDetails = ({.*?})</script>',
2501 webpage, 'json data')
2502 info = json.loads(json_data)
2503 desc = self._html_search_regex(r'<div class="talk-intro">.*?<p.*?>(.*?)</p>',
2504 webpage, 'description', flags = re.DOTALL)
2506 thumbnail = self._search_regex(r'</span>[\s.]*</div>[\s.]*<img src="(.*?)"',
2507 webpage, 'thumbnail')
2510 'url': info['htmlStreams'][-1]['file'],
2513 'thumbnail': thumbnail,
2514 'description': desc,
2518 class MySpassIE(InfoExtractor):
2519 _VALID_URL = r'http://www.myspass.de/.*'
2521 def _real_extract(self, url):
2522 META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'
2524 # video id is the last path element of the URL
2525 # usually there is a trailing slash, so also try the second but last
2526 url_path = compat_urllib_parse_urlparse(url).path
2527 url_parent_path, video_id = os.path.split(url_path)
2529 _, video_id = os.path.split(url_parent_path)
2532 metadata_url = META_DATA_URL_TEMPLATE % video_id
2533 metadata_text = self._download_webpage(metadata_url, video_id)
2534 metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))
2536 # extract values from metadata
2537 url_flv_el = metadata.find('url_flv')
2538 if url_flv_el is None:
2539 raise ExtractorError(u'Unable to extract download url')
2540 video_url = url_flv_el.text
2541 extension = os.path.splitext(video_url)[1][1:]
2542 title_el = metadata.find('title')
2543 if title_el is None:
2544 raise ExtractorError(u'Unable to extract title')
2545 title = title_el.text
2546 format_id_el = metadata.find('format_id')
2547 if format_id_el is None:
2550 format = format_id_el.text
2551 description_el = metadata.find('description')
2552 if description_el is not None:
2553 description = description_el.text
2556 imagePreview_el = metadata.find('imagePreview')
2557 if imagePreview_el is not None:
2558 thumbnail = imagePreview_el.text
2567 'thumbnail': thumbnail,
2568 'description': description
2572 class SpiegelIE(InfoExtractor):
2573 _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'
2575 def _real_extract(self, url):
2576 m = re.match(self._VALID_URL, url)
2577 video_id = m.group('videoID')
2579 webpage = self._download_webpage(url, video_id)
2581 video_title = self._html_search_regex(r'<div class="module-title">(.*?)</div>',
2584 xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
2585 xml_code = self._download_webpage(xml_url, video_id,
2586 note=u'Downloading XML', errnote=u'Failed to download XML')
2588 idoc = xml.etree.ElementTree.fromstring(xml_code)
2589 last_type = idoc[-1]
2590 filename = last_type.findall('./filename')[0].text
2591 duration = float(last_type.findall('./duration')[0].text)
2593 video_url = 'http://video2.spiegel.de/flash/' + filename
2594 video_ext = filename.rpartition('.')[2]
2599 'title': video_title,
2600 'duration': duration,
2604 class LiveLeakIE(InfoExtractor):
2606 _VALID_URL = r'^(?:http?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
2607 IE_NAME = u'liveleak'
2609 def _real_extract(self, url):
2610 mobj = re.match(self._VALID_URL, url)
2612 raise ExtractorError(u'Invalid URL: %s' % url)
2614 video_id = mobj.group('video_id')
2616 webpage = self._download_webpage(url, video_id)
2618 video_url = self._search_regex(r'file: "(.*?)",',
2619 webpage, u'video URL')
2621 video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
2622 webpage, u'title').replace('LiveLeak.com -', '').strip()
2624 video_description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
2625 webpage, u'description', fatal=False)
2627 video_uploader = self._html_search_regex(r'By:.*?(\w+)</a>',
2628 webpage, u'uploader', fatal=False)
2634 'title': video_title,
2635 'description': video_description,
2636 'uploader': video_uploader
2641 class ARDIE(InfoExtractor):
2642 _VALID_URL = r'^(?:https?://)?(?:(?:www\.)?ardmediathek\.de|mediathek\.daserste\.de)/(?:.*/)(?P<video_id>[^/\?]+)(?:\?.*)?'
2643 _TITLE = r'<h1(?: class="boxTopHeadline")?>(?P<title>.*)</h1>'
2644 _MEDIA_STREAM = r'mediaCollection\.addMediaStream\((?P<media_type>\d+), (?P<quality>\d+), "(?P<rtmp_url>[^"]*)", "(?P<video_url>[^"]*)", "[^"]*"\)'
2646 def _real_extract(self, url):
2647 # determine video id from url
2648 m = re.match(self._VALID_URL, url)
2650 numid = re.search(r'documentId=([0-9]+)', url)
2652 video_id = numid.group(1)
2654 video_id = m.group('video_id')
2656 # determine title and media streams from webpage
2657 html = self._download_webpage(url, video_id)
2658 title = re.search(self._TITLE, html).group('title')
2659 streams = [m.groupdict() for m in re.finditer(self._MEDIA_STREAM, html)]
2661 assert '"fsk"' in html
2662 raise ExtractorError(u'This video is only available after 8:00 pm')
2664 # choose default media type and highest quality for now
2665 stream = max([s for s in streams if int(s["media_type"]) == 0],
2666 key=lambda s: int(s["quality"]))
2668 # there's two possibilities: RTMP stream or HTTP download
2669 info = {'id': video_id, 'title': title, 'ext': 'mp4'}
2670 if stream['rtmp_url']:
2671 self.to_screen(u'RTMP download detected')
2672 assert stream['video_url'].startswith('mp4:')
2673 info["url"] = stream["rtmp_url"]
2674 info["play_path"] = stream['video_url']
2676 assert stream["video_url"].endswith('.mp4')
2677 info["url"] = stream["video_url"]
2680 class ZDFIE(InfoExtractor):
2681 _VALID_URL = r'^http://www\.zdf\.de\/ZDFmediathek\/(.*beitrag\/video\/)(?P<video_id>[^/\?]+)(?:\?.*)?'
2682 _TITLE = r'<h1(?: class="beitragHeadline")?>(?P<title>.*)</h1>'
2683 _MEDIA_STREAM = r'<a href="(?P<video_url>.+(?P<media_type>.streaming).+/zdf/(?P<quality>[^\/]+)/[^"]*)".+class="play".+>'
2684 _MMS_STREAM = r'href="(?P<video_url>mms://[^"]*)"'
2685 _RTSP_STREAM = r'(?P<video_url>rtsp://[^"]*.mp4)'
2687 def _real_extract(self, url):
2688 mobj = re.match(self._VALID_URL, url)
2690 raise ExtractorError(u'Invalid URL: %s' % url)
2691 video_id = mobj.group('video_id')
2693 html = self._download_webpage(url, video_id)
2694 streams = [m.groupdict() for m in re.finditer(self._MEDIA_STREAM, html)]
2696 raise ExtractorError(u'No media url found.')
2698 # s['media_type'] == 'wstreaming' -> use 'Windows Media Player' and mms url
2699 # s['media_type'] == 'hstreaming' -> use 'Quicktime' and rtsp url
2700 # choose first/default media type and highest quality for now
2701 for s in streams: #find 300 - dsl1000mbit
2702 if s['quality'] == '300' and s['media_type'] == 'wstreaming':
2705 for s in streams: #find veryhigh - dsl2000mbit
2706 if s['quality'] == 'veryhigh' and s['media_type'] == 'wstreaming': # 'hstreaming' - rtsp is not working
2710 raise ExtractorError(u'No stream found.')
2712 media_link = self._download_webpage(stream_['video_url'], video_id,'Get stream URL')
2714 self.report_extraction(video_id)
2715 mobj = re.search(self._TITLE, html)
2717 raise ExtractorError(u'Cannot extract title')
2718 title = unescapeHTML(mobj.group('title'))
2720 mobj = re.search(self._MMS_STREAM, media_link)
2722 mobj = re.search(self._RTSP_STREAM, media_link)
2724 raise ExtractorError(u'Cannot extract mms:// or rtsp:// URL')
2725 mms_url = mobj.group('video_url')
2727 mobj = re.search('(.*)[.](?P<ext>[^.]+)', mms_url)
2729 raise ExtractorError(u'Cannot extract extention')
2730 ext = mobj.group('ext')
2732 return [{'id': video_id,
2738 class TumblrIE(InfoExtractor):
2739 _VALID_URL = r'http://(?P<blog_name>.*?)\.tumblr\.com/((post)|(video))/(?P<id>\d*)/(.*?)'
2741 def _real_extract(self, url):
2742 m_url = re.match(self._VALID_URL, url)
2743 video_id = m_url.group('id')
2744 blog = m_url.group('blog_name')
2746 url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id)
2747 webpage = self._download_webpage(url, video_id)
2749 re_video = r'src=\\x22(?P<video_url>http://%s\.tumblr\.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P<ext>.*?)\\x22' % (blog, video_id)
2750 video = re.search(re_video, webpage)
2752 raise ExtractorError(u'Unable to extract video')
2753 video_url = video.group('video_url')
2754 ext = video.group('ext')
2756 video_thumbnail = self._search_regex(r'posters(.*?)\[\\x22(?P<thumb>.*?)\\x22',
2757 webpage, u'thumbnail', fatal=False) # We pick the first poster
2758 if video_thumbnail: video_thumbnail = video_thumbnail.replace('\\', '')
2760 # The only place where you can get a title, it's not complete,
2761 # but searching in other places doesn't work for all videos
2762 video_title = self._html_search_regex(r'<title>(?P<title>.*?)</title>',
2763 webpage, u'title', flags=re.DOTALL)
2765 return [{'id': video_id,
2767 'title': video_title,
2768 'thumbnail': video_thumbnail,
2772 class BandcampIE(InfoExtractor):
2773 _VALID_URL = r'http://.*?\.bandcamp\.com/track/(?P<title>.*)'
2775 def _real_extract(self, url):
2776 mobj = re.match(self._VALID_URL, url)
2777 title = mobj.group('title')
2778 webpage = self._download_webpage(url, title)
2779 # We get the link to the free download page
2780 m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage)
2781 if m_download is None:
2782 raise ExtractorError(u'No free songs found')
2784 download_link = m_download.group(1)
2785 id = re.search(r'var TralbumData = {(.*?)id: (?P<id>\d*?)$',
2786 webpage, re.MULTILINE|re.DOTALL).group('id')
2788 download_webpage = self._download_webpage(download_link, id,
2789 'Downloading free downloads page')
2790 # We get the dictionary of the track from some javascrip code
2791 info = re.search(r'items: (.*?),$',
2792 download_webpage, re.MULTILINE).group(1)
2793 info = json.loads(info)[0]
2794 # We pick mp3-320 for now, until format selection can be easily implemented.
2795 mp3_info = info[u'downloads'][u'mp3-320']
2796 # If we try to use this url it says the link has expired
2797 initial_url = mp3_info[u'url']
2798 re_url = r'(?P<server>http://(.*?)\.bandcamp\.com)/download/track\?enc=mp3-320&fsig=(?P<fsig>.*?)&id=(?P<id>.*?)&ts=(?P<ts>.*)$'
2799 m_url = re.match(re_url, initial_url)
2800 #We build the url we will use to get the final track url
2801 # This url is build in Bandcamp in the script download_bunde_*.js
2802 request_url = '%s/statdownload/track?enc=mp3-320&fsig=%s&id=%s&ts=%s&.rand=665028774616&.vrs=1' % (m_url.group('server'), m_url.group('fsig'), id, m_url.group('ts'))
2803 final_url_webpage = self._download_webpage(request_url, id, 'Requesting download url')
2804 # If we could correctly generate the .rand field the url would be
2805 #in the "download_url" key
2806 final_url = re.search(r'"retry_url":"(.*?)"', final_url_webpage).group(1)
2808 track_info = {'id':id,
2809 'title' : info[u'title'],
2812 'thumbnail' : info[u'thumb_url'],
2813 'uploader' : info[u'artist']
2818 class RedTubeIE(InfoExtractor):
2819 """Information Extractor for redtube"""
2820 _VALID_URL = r'(?:http://)?(?:www\.)?redtube\.com/(?P<id>[0-9]+)'
2822 def _real_extract(self,url):
2823 mobj = re.match(self._VALID_URL, url)
2825 raise ExtractorError(u'Invalid URL: %s' % url)
2827 video_id = mobj.group('id')
2828 video_extension = 'mp4'
2829 webpage = self._download_webpage(url, video_id)
2831 self.report_extraction(video_id)
2833 video_url = self._html_search_regex(r'<source src="(.+?)" type="video/mp4">',
2834 webpage, u'video URL')
2836 video_title = self._html_search_regex('<h1 class="videoTitle slidePanelMovable">(.+?)</h1>',
2842 'ext': video_extension,
2843 'title': video_title,
2846 class InaIE(InfoExtractor):
2847 """Information Extractor for Ina.fr"""
2848 _VALID_URL = r'(?:http://)?(?:www\.)?ina\.fr/video/(?P<id>I[0-9]+)/.*'
2850 def _real_extract(self,url):
2851 mobj = re.match(self._VALID_URL, url)
2853 video_id = mobj.group('id')
2854 mrss_url='http://player.ina.fr/notices/%s.mrss' % video_id
2855 video_extension = 'mp4'
2856 webpage = self._download_webpage(mrss_url, video_id)
2858 self.report_extraction(video_id)
2860 video_url = self._html_search_regex(r'<media:player url="(?P<mp4url>http://mp4.ina.fr/[^"]+\.mp4)',
2861 webpage, u'video URL')
2863 video_title = self._search_regex(r'<title><!\[CDATA\[(?P<titre>.*?)]]></title>',
2869 'ext': video_extension,
2870 'title': video_title,
2873 class HowcastIE(InfoExtractor):
2874 """Information Extractor for Howcast.com"""
2875 _VALID_URL = r'(?:https?://)?(?:www\.)?howcast\.com/videos/(?P<id>\d+)'
2877 def _real_extract(self, url):
2878 mobj = re.match(self._VALID_URL, url)
2880 video_id = mobj.group('id')
2881 webpage_url = 'http://www.howcast.com/videos/' + video_id
2882 webpage = self._download_webpage(webpage_url, video_id)
2884 self.report_extraction(video_id)
2886 video_url = self._search_regex(r'\'?file\'?: "(http://mobile-media\.howcast\.com/[0-9]+\.mp4)',
2887 webpage, u'video URL')
2889 video_title = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') property=\'og:title\'',
2892 video_description = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') name=\'description\'',
2893 webpage, u'description', fatal=False)
2895 thumbnail = self._html_search_regex(r'<meta content=\'(.+?)\' property=\'og:image\'',
2896 webpage, u'thumbnail', fatal=False)
2902 'title': video_title,
2903 'description': video_description,
2904 'thumbnail': thumbnail,
2907 class VineIE(InfoExtractor):
2908 """Information Extractor for Vine.co"""
2909 _VALID_URL = r'(?:https?://)?(?:www\.)?vine\.co/v/(?P<id>\w+)'
2911 def _real_extract(self, url):
2912 mobj = re.match(self._VALID_URL, url)
2914 video_id = mobj.group('id')
2915 webpage_url = 'https://vine.co/v/' + video_id
2916 webpage = self._download_webpage(webpage_url, video_id)
2918 self.report_extraction(video_id)
2920 video_url = self._html_search_regex(r'<meta property="twitter:player:stream" content="(.+?)"',
2921 webpage, u'video URL')
2923 video_title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',
2926 thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)(\?.*?)?"',
2927 webpage, u'thumbnail', fatal=False)
2929 uploader = self._html_search_regex(r'<div class="user">.*?<h2>(.+?)</h2>',
2930 webpage, u'uploader', fatal=False, flags=re.DOTALL)
2936 'title': video_title,
2937 'thumbnail': thumbnail,
2938 'uploader': uploader,
2941 class FlickrIE(InfoExtractor):
2942 """Information Extractor for Flickr videos"""
2943 _VALID_URL = r'(?:https?://)?(?:www\.)?flickr\.com/photos/(?P<uploader_id>[\w\-_@]+)/(?P<id>\d+).*'
2945 def _real_extract(self, url):
2946 mobj = re.match(self._VALID_URL, url)
2948 video_id = mobj.group('id')
2949 video_uploader_id = mobj.group('uploader_id')
2950 webpage_url = 'http://www.flickr.com/photos/' + video_uploader_id + '/' + video_id
2951 webpage = self._download_webpage(webpage_url, video_id)
2953 secret = self._search_regex(r"photo_secret: '(\w+)'", webpage, u'secret')
2955 first_url = 'https://secure.flickr.com/apps/video/video_mtl_xml.gne?v=x&photo_id=' + video_id + '&secret=' + secret + '&bitrate=700&target=_self'
2956 first_xml = self._download_webpage(first_url, video_id, 'Downloading first data webpage')
2958 node_id = self._html_search_regex(r'<Item id="id">(\d+-\d+)</Item>',
2959 first_xml, u'node_id')
2961 second_url = 'https://secure.flickr.com/video_playlist.gne?node_id=' + node_id + '&tech=flash&mode=playlist&bitrate=700&secret=' + secret + '&rd=video.yahoo.com&noad=1'
2962 second_xml = self._download_webpage(second_url, video_id, 'Downloading second data webpage')
2964 self.report_extraction(video_id)
2966 mobj = re.search(r'<STREAM APP="(.+?)" FULLPATH="(.+?)"', second_xml)
2968 raise ExtractorError(u'Unable to extract video url')
2969 video_url = mobj.group(1) + unescapeHTML(mobj.group(2))
2971 video_title = self._html_search_regex(r'<meta property="og:title" content=(?:"([^"]+)"|\'([^\']+)\')',
2972 webpage, u'video title')
2974 video_description = self._html_search_regex(r'<meta property="og:description" content=(?:"([^"]+)"|\'([^\']+)\')',
2975 webpage, u'description', fatal=False)
2977 thumbnail = self._html_search_regex(r'<meta property="og:image" content=(?:"([^"]+)"|\'([^\']+)\')',
2978 webpage, u'thumbnail', fatal=False)
2984 'title': video_title,
2985 'description': video_description,
2986 'thumbnail': thumbnail,
2987 'uploader_id': video_uploader_id,
2990 class TeamcocoIE(InfoExtractor):
2991 _VALID_URL = r'http://teamcoco\.com/video/(?P<url_title>.*)'
2993 def _real_extract(self, url):
2994 mobj = re.match(self._VALID_URL, url)
2996 raise ExtractorError(u'Invalid URL: %s' % url)
2997 url_title = mobj.group('url_title')
2998 webpage = self._download_webpage(url, url_title)
3000 video_id = self._html_search_regex(r'<article class="video" data-id="(\d+?)"',
3001 webpage, u'video id')
3003 self.report_extraction(video_id)
3005 video_title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',
3008 thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)"',
3009 webpage, u'thumbnail', fatal=False)
3011 video_description = self._html_search_regex(r'<meta property="og:description" content="(.*?)"',
3012 webpage, u'description', fatal=False)
3014 data_url = 'http://teamcoco.com/cvp/2.0/%s.xml' % video_id
3015 data = self._download_webpage(data_url, video_id, 'Downloading data webpage')
3017 video_url = self._html_search_regex(r'<file type="high".*?>(.*?)</file>',
3024 'title': video_title,
3025 'thumbnail': thumbnail,
3026 'description': video_description,
3029 class XHamsterIE(InfoExtractor):
3030 """Information Extractor for xHamster"""
3031 _VALID_URL = r'(?:http://)?(?:www.)?xhamster\.com/movies/(?P<id>[0-9]+)/.*\.html'
3033 def _real_extract(self,url):
3034 mobj = re.match(self._VALID_URL, url)
3036 video_id = mobj.group('id')
3037 mrss_url = 'http://xhamster.com/movies/%s/.html' % video_id
3038 webpage = self._download_webpage(mrss_url, video_id)
3040 mobj = re.search(r'\'srv\': \'(?P<server>[^\']*)\',\s*\'file\': \'(?P<file>[^\']+)\',', webpage)
3042 raise ExtractorError(u'Unable to extract media URL')
3043 if len(mobj.group('server')) == 0:
3044 video_url = compat_urllib_parse.unquote(mobj.group('file'))
3046 video_url = mobj.group('server')+'/key='+mobj.group('file')
3047 video_extension = video_url.split('.')[-1]
3049 video_title = self._html_search_regex(r'<title>(?P<title>.+?) - xHamster\.com</title>',
3052 # Can't see the description anywhere in the UI
3053 # video_description = self._html_search_regex(r'<span>Description: </span>(?P<description>[^<]+)',
3054 # webpage, u'description', fatal=False)
3055 # if video_description: video_description = unescapeHTML(video_description)
3057 mobj = re.search(r'hint=\'(?P<upload_date_Y>[0-9]{4})-(?P<upload_date_m>[0-9]{2})-(?P<upload_date_d>[0-9]{2}) [0-9]{2}:[0-9]{2}:[0-9]{2} [A-Z]{3,4}\'', webpage)
3059 video_upload_date = mobj.group('upload_date_Y')+mobj.group('upload_date_m')+mobj.group('upload_date_d')
3061 video_upload_date = None
3062 self._downloader.report_warning(u'Unable to extract upload date')
3064 video_uploader_id = self._html_search_regex(r'<a href=\'/user/[^>]+>(?P<uploader_id>[^<]+)',
3065 webpage, u'uploader id', default=u'anonymous')
3067 video_thumbnail = self._search_regex(r'\'image\':\'(?P<thumbnail>[^\']+)\'',
3068 webpage, u'thumbnail', fatal=False)
3073 'ext': video_extension,
3074 'title': video_title,
3075 # 'description': video_description,
3076 'upload_date': video_upload_date,
3077 'uploader_id': video_uploader_id,
3078 'thumbnail': video_thumbnail
3081 class HypemIE(InfoExtractor):
3082 """Information Extractor for hypem"""
3083 _VALID_URL = r'(?:http://)?(?:www\.)?hypem\.com/track/([^/]+)/([^/]+)'
3085 def _real_extract(self, url):
3086 mobj = re.match(self._VALID_URL, url)
3088 raise ExtractorError(u'Invalid URL: %s' % url)
3089 track_id = mobj.group(1)
3091 data = { 'ax': 1, 'ts': time.time() }
3092 data_encoded = compat_urllib_parse.urlencode(data)
3093 complete_url = url + "?" + data_encoded
3094 request = compat_urllib_request.Request(complete_url)
3095 response, urlh = self._download_webpage_handle(request, track_id, u'Downloading webpage with the url')
3096 cookie = urlh.headers.get('Set-Cookie', '')
3098 self.report_extraction(track_id)
3100 html_tracks = self._html_search_regex(r'<script type="application/json" id="displayList-data">(.*?)</script>',
3101 response, u'tracks', flags=re.MULTILINE|re.DOTALL).strip()
3103 track_list = json.loads(html_tracks)
3104 track = track_list[u'tracks'][0]
3106 raise ExtractorError(u'Hypemachine contained invalid JSON.')
3109 track_id = track[u"id"]
3110 artist = track[u"artist"]
3111 title = track[u"song"]
3113 serve_url = "http://hypem.com/serve/source/%s/%s" % (compat_str(track_id), compat_str(key))
3114 request = compat_urllib_request.Request(serve_url, "" , {'Content-Type': 'application/json'})
3115 request.add_header('cookie', cookie)
3116 song_data_json = self._download_webpage(request, track_id, u'Downloading metadata')
3118 song_data = json.loads(song_data_json)
3120 raise ExtractorError(u'Hypemachine contained invalid JSON.')
3121 final_url = song_data[u"url"]
class Vbox7IE(InfoExtractor):
    """Information Extractor for Vbox7 (vbox7.com/play: pages)."""
    _VALID_URL = r'(?:http://)?(?:www\.)?vbox7\.com/play:([^/]+)'

    def _real_extract(self, url):
        """Follow the site's JS redirect, then query the magare.do endpoint
        for the final media and thumbnail URLs.

        Raises ExtractorError on an invalid URL or when the info endpoint
        returns nothing usable.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group(1)

        # The play page redirects via window.location; follow it manually.
        redirect_page, urlh = self._download_webpage_handle(url, video_id)
        new_location = self._search_regex(r'window\.location = \'(.*)\';', redirect_page, u'redirect location')
        redirect_url = urlh.geturl() + new_location
        webpage = self._download_webpage(redirect_url, video_id, u'Downloading redirect page')

        # Page title is "<video title> / vbox7"; keep only the first segment.
        title = self._html_search_regex(r'<title>(.*)</title>',
            webpage, u'title').split('/')[0].strip()

        ext = "flv"
        # POST to the info endpoint to obtain "video=...&thumb=..." pairs.
        info_url = "http://vbox7.com/play/magare.do"
        data = compat_urllib_parse.urlencode({'as3': '1', 'vid': video_id})
        info_request = compat_urllib_request.Request(info_url, data)
        info_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
        info_response = self._download_webpage(info_request, video_id, u'Downloading info webpage')
        if info_response is None:
            raise ExtractorError(u'Unable to extract the media url')
        # Response is two '&'-separated key=value pairs: media url, thumbnail.
        (final_url, thumbnail_url) = map(lambda x: x.split('=')[1], info_response.split('&'))

        return [{
            'id':        video_id,
            'url':       final_url,
            'ext':       ext,
            'title':     title,
            'thumbnail': thumbnail_url,
        }]
class GametrailersIE(InfoExtractor):
    """Information Extractor for gametrailers.com videos/reviews/full episodes."""
    # NOTE: dots in the host are escaped so the pattern matches only the
    # intended domain (the unescaped form matched any character there).
    _VALID_URL = r'http://www\.gametrailers\.com/(?P<type>videos|reviews|full-episodes)/(?P<id>.*?)/(?P<title>.*)'

    def _real_extract(self, url):
        """Resolve the page's mgid, then fetch the MRSS/mediagen feeds for
        metadata and the list of stream URLs.

        Raises ExtractorError on an invalid URL, a missing mgid, or when
        the feeds contain no usable info/urls.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('id')
        video_type = mobj.group('type')
        webpage = self._download_webpage(url, video_id)
        # Full episodes embed the mgid in a different attribute than
        # regular videos/reviews.
        if video_type == 'full-episodes':
            mgid_re = r'data-video="(?P<mgid>mgid:.*?)"'
        else:
            mgid_re = r'data-contentId=\'(?P<mgid>mgid:.*?)\''
        mgid = self._search_regex(mgid_re, webpage, u'mgid')
        data = compat_urllib_parse.urlencode({'uri': mgid, 'acceptMethods': 'fms'})

        info_page = self._download_webpage('http://www.gametrailers.com/feeds/mrss?' + data,
                                           video_id, u'Downloading video info')
        links_webpage = self._download_webpage('http://www.gametrailers.com/feeds/mediagen/?' + data,
                                               video_id, u'Downloading video urls info')

        self.report_extraction(video_id)
        info_re = r'''<title><!\[CDATA\[(?P<title>.*?)\]\]></title>.*
                      <description><!\[CDATA\[(?P<description>.*?)\]\]></description>.*
                      <image>.*
                        <url>(?P<thumb>.*?)</url>.*
                      </image>'''

        m_info = re.search(info_re, info_page, re.VERBOSE|re.DOTALL)
        if m_info is None:
            raise ExtractorError(u'Unable to extract video info')
        video_title = m_info.group('title')
        video_description = m_info.group('description')
        video_thumb = m_info.group('thumb')

        m_urls = list(re.finditer(r'<src>(?P<url>.*)</src>', links_webpage))
        # BUG FIX: the original raised an undefined name ('ExtractError') with
        # a typo'd message; also, list(finditer(...)) is never None.
        if not m_urls:
            raise ExtractorError(u'Unable to extract video url')
        # They are sorted from worst to best quality
        video_url = m_urls[-1].group('url')

        return {'url':         video_url,
                'id':          video_id,
                'title':       video_title,
                # Videos are actually flv not mp4
                'ext':         'flv',
                'thumbnail':   video_thumb,
                'description': video_description,
                }
def gen_extractors():
    """ Return a list of an instance of every supported extractor.
    The order does matter; the first extractor matched is the one handling the URL.
    """
    # NOTE(review): the extractor list was truncated in the reviewed view;
    # only the entries visible there are listed below — restore the full
    # ordered list (YoutubeIE, MetacafeIE, DailymotionIE, ... GenericIE last)
    # before shipping. The ordering contract above still applies.
    return [
        YoutubePlaylistIE(),
        StanfordOpenClassroomIE(),
        WorldStarHipHopIE(),
    ]
def get_info_extractor(ie_name):
    """Returns the info extractor class with the given ie_name"""
    # Extractor classes follow the "<Name>IE" naming convention and live at
    # module level, so a module-globals lookup resolves the class directly.
    class_name = '%sIE' % ie_name
    return globals()[class_name]