2 # -*- coding: utf-8 -*-
4 from __future__ import absolute_import
15 import xml.etree.ElementTree
26 from .extractor.common import InfoExtractor, SearchInfoExtractor
27 from .extractor.dailymotion import DailymotionIE
28 from .extractor.metacafe import MetacafeIE
29 from .extractor.statigram import StatigramIE
30 from .extractor.photobucket import PhotobucketIE
31 from .extractor.yahoo import YahooIE
32 from .extractor.youtube import YoutubeIE, YoutubePlaylistIE, YoutubeUserIE, YoutubeChannelIE
# NOTE(review): this chunk is a partial dump — the embedded numeric prefixes
# are the original file's line numbers and they skip values, i.e. lines are
# missing from view. Code below is preserved byte-identically; comments only.
40 class VimeoIE(InfoExtractor):
41 """Information extractor for vimeo.com."""
# Named groups proto/pro/direct_link/id are consumed by _real_extract below.
43 # _VALID_URL matches Vimeo URLs
44 _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|player)\.)?vimeo(?P<pro>pro)?\.com/(?:(?:(?:groups|album)/[^/]+)|(?:.*?)/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)'
# Submit the user-supplied --password for a password-protected video.
47 def _verify_video_password(self, url, video_id, webpage):
48 password = self._downloader.params.get('password', None)
# NOTE(review): a guard (presumably `if password is None:`) is missing from
# this dump before the raise below — confirm against the full file.
50 raise ExtractorError(u'This video is protected by a password, use the --password option')
# The page's xsrft token must be echoed back as a cookie with the POST.
51 token = re.search(r'xsrft: \'(.*?)\'', webpage).group(1)
52 data = compat_urllib_parse.urlencode({'password': password,
54 # I didn't manage to use the password with https
55 if url.startswith('https'):
56 pass_url = url.replace('https','http')
59 password_request = compat_urllib_request.Request(pass_url+'/password', data)
60 password_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
61 password_request.add_header('Cookie', 'xsrft=%s' % token)
62 pass_web = self._download_webpage(password_request, video_id,
63 u'Verifying the password',
# Main entry point: fetch the page, parse the embedded config JSON, pick a
# codec/quality pair and build the play_redirect download URL.
66 def _real_extract(self, url, new_video=True):
68 mobj = re.match(self._VALID_URL, url)
70 raise ExtractorError(u'Invalid URL: %s' % url)
72 video_id = mobj.group('id')
73 if not mobj.group('proto'):
74 url = 'https://' + url
75 if mobj.group('direct_link') or mobj.group('pro'):
76 url = 'https://vimeo.com/' + video_id
78 # Retrieve video webpage to extract further information
79 request = compat_urllib_request.Request(url, None, std_headers)
80 webpage = self._download_webpage(request, video_id)
82 # Now we begin extracting as much information as we can from what we
83 # retrieved. First we extract the information common to all extractors,
84 # and latter we extract those that are Vimeo specific.
85 self.report_extraction(video_id)
87 # Extract the config JSON
# NOTE(review): a `try:` likely wrapped the two lines below (the dump later
# raises 'Unable to extract info section' with no visible except) — verify.
89 config = webpage.split(' = {config:')[1].split(',assets:')[0]
90 config = json.loads(config)
92 if re.search('The creator of this video has not given you permission to embed it on this domain.', webpage):
93 raise ExtractorError(u'The author has restricted the access to this video, try with the "--referer" option')
# Password-protected video: verify the password, then retry extraction once.
95 if re.search('If so please provide the correct password.', webpage):
96 self._verify_video_password(url, video_id, webpage)
97 return self._real_extract(url)
99 raise ExtractorError(u'Unable to extract info section')
# Common metadata pulled straight out of the parsed config JSON.
102 video_title = config["video"]["title"]
104 # Extract uploader and uploader_id
105 video_uploader = config["video"]["owner"]["name"]
106 video_uploader_id = config["video"]["owner"]["url"].split('/')[-1] if config["video"]["owner"]["url"] else None
108 # Extract video thumbnail
109 video_thumbnail = config["video"]["thumbnail"]
111 # Extract video description
112 video_description = get_element_by_attribute("itemprop", "description", webpage)
113 if video_description: video_description = clean_html(video_description)
114 else: video_description = u''
116 # Extract upload date
117 video_upload_date = None
118 mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
# Date reassembled as YYYYMMDD from the three captured groups.
120 video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)
122 # Vimeo specific: extract request signature and timestamp
123 sig = config['request']['signature']
124 timestamp = config['request']['timestamp']
126 # Vimeo specific: extract video codec and quality information
127 # First consider quality, then codecs, then take everything
128 # TODO bind to format param
129 codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
130 files = { 'hd': [], 'sd': [], 'other': []}
131 for codec_name, codec_extension in codecs:
132 if codec_name in config["video"]["files"]:
133 if 'hd' in config["video"]["files"][codec_name]:
134 files['hd'].append((codec_name, codec_extension, 'hd'))
135 elif 'sd' in config["video"]["files"][codec_name]:
136 files['sd'].append((codec_name, codec_extension, 'sd'))
138 files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))
# Prefer hd, then sd, then whatever else was found; first hit wins.
140 for quality in ('hd', 'sd', 'other'):
141 if len(files[quality]) > 0:
142 video_quality = files[quality][0][2]
143 video_codec = files[quality][0][0]
144 video_extension = files[quality][0][1]
145 self.to_screen(u'%s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
148 raise ExtractorError(u'No known codec found')
150 video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
151 %(video_id, sig, timestamp, video_quality, video_codec.upper())
# Result dict fields (the enclosing return statement is not visible here).
156 'uploader': video_uploader,
157 'uploader_id': video_uploader_id,
158 'upload_date': video_upload_date,
159 'title': video_title,
160 'ext': video_extension,
161 'thumbnail': video_thumbnail,
162 'description': video_description,
# NOTE(review): partial dump — embedded line numbers skip, so some lines
# (e.g. the `try:` matching the except at original line 179) are missing.
166 class ArteTvIE(InfoExtractor):
167 """arte.tv information extractor."""
169 _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
170 _LIVE_URL = r'index-[0-9]+\.html$'
# Download the raw page bytes, mapping network/URL errors to ExtractorError.
174 def fetch_webpage(self, url):
175 request = compat_urllib_request.Request(url)
177 self.report_download_webpage(url)
178 webpage = compat_urllib_request.urlopen(request).read()
179 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
180 raise ExtractorError(u'Unable to retrieve video webpage: %s' % compat_str(err))
181 except ValueError as err:
182 raise ExtractorError(u'Invalid URL: %s' % url)
# Fetch url, apply regex, and build an info dict keyed per matchTuples
# entries (group index, key name, error message on failure).
185 def grep_webpage(self, url, regex, regexFlags, matchTuples):
186 page = self.fetch_webpage(url)
187 mobj = re.search(regex, page, regexFlags)
191 raise ExtractorError(u'Invalid URL: %s' % url)
193 for (i, key, err) in matchTuples:
194 if mobj.group(i) is None:
195 raise ExtractorError(err)
197 info[key] = mobj.group(i)
# Live-stream path: chase the videothek JS, then grep path/player/url.
201 def extractLiveStream(self, url):
202 video_lang = url.split('/')[-4]
203 info = self.grep_webpage(
205 r'src="(.*?/videothek_js.*?\.js)',
208 (1, 'url', u'Invalid URL: %s' % url)
211 http_host = url.split('/')[2]
212 next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
213 info = self.grep_webpage(
215 r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
216 '(http://.*?\.swf).*?' +
220 (1, 'path', u'could not extract video path: %s' % url),
221 (2, 'player', u'could not extract video player: %s' % url),
222 (3, 'url', u'could not extract video url: %s' % url)
225 video_url = u'%s/%s' % (info.get('url'), info.get('path'))
# "+7" catch-up path: follow videorefFileUrl, then the language-specific
# <video> ref, then pull id/title/date/hd-url from the final XML.
227 def extractPlus7Stream(self, url):
228 video_lang = url.split('/')[-3]
229 info = self.grep_webpage(
231 r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
234 (1, 'url', u'Invalid URL: %s' % url)
237 next_url = compat_urllib_parse.unquote(info.get('url'))
238 info = self.grep_webpage(
240 r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
243 (1, 'url', u'Could not find <video> tag: %s' % url)
246 next_url = compat_urllib_parse.unquote(info.get('url'))
248 info = self.grep_webpage(
250 r'<video id="(.*?)".*?>.*?' +
251 '<name>(.*?)</name>.*?' +
252 '<dateVideo>(.*?)</dateVideo>.*?' +
253 '<url quality="hd">(.*?)</url>',
256 (1, 'id', u'could not extract video id: %s' % url),
257 (2, 'title', u'could not extract video title: %s' % url),
258 (3, 'date', u'could not extract video date: %s' % url),
259 (4, 'url', u'could not extract video url: %s' % url)
# Result dict fields (the enclosing return statement is not visible here).
264 'id': info.get('id'),
265 'url': compat_urllib_parse.unquote(info.get('url')),
266 'uploader': u'arte.tv',
267 'upload_date': unified_strdate(info.get('date')),
268 'title': info.get('title').decode('utf-8'),
# Dispatch: live URLs (matching _LIVE_URL) vs regular "+7" pages.
274 def _real_extract(self, url):
275 video_id = url.split('/')[-1]
276 self.report_extraction(video_id)
278 if re.search(self._LIVE_URL, video_id) is not None:
279 self.extractLiveStream(url)
282 info = self.extractPlus7Stream(url)
# NOTE(review): partial dump — embedded line numbers skip; several method
# bodies below are visibly incomplete. Code kept byte-identical.
287 class GenericIE(InfoExtractor):
288 """Generic last-resort information extractor."""
# Warn (outside tests) that we fell through to the generic extractor.
293 def report_download_webpage(self, video_id):
294 """Report webpage download."""
295 if not self._downloader.params.get('test', False):
296 self._downloader.report_warning(u'Falling back on generic information extractor.')
297 super(GenericIE, self).report_download_webpage(video_id)
299 def report_following_redirect(self, new_url):
300 """Report information extraction."""
301 self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)
# Probe the URL with a HEAD request (falling back to GET on 405) to resolve
# shortener redirects; returns the final URL when it differs.
303 def _test_redirect(self, url):
304 """Check if it is a redirect, like url shorteners, in case return the new url."""
305 class HeadRequest(compat_urllib_request.Request):
306 def get_method(self):
309 class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
311 Subclass the HTTPRedirectHandler to make it use our
312 HeadRequest also on the redirected URL
314 def redirect_request(self, req, fp, code, msg, headers, newurl):
315 if code in (301, 302, 303, 307):
316 newurl = newurl.replace(' ', '%20')
# Body-related headers are dropped because the redirected HEAD has no body.
317 newheaders = dict((k,v) for k,v in req.headers.items()
318 if k.lower() not in ("content-length", "content-type"))
319 return HeadRequest(newurl,
321 origin_req_host=req.get_origin_req_host(),
324 raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)
326 class HTTPMethodFallback(compat_urllib_request.BaseHandler):
328 Fallback to GET if HEAD is not allowed (405 HTTP error)
330 def http_error_405(self, req, fp, code, msg, headers):
334 newheaders = dict((k,v) for k,v in req.headers.items()
335 if k.lower() not in ("content-length", "content-type"))
336 return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
338 origin_req_host=req.get_origin_req_host(),
# Build a dedicated opener so the custom HEAD/fallback handlers are used.
342 opener = compat_urllib_request.OpenerDirector()
343 for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
344 HTTPMethodFallback, HEADRedirectHandler,
345 compat_urllib_request.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
346 opener.add_handler(handler())
348 response = opener.open(HeadRequest(url))
350 raise ExtractorError(u'Invalid URL protocol')
351 new_url = response.geturl()
356 self.report_following_redirect(new_url)
# Last-resort extraction: follow redirects, then try a ladder of regexes
# (JW Player, file/source params, twitter cards, Open Graph) for a video URL.
359 def _real_extract(self, url):
360 new_url = self._test_redirect(url)
361 if new_url: return [self.url_result(new_url)]
363 video_id = url.split('/')[-1]
365 webpage = self._download_webpage(url, video_id)
366 except ValueError as err:
367 # since this is the last-resort InfoExtractor, if
368 # this error is thrown, it'll be thrown here
369 raise ExtractorError(u'Invalid URL: %s' % url)
371 self.report_extraction(video_id)
372 # Start with something easy: JW Player in SWFObject
373 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
375 # Broaden the search a little bit
376 mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
378 # Broaden the search a little bit: JWPlayer JS loader
379 mobj = re.search(r'[^A-Za-z0-9]?file:\s*["\'](http[^\'"&]*)', webpage)
381 # Try to find twitter cards info
382 mobj = re.search(r'<meta (?:property|name)="twitter:player:stream" (?:content|value)="(.+?)"', webpage)
384 # We look for Open Graph info:
385 # We have to match any number spaces between elements, some sites try to align them (eg.: statigr.am)
386 m_video_type = re.search(r'<meta.*?property="og:video:type".*?content="video/(.*?)"', webpage)
387 # We only look in og:video if the MIME type is a video, don't try if it's a Flash player:
388 if m_video_type is not None:
389 mobj = re.search(r'<meta.*?property="og:video".*?content="(.*?)"', webpage)
391 raise ExtractorError(u'Invalid URL: %s' % url)
393 # It's possible that one of the regexes
394 # matched, but returned an empty group:
395 if mobj.group(1) is None:
396 raise ExtractorError(u'Invalid URL: %s' % url)
398 video_url = compat_urllib_parse.unquote(mobj.group(1))
399 video_id = os.path.basename(video_url)
401 # here's a fun little line of code for you:
402 video_extension = os.path.splitext(video_id)[1][1:]
403 video_id = os.path.splitext(video_id)[0]
405 # it's tempting to parse this further, but you would
406 # have to take into account all the variations like
407 # Video Title - Site Name
408 # Site Name | Video Title
409 # Video Title - Tagline | Site Name
410 # and so on and so forth; it's just not practical
411 video_title = self._html_search_regex(r'<title>(.*)</title>',
412 webpage, u'video title')
414 # video uploader is domain name
415 video_uploader = self._search_regex(r'(?:https?://)?([^/]*)/.*',
416 url, u'video uploader')
# Result dict fields (the enclosing return statement is not visible here).
421 'uploader': video_uploader,
423 'title': video_title,
424 'ext': video_extension,
# NOTE(review): partial dump — embedded line numbers skip (e.g. loop/state
# initialization between original lines 440 and 446 is missing).
428 class YoutubeSearchIE(SearchInfoExtractor):
429 """Information Extractor for YouTube search queries."""
# gdata v2 API, 50 results per page, JSON-C response format.
430 _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
432 IE_NAME = u'youtube:search'
433 _SEARCH_KEY = 'ytsearch'
435 def report_download_page(self, query, pagenum):
436 """Report attempt to download search page with given number."""
437 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
# Page through the API 50 ids at a time, trim to n, and wrap each id in a
# url_result delegated to the Youtube extractor.
439 def _get_n_results(self, query, n):
440 """Get a specified number of results for a query"""
446 while (50 * pagenum) < limit:
447 self.report_download_page(query, pagenum+1)
448 result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
449 request = compat_urllib_request.Request(result_url)
451 data = compat_urllib_request.urlopen(request).read().decode('utf-8')
452 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
453 raise ExtractorError(u'Unable to download API page: %s' % compat_str(err))
454 api_response = json.loads(data)['data']
456 if not 'items' in api_response:
457 raise ExtractorError(u'[youtube] No video results')
459 new_ids = list(video['id'] for video in api_response['items'])
# Cap the target at what the API says actually exists.
462 limit = min(n, api_response['totalItems'])
465 if len(video_ids) > n:
466 video_ids = video_ids[:n]
467 videos = [self.url_result('http://www.youtube.com/watch?v=%s' % id, 'Youtube') for id in video_ids]
468 return self.playlist_result(videos, query)
# NOTE(review): partial dump — the initialization of `res` and `e` (original
# lines 480-496) is missing from this view.
471 class GoogleSearchIE(SearchInfoExtractor):
472 """Information Extractor for Google Video search queries."""
# Presence of the "next" pager link signals more result pages exist.
473 _MORE_PAGES_INDICATOR = r'id="pnnext" class="pn"'
475 IE_NAME = u'video.google:search'
476 _SEARCH_KEY = 'gvsearch'
# Scrape Google video-search result pages (10 links per page) until n
# results are collected or the pager link disappears.
478 def _get_n_results(self, query, n):
479 """Get a specified number of results for a query"""
487 for pagenum in itertools.count(1):
488 result_url = u'http://www.google.com/search?tbm=vid&q=%s&start=%s&hl=en' % (compat_urllib_parse.quote_plus(query), pagenum*10)
489 webpage = self._download_webpage(result_url, u'gvsearch:' + query,
490 note='Downloading result page ' + str(pagenum))
492 for mobj in re.finditer(r'<h3 class="r"><a href="([^"]+)"', webpage):
497 res['entries'].append(e)
499 if (pagenum * 10 > n) or not re.search(self._MORE_PAGES_INDICATOR, webpage):
# NOTE(review): partial dump — embedded line numbers skip. In particular `m`
# (used at original line 531) and `res` are assigned in lines not visible
# here; presumably from the parsed JSON response — confirm in the full file.
502 class YahooSearchIE(SearchInfoExtractor):
503 """Information Extractor for Yahoo! Video search queries."""
506 IE_NAME = u'screen.yahoo:search'
507 _SEARCH_KEY = 'yvsearch'
# Page through Yahoo's JSON search endpoint, 30 results per page, stopping
# at n results or the last page.
509 def _get_n_results(self, query, n):
510 """Get a specified number of results for a query"""
517 for pagenum in itertools.count(0):
518 result_url = u'http://video.search.yahoo.com/search/?p=%s&fr=screen&o=js&gs=0&b=%d' % (compat_urllib_parse.quote_plus(query), pagenum * 30)
519 webpage = self._download_webpage(result_url, query,
520 note='Downloading results page '+str(pagenum+1))
521 info = json.loads(webpage)
523 results = info[u'results']
525 for (i, r) in enumerate(results):
526 if (pagenum * 30) +i >= n:
528 mobj = re.search(r'(?P<url>screen\.yahoo\.com/.*?-\d*?\.html)"', r)
529 e = self.url_result('http://' + mobj.group('url'), 'Yahoo')
530 res['entries'].append(e)
531 if (pagenum * 30 +i >= n) or (m[u'last'] >= (m[u'total'] -1 )):
# NOTE(review): partial dump — the pagination loop header and the
# `video_ids`/`ids_in_page` initialization are among the missing lines.
537 class BlipTVUserIE(InfoExtractor):
538 """Information Extractor for blip.tv users."""
540 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
542 IE_NAME = u'blip.tv:user'
# Resolve the username to a numeric users_id, then page through the AJAX
# episode list collecting video ids into a playlist of BlipTV url_results.
544 def _real_extract(self, url):
546 mobj = re.match(self._VALID_URL, url)
548 raise ExtractorError(u'Invalid URL: %s' % url)
550 username = mobj.group(1)
552 page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'
554 page = self._download_webpage(url, username, u'Downloading user page')
555 mobj = re.search(r'data-users-id="([^"]+)"', page)
556 page_base = page_base % mobj.group(1)
559 # Download video ids using BlipTV Ajax calls. Result size per
560 # query is limited (currently to 12 videos) so we need to query
561 # page by page until there are no video ids - it means we got
568 url = page_base + "&page=" + str(pagenum)
569 page = self._download_webpage(url, username,
570 u'Downloading video ids from page %d' % pagenum)
572 # Extract video identifiers
575 for mobj in re.finditer(r'href="/([^"]+)"', page):
576 if mobj.group(1) not in ids_in_page:
577 ids_in_page.append(unescapeHTML(mobj.group(1)))
579 video_ids.extend(ids_in_page)
581 # A little optimization - if current page is not
582 # "full", ie. does not contain PAGE_SIZE video ids then
583 # we can assume that this page is the last one - there
584 # are no more ids on further pages - no need to query
587 if len(ids_in_page) < self._PAGE_SIZE:
592 urls = [u'http://blip.tv/%s' % video_id for video_id in video_ids]
593 url_entries = [self.url_result(url, 'BlipTV') for url in urls]
594 return [self.playlist_result(url_entries, playlist_title = username)]
# NOTE(review): partial dump — the `try:` matching the except at original
# line 613 and parts of the return dict are missing from this view.
597 class DepositFilesIE(InfoExtractor):
598 """Information extractor for depositfiles.com"""
600 _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
# Force the English locale, simulate the 'Free download' form post, then
# scrape the real fileshare URL (or surface the site's restriction message).
602 def _real_extract(self, url):
603 file_id = url.split('/')[-1]
604 # Rebuild url in english locale
605 url = 'http://depositfiles.com/en/files/' + file_id
607 # Retrieve file webpage with 'Free download' button pressed
608 free_download_indication = { 'gateway_result' : '1' }
609 request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
611 self.report_download_webpage(file_id)
612 webpage = compat_urllib_request.urlopen(request).read()
613 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
614 raise ExtractorError(u'Unable to retrieve file webpage: %s' % compat_str(err))
616 # Search for the real file URL
617 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
618 if (mobj is None) or (mobj.group(1) is None):
619 # Try to figure out reason of the error.
620 mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
621 if (mobj is not None) and (mobj.group(1) is not None):
622 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
623 raise ExtractorError(u'%s' % restriction_message)
625 raise ExtractorError(u'Unable to extract download URL from: %s' % url)
627 file_url = mobj.group(1)
628 file_extension = os.path.splitext(file_url)[1][1:]
630 # Search for file title
631 file_title = self._search_regex(r'<b title="(.*?)">', webpage, u'title')
# Result dict fields (the enclosing return statement is not visible here).
634 'id': file_id.decode('utf-8'),
635 'url': file_url.decode('utf-8'),
639 'ext': file_extension.decode('utf-8'),
# NOTE(review): partial dump — embedded line numbers skip; e.g. the
# login_form construction (original lines ~680-687) is not visible.
643 class FacebookIE(InfoExtractor):
644 """Information Extractor for Facebook"""
646 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
647 _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
648 _NETRC_MACHINE = 'facebook'
649 IE_NAME = u'facebook'
651 def report_login(self):
652 """Report attempt to log in."""
653 self.to_screen(u'Logging in')
# Best-effort login using --username/--password or .netrc credentials;
# failures only warn (extraction proceeds unauthenticated).
655 def _real_initialize(self):
656 if self._downloader is None:
661 downloader_params = self._downloader.params
663 # Attempt to use provided username and password or .netrc data
664 if downloader_params.get('username', None) is not None:
665 useremail = downloader_params['username']
666 password = downloader_params['password']
667 elif downloader_params.get('usenetrc', False):
669 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
674 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
675 except (IOError, netrc.NetrcParseError) as err:
676 self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
679 if useremail is None:
688 request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
691 login_results = compat_urllib_request.urlopen(request).read()
# A login <form> still present in the response means login did not succeed.
692 if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
693 self._downloader.report_warning(u'unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
695 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
696 self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
# Extract the JSON blob wedged between the swf param/variable loops, then
# read the video URL (hd_src preferred, sd_src fallback) from video_data.
699 def _real_extract(self, url):
700 mobj = re.match(self._VALID_URL, url)
702 raise ExtractorError(u'Invalid URL: %s' % url)
703 video_id = mobj.group('ID')
705 url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
706 webpage = self._download_webpage(url, video_id)
708 BEFORE = '{swf.addParam(param[0], param[1]);});\n'
709 AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
710 m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
712 raise ExtractorError(u'Cannot parse data')
713 data = dict(json.loads(m.group(1)))
714 params_raw = compat_urllib_parse.unquote(data['params'])
715 params = json.loads(params_raw)
716 video_data = params['video_data'][0]
717 video_url = video_data.get('hd_src')
719 video_url = video_data['sd_src']
721 raise ExtractorError(u'Cannot find video URL')
722 video_duration = int(video_data['video_duration'])
723 thumbnail = video_data['thumbnail_src']
725 video_title = self._html_search_regex('<h2 class="uiHeaderTitle">([^<]+)</h2>',
# Result dict fields (the enclosing return statement is not visible here).
730 'title': video_title,
733 'duration': video_duration,
734 'thumbnail': thumbnail,
# NOTE(review): partial dump — a large gap (original lines 787-795, the
# direct-download info construction) is missing; `cchar` and `info` are
# assigned in lines not visible here.
739 class BlipTVIE(InfoExtractor):
740 """Information extractor for blip.tv"""
742 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv/((.+/)|(play/)|(api\.swf#))(.+)$'
743 _URL_EXT = r'^.*\.([a-z0-9]+)$'
746 def report_direct_download(self, title):
747 """Report information extraction."""
748 self.to_screen(u'%s: Direct download detected' % title)
# Normalize api.swf / play URLs (recursing once for /play/ redirects), then
# fetch the JSON metadata endpoint and build the info dict from 'Post'.
750 def _real_extract(self, url):
751 mobj = re.match(self._VALID_URL, url)
753 raise ExtractorError(u'Invalid URL: %s' % url)
755 # See https://github.com/rg3/youtube-dl/issues/857
756 api_mobj = re.match(r'http://a\.blip\.tv/api\.swf#(?P<video_id>[\d\w]+)', url)
757 if api_mobj is not None:
758 url = 'http://blip.tv/play/g_%s' % api_mobj.group('video_id')
759 urlp = compat_urllib_parse_urlparse(url)
760 if urlp.path.startswith('/play/'):
761 request = compat_urllib_request.Request(url)
762 response = compat_urllib_request.urlopen(request)
763 redirecturl = response.geturl()
764 rurlp = compat_urllib_parse_urlparse(redirecturl)
# The file id lives in the redirect URL's fragment, after the last '/'.
765 file_id = compat_parse_qs(rurlp.fragment)['file'][0].rpartition('/')[2]
766 url = 'http://blip.tv/a/a-' + file_id
767 return self._real_extract(url)
774 json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
775 request = compat_urllib_request.Request(json_url)
# The iTunes UA is required for the skin=json endpoint to respond.
776 request.add_header('User-Agent', 'iTunes/10.6.1')
777 self.report_extraction(mobj.group(1))
780 urlh = compat_urllib_request.urlopen(request)
781 if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
782 basename = url.split('/')[-1]
783 title,ext = os.path.splitext(basename)
784 title = title.decode('UTF-8')
785 ext = ext.replace('.', '')
786 self.report_direct_download(title)
796 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
797 raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
798 if info is None: # Regular URL
800 json_code_bytes = urlh.read()
801 json_code = json_code_bytes.decode('utf-8')
802 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
803 raise ExtractorError(u'Unable to read video info webpage: %s' % compat_str(err))
806 json_data = json.loads(json_code)
807 if 'Post' in json_data:
808 data = json_data['Post']
812 upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
813 video_url = data['media']['url']
814 umobj = re.match(self._URL_EXT, video_url)
816 raise ValueError('Can not determine filename extension')
# Info dict fields (the enclosing assignment is not visible here).
820 'id': data['item_id'],
822 'uploader': data['display_name'],
823 'upload_date': upload_date,
824 'title': data['title'],
826 'format': data['media']['mimeType'],
827 'thumbnail': data['thumbnailUrl'],
828 'description': data['description'],
829 'player_url': data['embedUrl'],
830 'user_agent': 'iTunes/10.6.1',
832 except (ValueError,KeyError) as err:
833 raise ExtractorError(u'Unable to parse video information: %s' % repr(err))
# NOTE(review): partial dump — embedded line numbers skip; parts of the RC4
# loop, the GK constant assembly, and the `sk`/`params` setup are missing.
838 class MyVideoIE(InfoExtractor):
839 """Information Extractor for myvideo.de."""
841 _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
844 # Original Code from: https://github.com/dersphere/plugin.video.myvideo_de.git
845 # Released into the Public Domain by Tristan Fischer on 2013-05-19
846 # https://github.com/rg3/youtube-dl/pull/842
# RC4 stream cipher: KSA over `box`, then PRGA XORing each byte of `data`.
847 def __rc4crypt(self,data, key):
849 box = list(range(256))
850 for i in list(range(256)):
851 x = (x + box[i] + compat_ord(key[i % len(key)])) % 256
852 box[i], box[x] = box[x], box[i]
858 y = (y + box[x]) % 256
859 box[x], box[y] = box[y], box[x]
860 out += chr(compat_ord(char) ^ box[(box[x] + box[y]) % 256])
# MD5 hex digest as bytes (used to derive the RC4 key; enclosing def is in
# a line not visible in this dump).
864 return hashlib.md5(s).hexdigest().encode()
# Try the plain <source src=...> flv first; otherwise decrypt the player
# XML (RC4 with a key derived from GK + video id) to find the stream URL.
866 def _real_extract(self,url):
867 mobj = re.match(self._VALID_URL, url)
869 raise ExtractorError(u'invalid URL: %s' % url)
871 video_id = mobj.group(1)
# Double-base64-encoded key material (GK); concatenation context missing.
874 b'WXpnME1EZGhNRGhpTTJNM01XVmhOREU0WldNNVpHTTJOakpt'
875 b'TW1FMU5tVTBNR05pWkRaa05XRXhNVFJoWVRVd1ptSXhaVEV3'
876 b'TnpsbA0KTVRkbU1tSTRNdz09'
880 webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
881 webpage = self._download_webpage(webpage_url, video_id)
883 mobj = re.search('source src=\'(.+?)[.]([^.]+)\'', webpage)
885 self.report_extraction(video_id)
886 video_url = mobj.group(1) + '.flv'
888 video_title = self._html_search_regex('<title>([^<]+)</title>',
891 video_ext = self._search_regex('[.](.+?)$', video_url, u'extension')
898 'title': video_title,
# No direct source found: collect flashvars params (minus _encxml).
903 mobj = re.search('var flashvars={(.+?)}', webpage)
905 raise ExtractorError(u'Unable to extract video')
910 for (a, b) in re.findall('(.+?):\'(.+?)\',?', sec):
911 if not a == '_encxml':
914 encxml = compat_urllib_parse.unquote(b)
915 if not params.get('domain'):
916 params['domain'] = 'www.myvideo.de'
917 xmldata_url = '%s?%s' % (encxml, compat_urllib_parse.urlencode(params))
918 if 'flash_playertype=MTV' in xmldata_url:
919 self._downloader.report_warning(u'avoiding MTV player')
921 'http://www.myvideo.de/dynamic/get_player_video_xml.php'
922 '?flash_playertype=D&ID=%s&_countlimit=4&autorun=yes'
# Decrypt the hex-encoded payload with the RC4 key sk (derivation partly
# in missing lines; involves the double-b64-decoded GK and the video id).
926 enc_data = self._download_webpage(xmldata_url, video_id).split('=')[1]
927 enc_data_b = binascii.unhexlify(enc_data)
929 base64.b64decode(base64.b64decode(GK)) +
931 str(video_id).encode('utf-8')
934 dec_data = self.__rc4crypt(enc_data_b, sk)
937 self.report_extraction(video_id)
# RTMP branch: connectionurl present in the decrypted XML.
940 mobj = re.search('connectionurl=\'(.*?)\'', dec_data)
942 video_url = compat_urllib_parse.unquote(mobj.group(1))
943 if 'myvideo2flash' in video_url:
944 self._downloader.report_warning(u'forcing RTMPT ...')
945 video_url = video_url.replace('rtmpe://', 'rtmpt://')
948 # extract non rtmp videos
949 mobj = re.search('path=\'(http.*?)\' source=\'(.*?)\'', dec_data)
951 raise ExtractorError(u'unable to extract url')
952 video_url = compat_urllib_parse.unquote(mobj.group(1)) + compat_urllib_parse.unquote(mobj.group(2))
954 video_file = self._search_regex('source=\'(.*?)\'', dec_data, u'video file')
955 video_file = compat_urllib_parse.unquote(video_file)
# f4m manifests are rewritten to their m3u8 (HLS) counterpart.
957 if not video_file.endswith('f4m'):
958 ppath, prefix = video_file.split('.')
959 video_playpath = '%s:%s' % (prefix, ppath)
960 video_hls_playlist = ''
963 video_hls_playlist = (
964 video_filepath + video_file
965 ).replace('.f4m', '.m3u8')
967 video_swfobj = self._search_regex('swfobject.embedSWF\(\'(.+?)\'', webpage, u'swfobj')
968 video_swfobj = compat_urllib_parse.unquote(video_swfobj)
970 video_title = self._html_search_regex("<h1(?: class='globalHd')?>(.*?)</h1>",
# Result dict fields (the enclosing return statement is not visible here).
979 'title': video_title,
981 'play_path': video_playpath,
982 'video_file': video_file,
983 'video_hls_playlist': video_hls_playlist,
984 'player_url': video_swfobj,
# NOTE(review): partial dump, and this class CONTINUES BEYOND the visible
# excerpt (the _real_extract body is cut off after the `base` assignment).
# Code kept byte-identical; comments only — nothing is guessed.
988 class ComedyCentralIE(InfoExtractor):
989 """Information extractor for The Daily Show and Colbert Report """
991 # urls can be abbreviations like :thedailyshow or :colbert
992 # urls for episodes like:
993 # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
994 # or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
995 # or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
# Verbose regex (re.VERBOSE) — see suitable() below which compiles it so.
996 _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
997 |(https?://)?(www\.)?
998 (?P<showname>thedailyshow|colbertnation)\.com/
999 (full-episodes/(?P<episode>.*)|
1001 (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
1002 |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
# Bitrates the site serves, lowest-to-highest ordering used for selection.
1005 _available_formats = ['3500', '2200', '1700', '1200', '750', '400']
1007 _video_extensions = {
1015 _video_dimensions = {
# Overridden because _VALID_URL needs the re.VERBOSE flag.
1025 def suitable(cls, url):
1026 """Receives a URL and returns True if suitable for this IE."""
1027 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
1029 def _print_formats(self, formats):
1030 print('Available formats:')
1032 print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))
# Resolve shortname/clip/episode URL forms, locate the mtvnservices player
# URI, then walk the MRSS index and per-item mediaGen config for renditions.
1035 def _real_extract(self, url):
1036 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
1038 raise ExtractorError(u'Invalid URL: %s' % url)
1040 if mobj.group('shortname'):
1041 if mobj.group('shortname') in ('tds', 'thedailyshow'):
1042 url = u'http://www.thedailyshow.com/full-episodes/'
1044 url = u'http://www.colbertnation.com/full-episodes/'
1045 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
1046 assert mobj is not None
1048 if mobj.group('clip'):
1049 if mobj.group('showname') == 'thedailyshow':
1050 epTitle = mobj.group('tdstitle')
1052 epTitle = mobj.group('cntitle')
1055 dlNewest = not mobj.group('episode')
1057 epTitle = mobj.group('showname')
1059 epTitle = mobj.group('episode')
1061 self.report_extraction(epTitle)
1062 webpage,htmlHandle = self._download_webpage_handle(url, epTitle)
# Re-match after following redirects to pin down the concrete episode URL.
1064 url = htmlHandle.geturl()
1065 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
1067 raise ExtractorError(u'Invalid redirected URL: ' + url)
1068 if mobj.group('episode') == '':
1069 raise ExtractorError(u'Redirected URL is still not specific: ' + url)
1070 epTitle = mobj.group('episode')
1072 mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)
1074 if len(mMovieParams) == 0:
1075 # The Colbert Report embeds the information in a without
1076 # a URL prefix; so extract the alternate reference
1077 # and then add the URL prefix manually.
1079 altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
1080 if len(altMovieParams) == 0:
1081 raise ExtractorError(u'unable to find Flash URL in webpage ' + url)
1083 mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]
1085 uri = mMovieParams[0][1]
1086 indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
1087 indexXml = self._download_webpage(indexUrl, epTitle,
1088 u'Downloading show index',
1089 u'unable to download episode index')
1093 idoc = xml.etree.ElementTree.fromstring(indexXml)
1094 itemEls = idoc.findall('.//item')
1095 for partNum,itemEl in enumerate(itemEls):
1096 mediaId = itemEl.findall('./guid')[0].text
1097 shortMediaId = mediaId.split(':')[-1]
1098 showId = mediaId.split(':')[-2].replace('.com', '')
1099 officialTitle = itemEl.findall('./title')[0].text
1100 officialDate = unified_strdate(itemEl.findall('./pubDate')[0].text)
1102 configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
1103 compat_urllib_parse.urlencode({'uri': mediaId}))
1104 configXml = self._download_webpage(configUrl, epTitle,
1105 u'Downloading configuration for %s' % shortMediaId)
1107 cdoc = xml.etree.ElementTree.fromstring(configXml)
1109 for rendition in cdoc.findall('.//rendition'):
1110 finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
1114 self._downloader.report_error(u'unable to download ' + mediaId + ': No videos found')
1117 if self._downloader.params.get('listformats', None):
1118 self._print_formats([i[0] for i in turls])
1121 # For now, just pick the highest bitrate
1122 format,rtmp_video_url = turls[-1]
1124 # Get the format arg from the arg stream
1125 req_format = self._downloader.params.get('format', None)
1127 # Select format if we can find one
1130 format, rtmp_video_url = f, v
# Rewrite the RTMP URL onto the progressive-HTTP mirror (continues past
# the end of this visible excerpt).
1133 m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
1135 raise ExtractorError(u'Cannot transform RTMP url')
1136 base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
1137 video_url = base + m.group('finalid')
1139 effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
1144 'upload_date': officialDate,
1149 'description': officialTitle,
1151 results.append(info)
1156 class EscapistIE(InfoExtractor):
1157 """Information extractor for The Escapist """
1159 _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
1160 IE_NAME = u'escapist'
1162 def _real_extract(self, url):
1163 mobj = re.match(self._VALID_URL, url)
1165 raise ExtractorError(u'Invalid URL: %s' % url)
1166 showName = mobj.group('showname')
1167 videoId = mobj.group('episode')
1169 self.report_extraction(videoId)
1170 webpage = self._download_webpage(url, videoId)
1172 videoDesc = self._html_search_regex('<meta name="description" content="([^"]*)"',
1173 webpage, u'description', fatal=False)
1175 imgUrl = self._html_search_regex('<meta property="og:image" content="([^"]*)"',
1176 webpage, u'thumbnail', fatal=False)
1178 playerUrl = self._html_search_regex('<meta property="og:video" content="([^"]*)"',
1179 webpage, u'player url')
1181 title = self._html_search_regex('<meta name="title" content="([^"]*)"',
1182 webpage, u'player url').split(' : ')[-1]
1184 configUrl = self._search_regex('config=(.*)$', playerUrl, u'config url')
1185 configUrl = compat_urllib_parse.unquote(configUrl)
1187 configJSON = self._download_webpage(configUrl, videoId,
1188 u'Downloading configuration',
1189 u'unable to download configuration')
1191 # Technically, it's JavaScript, not JSON
1192 configJSON = configJSON.replace("'", '"')
1195 config = json.loads(configJSON)
1196 except (ValueError,) as err:
1197 raise ExtractorError(u'Invalid JSON in configuration file: ' + compat_str(err))
1199 playlist = config['playlist']
1200 videoUrl = playlist[1]['url']
1205 'uploader': showName,
1206 'upload_date': None,
1209 'thumbnail': imgUrl,
1210 'description': videoDesc,
1211 'player_url': playerUrl,
1216 class CollegeHumorIE(InfoExtractor):
1217 """Information extractor for collegehumor.com"""
1220 _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
1221 IE_NAME = u'collegehumor'
1223 def report_manifest(self, video_id):
1224 """Report information extraction."""
1225 self.to_screen(u'%s: Downloading XML manifest' % video_id)
1227 def _real_extract(self, url):
1228 mobj = re.match(self._VALID_URL, url)
1230 raise ExtractorError(u'Invalid URL: %s' % url)
1231 video_id = mobj.group('videoid')
1236 'upload_date': None,
1239 self.report_extraction(video_id)
1240 xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
1242 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
1243 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1244 raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
1246 mdoc = xml.etree.ElementTree.fromstring(metaXml)
1248 videoNode = mdoc.findall('./video')[0]
1249 info['description'] = videoNode.findall('./description')[0].text
1250 info['title'] = videoNode.findall('./caption')[0].text
1251 info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
1252 manifest_url = videoNode.findall('./file')[0].text
1254 raise ExtractorError(u'Invalid metadata XML file')
1256 manifest_url += '?hdcore=2.10.3'
1257 self.report_manifest(video_id)
1259 manifestXml = compat_urllib_request.urlopen(manifest_url).read()
1260 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1261 raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
1263 adoc = xml.etree.ElementTree.fromstring(manifestXml)
1265 media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
1266 node_id = media_node.attrib['url']
1267 video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
1268 except IndexError as err:
1269 raise ExtractorError(u'Invalid manifest file')
1271 url_pr = compat_urllib_parse_urlparse(manifest_url)
1272 url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'
1279 class XVideosIE(InfoExtractor):
1280 """Information extractor for xvideos.com"""
1282 _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
1283 IE_NAME = u'xvideos'
1285 def _real_extract(self, url):
1286 mobj = re.match(self._VALID_URL, url)
1288 raise ExtractorError(u'Invalid URL: %s' % url)
1289 video_id = mobj.group(1)
1291 webpage = self._download_webpage(url, video_id)
1293 self.report_extraction(video_id)
1296 video_url = compat_urllib_parse.unquote(self._search_regex(r'flv_url=(.+?)&',
1297 webpage, u'video URL'))
1300 video_title = self._html_search_regex(r'<title>(.*?)\s+-\s+XVID',
1303 # Extract video thumbnail
1304 video_thumbnail = self._search_regex(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)',
1305 webpage, u'thumbnail', fatal=False)
1311 'upload_date': None,
1312 'title': video_title,
1314 'thumbnail': video_thumbnail,
1315 'description': None,
1321 class SoundcloudIE(InfoExtractor):
1322 """Information extractor for soundcloud.com
1323 To access the media, the uid of the song and a stream token
1324 must be extracted from the page source and the script must make
1325 a request to media.soundcloud.com/crossdomain.xml. Then
1326 the media can be grabbed by requesting from an url composed
1327 of the stream token and uid
1330 _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
1331 IE_NAME = u'soundcloud'
1333 def report_resolve(self, video_id):
1334 """Report information extraction."""
1335 self.to_screen(u'%s: Resolving id' % video_id)
1337 def _real_extract(self, url):
1338 mobj = re.match(self._VALID_URL, url)
1340 raise ExtractorError(u'Invalid URL: %s' % url)
1342 # extract uploader (which is in the url)
1343 uploader = mobj.group(1)
1344 # extract simple title (uploader + slug of song title)
1345 slug_title = mobj.group(2)
1346 simple_title = uploader + u'-' + slug_title
1347 full_title = '%s/%s' % (uploader, slug_title)
1349 self.report_resolve(full_title)
1351 url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
1352 resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
1353 info_json = self._download_webpage(resolv_url, full_title, u'Downloading info JSON')
1355 info = json.loads(info_json)
1356 video_id = info['id']
1357 self.report_extraction(full_title)
1359 streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
1360 stream_json = self._download_webpage(streams_url, full_title,
1361 u'Downloading stream definitions',
1362 u'unable to download stream definitions')
1364 streams = json.loads(stream_json)
1365 mediaURL = streams['http_mp3_128_url']
1366 upload_date = unified_strdate(info['created_at'])
1371 'uploader': info['user']['username'],
1372 'upload_date': upload_date,
1373 'title': info['title'],
1375 'description': info['description'],
1378 class SoundcloudSetIE(InfoExtractor):
1379 """Information extractor for soundcloud.com sets
1380 To access the media, the uid of the song and a stream token
1381 must be extracted from the page source and the script must make
1382 a request to media.soundcloud.com/crossdomain.xml. Then
1383 the media can be grabbed by requesting from an url composed
1384 of the stream token and uid
1387 _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/sets/([\w\d-]+)'
1388 IE_NAME = u'soundcloud:set'
1390 def report_resolve(self, video_id):
1391 """Report information extraction."""
1392 self.to_screen(u'%s: Resolving id' % video_id)
1394 def _real_extract(self, url):
1395 mobj = re.match(self._VALID_URL, url)
1397 raise ExtractorError(u'Invalid URL: %s' % url)
1399 # extract uploader (which is in the url)
1400 uploader = mobj.group(1)
1401 # extract simple title (uploader + slug of song title)
1402 slug_title = mobj.group(2)
1403 simple_title = uploader + u'-' + slug_title
1404 full_title = '%s/sets/%s' % (uploader, slug_title)
1406 self.report_resolve(full_title)
1408 url = 'http://soundcloud.com/%s/sets/%s' % (uploader, slug_title)
1409 resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
1410 info_json = self._download_webpage(resolv_url, full_title)
1413 info = json.loads(info_json)
1414 if 'errors' in info:
1415 for err in info['errors']:
1416 self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err['error_message']))
1419 self.report_extraction(full_title)
1420 for track in info['tracks']:
1421 video_id = track['id']
1423 streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
1424 stream_json = self._download_webpage(streams_url, video_id, u'Downloading track info JSON')
1426 self.report_extraction(video_id)
1427 streams = json.loads(stream_json)
1428 mediaURL = streams['http_mp3_128_url']
1433 'uploader': track['user']['username'],
1434 'upload_date': unified_strdate(track['created_at']),
1435 'title': track['title'],
1437 'description': track['description'],
1442 class InfoQIE(InfoExtractor):
1443 """Information extractor for infoq.com"""
1444 _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
1446 def _real_extract(self, url):
1447 mobj = re.match(self._VALID_URL, url)
1449 raise ExtractorError(u'Invalid URL: %s' % url)
1451 webpage = self._download_webpage(url, video_id=url)
1452 self.report_extraction(url)
1455 mobj = re.search(r"jsclassref ?= ?'([^']*)'", webpage)
1457 raise ExtractorError(u'Unable to extract video url')
1458 real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
1459 video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id
1462 video_title = self._search_regex(r'contentTitle = "(.*?)";',
1465 # Extract description
1466 video_description = self._html_search_regex(r'<meta name="description" content="(.*)"(?:\s*/)?>',
1467 webpage, u'description', fatal=False)
1469 video_filename = video_url.split('/')[-1]
1470 video_id, extension = video_filename.split('.')
1476 'upload_date': None,
1477 'title': video_title,
1478 'ext': extension, # Extension is always(?) mp4, but seems to be flv
1480 'description': video_description,
1485 class MixcloudIE(InfoExtractor):
1486 """Information extractor for www.mixcloud.com"""
1488 _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
1489 _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
1490 IE_NAME = u'mixcloud'
1492 def report_download_json(self, file_id):
1493 """Report JSON download."""
1494 self.to_screen(u'Downloading json')
1496 def get_urls(self, jsonData, fmt, bitrate='best'):
1497 """Get urls from 'audio_formats' section in json"""
1500 bitrate_list = jsonData[fmt]
1501 if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
1502 bitrate = max(bitrate_list) # select highest
1504 url_list = jsonData[fmt][bitrate]
1505 except TypeError: # we have no bitrate info.
1506 url_list = jsonData[fmt]
1509 def check_urls(self, url_list):
1510 """Returns 1st active url from list"""
1511 for url in url_list:
1513 compat_urllib_request.urlopen(url)
1515 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1520 def _print_formats(self, formats):
1521 print('Available formats:')
1522 for fmt in formats.keys():
1523 for b in formats[fmt]:
1525 ext = formats[fmt][b][0]
1526 print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
1527 except TypeError: # we have no bitrate info
1528 ext = formats[fmt][0]
1529 print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
1532 def _real_extract(self, url):
1533 mobj = re.match(self._VALID_URL, url)
1535 raise ExtractorError(u'Invalid URL: %s' % url)
1536 # extract uploader & filename from url
1537 uploader = mobj.group(1).decode('utf-8')
1538 file_id = uploader + "-" + mobj.group(2).decode('utf-8')
1540 # construct API request
1541 file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
1542 # retrieve .json file with links to files
1543 request = compat_urllib_request.Request(file_url)
1545 self.report_download_json(file_url)
1546 jsonData = compat_urllib_request.urlopen(request).read()
1547 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1548 raise ExtractorError(u'Unable to retrieve file: %s' % compat_str(err))
1551 json_data = json.loads(jsonData)
1552 player_url = json_data['player_swf_url']
1553 formats = dict(json_data['audio_formats'])
1555 req_format = self._downloader.params.get('format', None)
1558 if self._downloader.params.get('listformats', None):
1559 self._print_formats(formats)
1562 if req_format is None or req_format == 'best':
1563 for format_param in formats.keys():
1564 url_list = self.get_urls(formats, format_param)
1566 file_url = self.check_urls(url_list)
1567 if file_url is not None:
1570 if req_format not in formats:
1571 raise ExtractorError(u'Format is not available')
1573 url_list = self.get_urls(formats, req_format)
1574 file_url = self.check_urls(url_list)
1575 format_param = req_format
1578 'id': file_id.decode('utf-8'),
1579 'url': file_url.decode('utf-8'),
1580 'uploader': uploader.decode('utf-8'),
1581 'upload_date': None,
1582 'title': json_data['name'],
1583 'ext': file_url.split('.')[-1].decode('utf-8'),
1584 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
1585 'thumbnail': json_data['thumbnail_url'],
1586 'description': json_data['description'],
1587 'player_url': player_url.decode('utf-8'),
1590 class StanfordOpenClassroomIE(InfoExtractor):
1591 """Information extractor for Stanford's Open ClassRoom"""
1593 _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
1594 IE_NAME = u'stanfordoc'
1596 def _real_extract(self, url):
1597 mobj = re.match(self._VALID_URL, url)
1599 raise ExtractorError(u'Invalid URL: %s' % url)
1601 if mobj.group('course') and mobj.group('video'): # A specific video
1602 course = mobj.group('course')
1603 video = mobj.group('video')
1605 'id': course + '_' + video,
1607 'upload_date': None,
1610 self.report_extraction(info['id'])
1611 baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
1612 xmlUrl = baseUrl + video + '.xml'
1614 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
1615 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1616 raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
1617 mdoc = xml.etree.ElementTree.fromstring(metaXml)
1619 info['title'] = mdoc.findall('./title')[0].text
1620 info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
1622 raise ExtractorError(u'Invalid metadata XML file')
1623 info['ext'] = info['url'].rpartition('.')[2]
1625 elif mobj.group('course'): # A course page
1626 course = mobj.group('course')
1631 'upload_date': None,
1634 coursepage = self._download_webpage(url, info['id'],
1635 note='Downloading course info page',
1636 errnote='Unable to download course info page')
1638 info['title'] = self._html_search_regex('<h1>([^<]+)</h1>', coursepage, 'title', default=info['id'])
1640 info['description'] = self._html_search_regex('<description>([^<]+)</description>',
1641 coursepage, u'description', fatal=False)
1643 links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
1646 'type': 'reference',
1647 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
1651 for entry in info['list']:
1652 assert entry['type'] == 'reference'
1653 results += self.extract(entry['url'])
1657 'id': 'Stanford OpenClassroom',
1660 'upload_date': None,
1663 self.report_download_webpage(info['id'])
1664 rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
1666 rootpage = compat_urllib_request.urlopen(rootURL).read()
1667 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1668 raise ExtractorError(u'Unable to download course info page: ' + compat_str(err))
1670 info['title'] = info['id']
1672 links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
1675 'type': 'reference',
1676 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
1681 for entry in info['list']:
1682 assert entry['type'] == 'reference'
1683 results += self.extract(entry['url'])
1686 class MTVIE(InfoExtractor):
1687 """Information extractor for MTV.com"""
1689 _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
1692 def _real_extract(self, url):
1693 mobj = re.match(self._VALID_URL, url)
1695 raise ExtractorError(u'Invalid URL: %s' % url)
1696 if not mobj.group('proto'):
1697 url = 'http://' + url
1698 video_id = mobj.group('videoid')
1700 webpage = self._download_webpage(url, video_id)
1702 song_name = self._html_search_regex(r'<meta name="mtv_vt" content="([^"]+)"/>',
1703 webpage, u'song name', fatal=False)
1705 video_title = self._html_search_regex(r'<meta name="mtv_an" content="([^"]+)"/>',
1708 mtvn_uri = self._html_search_regex(r'<meta name="mtvn_uri" content="([^"]+)"/>',
1709 webpage, u'mtvn_uri', fatal=False)
1711 content_id = self._search_regex(r'MTVN.Player.defaultPlaylistId = ([0-9]+);',
1712 webpage, u'content id', fatal=False)
1714 videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
1715 self.report_extraction(video_id)
1716 request = compat_urllib_request.Request(videogen_url)
1718 metadataXml = compat_urllib_request.urlopen(request).read()
1719 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1720 raise ExtractorError(u'Unable to download video metadata: %s' % compat_str(err))
1722 mdoc = xml.etree.ElementTree.fromstring(metadataXml)
1723 renditions = mdoc.findall('.//rendition')
1725 # For now, always pick the highest quality.
1726 rendition = renditions[-1]
1729 _,_,ext = rendition.attrib['type'].partition('/')
1730 format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
1731 video_url = rendition.find('./src').text
1733 raise ExtractorError('Invalid rendition field.')
1738 'uploader': performer,
1739 'upload_date': None,
1740 'title': video_title,
1748 class YoukuIE(InfoExtractor):
1749 _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'
1752 nowTime = int(time.time() * 1000)
1753 random1 = random.randint(1000,1998)
1754 random2 = random.randint(1000,9999)
1756 return "%d%d%d" %(nowTime,random1,random2)
1758 def _get_file_ID_mix_string(self, seed):
1760 source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
1762 for i in range(len(source)):
1763 seed = (seed * 211 + 30031 ) % 65536
1764 index = math.floor(seed / 65536 * len(source) )
1765 mixed.append(source[int(index)])
1766 source.remove(source[int(index)])
1767 #return ''.join(mixed)
1770 def _get_file_id(self, fileId, seed):
1771 mixed = self._get_file_ID_mix_string(seed)
1772 ids = fileId.split('*')
1776 realId.append(mixed[int(ch)])
1777 return ''.join(realId)
1779 def _real_extract(self, url):
1780 mobj = re.match(self._VALID_URL, url)
1782 raise ExtractorError(u'Invalid URL: %s' % url)
1783 video_id = mobj.group('ID')
1785 info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id
1787 jsondata = self._download_webpage(info_url, video_id)
1789 self.report_extraction(video_id)
1791 config = json.loads(jsondata)
1793 video_title = config['data'][0]['title']
1794 seed = config['data'][0]['seed']
1796 format = self._downloader.params.get('format', None)
1797 supported_format = list(config['data'][0]['streamfileids'].keys())
1799 if format is None or format == 'best':
1800 if 'hd2' in supported_format:
1805 elif format == 'worst':
1813 fileid = config['data'][0]['streamfileids'][format]
1814 keys = [s['k'] for s in config['data'][0]['segs'][format]]
1815 except (UnicodeDecodeError, ValueError, KeyError):
1816 raise ExtractorError(u'Unable to extract info section')
1819 sid = self._gen_sid()
1820 fileid = self._get_file_id(fileid, seed)
1822 #column 8,9 of fileid represent the segment number
1823 #fileid[7:9] should be changed
1824 for index, key in enumerate(keys):
1826 temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
1827 download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)
1830 'id': '%s_part%02d' % (video_id, index),
1831 'url': download_url,
1833 'upload_date': None,
1834 'title': video_title,
1837 files_info.append(info)
1842 class XNXXIE(InfoExtractor):
1843 """Information extractor for xnxx.com"""
1845 _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
1847 VIDEO_URL_RE = r'flv_url=(.*?)&'
1848 VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
1849 VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&'
1851 def _real_extract(self, url):
1852 mobj = re.match(self._VALID_URL, url)
1854 raise ExtractorError(u'Invalid URL: %s' % url)
1855 video_id = mobj.group(1)
1857 # Get webpage content
1858 webpage = self._download_webpage(url, video_id)
1860 video_url = self._search_regex(self.VIDEO_URL_RE,
1861 webpage, u'video URL')
1862 video_url = compat_urllib_parse.unquote(video_url)
1864 video_title = self._html_search_regex(self.VIDEO_TITLE_RE,
1867 video_thumbnail = self._search_regex(self.VIDEO_THUMB_RE,
1868 webpage, u'thumbnail', fatal=False)
1874 'upload_date': None,
1875 'title': video_title,
1877 'thumbnail': video_thumbnail,
1878 'description': None,
1882 class GooglePlusIE(InfoExtractor):
1883 """Information extractor for plus.google.com."""
1885 _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
1886 IE_NAME = u'plus.google'
1888 def _real_extract(self, url):
1889 # Extract id from URL
1890 mobj = re.match(self._VALID_URL, url)
1892 raise ExtractorError(u'Invalid URL: %s' % url)
1894 post_url = mobj.group(0)
1895 video_id = mobj.group(1)
1897 video_extension = 'flv'
1899 # Step 1, Retrieve post webpage to extract further information
1900 webpage = self._download_webpage(post_url, video_id, u'Downloading entry webpage')
1902 self.report_extraction(video_id)
1904 # Extract update date
1905 upload_date = self._html_search_regex('title="Timestamp">(.*?)</a>',
1906 webpage, u'upload date', fatal=False)
1908 # Convert timestring to a format suitable for filename
1909 upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
1910 upload_date = upload_date.strftime('%Y%m%d')
1913 uploader = self._html_search_regex(r'rel\="author".*?>(.*?)</a>',
1914 webpage, u'uploader', fatal=False)
1917 # Get the first line for title
1918 video_title = self._html_search_regex(r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]',
1919 webpage, 'title', default=u'NA')
1921 # Step 2, Stimulate clicking the image box to launch video
1922 video_page = self._search_regex('"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]',
1923 webpage, u'video page URL')
1924 webpage = self._download_webpage(video_page, video_id, u'Downloading video page')
1926 # Extract video links on video page
1927 """Extract video links of all sizes"""
1928 pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
1929 mobj = re.findall(pattern, webpage)
1931 raise ExtractorError(u'Unable to extract video links')
1933 # Sort in resolution
1934 links = sorted(mobj)
1936 # Choose the lowest of the sort, i.e. highest resolution
1937 video_url = links[-1]
1938 # Only get the url. The resolution part in the tuple has no use anymore
1939 video_url = video_url[-1]
1940 # Treat escaped \u0026 style hex
1942 video_url = video_url.decode("unicode_escape")
1943 except AttributeError: # Python 3
1944 video_url = bytes(video_url, 'ascii').decode('unicode-escape')
1950 'uploader': uploader,
1951 'upload_date': upload_date,
1952 'title': video_title,
1953 'ext': video_extension,
1956 class NBAIE(InfoExtractor):
1957 _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*?)(?:/index\.html)?(?:\?.*)?$'
1960 def _real_extract(self, url):
1961 mobj = re.match(self._VALID_URL, url)
1963 raise ExtractorError(u'Invalid URL: %s' % url)
1965 video_id = mobj.group(1)
1967 webpage = self._download_webpage(url, video_id)
1969 video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'
1971 shortened_video_id = video_id.rpartition('/')[2]
1972 title = self._html_search_regex(r'<meta property="og:title" content="(.*?)"',
1973 webpage, 'title', default=shortened_video_id).replace('NBA.com: ', '')
1975 # It isn't there in the HTML it returns to us
1976 # uploader_date = self._html_search_regex(r'<b>Date:</b> (.*?)</div>', webpage, 'upload_date', fatal=False)
1978 description = self._html_search_regex(r'<meta name="description" (?:content|value)="(.*?)" />', webpage, 'description', fatal=False)
1981 'id': shortened_video_id,
1985 # 'uploader_date': uploader_date,
1986 'description': description,
1990 class JustinTVIE(InfoExtractor):
1991 """Information extractor for justin.tv and twitch.tv"""
1992 # TODO: One broadcast may be split into multiple videos. The key
1993 # 'broadcast_id' is the same for all parts, and 'broadcast_part'
1994 # starts at 1 and increases. Can we treat all parts as one video?
1996 _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
1998 (?P<channelid>[^/]+)|
1999 (?:(?:[^/]+)/b/(?P<videoid>[^/]+))|
2000 (?:(?:[^/]+)/c/(?P<chapterid>[^/]+))
2004 _JUSTIN_PAGE_LIMIT = 100
2005 IE_NAME = u'justin.tv'
2007 def report_download_page(self, channel, offset):
2008 """Report attempt to download a single page of videos."""
2009 self.to_screen(u'%s: Downloading video information from %d to %d' %
2010 (channel, offset, offset + self._JUSTIN_PAGE_LIMIT))
2012 # Return count of items, list of *valid* items
2013 def _parse_page(self, url, video_id):
2014 webpage = self._download_webpage(url, video_id,
2015 u'Downloading video info JSON',
2016 u'unable to download video info JSON')
2018 response = json.loads(webpage)
2019 if type(response) != list:
2020 error_text = response.get('error', 'unknown error')
2021 raise ExtractorError(u'Justin.tv API: %s' % error_text)
2023 for clip in response:
2024 video_url = clip['video_file_url']
2026 video_extension = os.path.splitext(video_url)[1][1:]
2027 video_date = re.sub('-', '', clip['start_time'][:10])
2028 video_uploader_id = clip.get('user_id', clip.get('channel_id'))
2029 video_id = clip['id']
2030 video_title = clip.get('title', video_id)
2034 'title': video_title,
2035 'uploader': clip.get('channel_name', video_uploader_id),
2036 'uploader_id': video_uploader_id,
2037 'upload_date': video_date,
2038 'ext': video_extension,
2040 return (len(response), info)
2042 def _real_extract(self, url):
2043 mobj = re.match(self._VALID_URL, url)
2045 raise ExtractorError(u'invalid URL: %s' % url)
2047 api_base = 'http://api.justin.tv'
2049 if mobj.group('channelid'):
2051 video_id = mobj.group('channelid')
2052 api = api_base + '/channel/archives/%s.json' % video_id
2053 elif mobj.group('chapterid'):
2054 chapter_id = mobj.group('chapterid')
2056 webpage = self._download_webpage(url, chapter_id)
2057 m = re.search(r'PP\.archive_id = "([0-9]+)";', webpage)
2059 raise ExtractorError(u'Cannot find archive of a chapter')
2060 archive_id = m.group(1)
2062 api = api_base + '/broadcast/by_chapter/%s.xml' % chapter_id
2063 chapter_info_xml = self._download_webpage(api, chapter_id,
2064 note=u'Downloading chapter information',
2065 errnote=u'Chapter information download failed')
2066 doc = xml.etree.ElementTree.fromstring(chapter_info_xml)
2067 for a in doc.findall('.//archive'):
2068 if archive_id == a.find('./id').text:
2071 raise ExtractorError(u'Could not find chapter in chapter information')
2073 video_url = a.find('./video_file_url').text
2074 video_ext = video_url.rpartition('.')[2] or u'flv'
2076 chapter_api_url = u'https://api.twitch.tv/kraken/videos/c' + chapter_id
2077 chapter_info_json = self._download_webpage(chapter_api_url, u'c' + chapter_id,
2078 note='Downloading chapter metadata',
2079 errnote='Download of chapter metadata failed')
2080 chapter_info = json.loads(chapter_info_json)
2082 bracket_start = int(doc.find('.//bracket_start').text)
2083 bracket_end = int(doc.find('.//bracket_end').text)
2085 # TODO determine start (and probably fix up file)
2086 # youtube-dl -v http://www.twitch.tv/firmbelief/c/1757457
2087 #video_url += u'?start=' + TODO:start_timestamp
2088 # bracket_start is 13290, but we want 51670615
2089 self._downloader.report_warning(u'Chapter detected, but we can just download the whole file. '
2090 u'Chapter starts at %s and ends at %s' % (formatSeconds(bracket_start), formatSeconds(bracket_end)))
2093 'id': u'c' + chapter_id,
2096 'title': chapter_info['title'],
2097 'thumbnail': chapter_info['preview'],
2098 'description': chapter_info['description'],
2099 'uploader': chapter_info['channel']['display_name'],
2100 'uploader_id': chapter_info['channel']['name'],
2104 video_id = mobj.group('videoid')
2105 api = api_base + '/broadcast/by_archive/%s.json' % video_id
2107 self.report_extraction(video_id)
2111 limit = self._JUSTIN_PAGE_LIMIT
2114 self.report_download_page(video_id, offset)
2115 page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
2116 page_count, page_info = self._parse_page(page_url, video_id)
2117 info.extend(page_info)
2118 if not paged or page_count != limit:
class FunnyOrDieIE(InfoExtractor):
    """Information extractor for funnyordie.com."""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'invalid URL: %s' % url)

        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)

        # The downloadable file is the second <source> inside the <video> tag.
        video_url = self._html_search_regex(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"',
            webpage, u'video URL', flags=re.DOTALL)

        # Prefer the player headline; fall back to the page <title>.
        title = self._html_search_regex((r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>",
            r'<title>(?P<title>[^<]+?)</title>'), webpage, 'title', flags=re.DOTALL)

        video_description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
            webpage, u'description', fatal=False, flags=re.DOTALL)

        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'description': video_description,
        }
        return [info]
class SteamIE(InfoExtractor):
    """Information extractor for Steam store video pages (one playlist per game)."""
    _VALID_URL = r"""http://store\.steampowered\.com/
                (agecheck/)?
                (?P<urltype>video|app)/ #If the page is only for videos or for a game
                (?P<gameID>\d+)/?
                (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
                """
    _VIDEO_PAGE_TEMPLATE = 'http://store.steampowered.com/video/%s/'
    _AGECHECK_TEMPLATE = 'http://store.steampowered.com/agecheck/video/%s/?snr=1_agecheck_agecheck__age-gate&ageDay=1&ageMonth=January&ageYear=1970'

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overridden because _VALID_URL is a verbose-mode regex.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url, re.VERBOSE)
        gameID = m.group('gameID')

        videourl = self._VIDEO_PAGE_TEMPLATE % gameID
        webpage = self._download_webpage(videourl, gameID)

        # Age-gated titles: re-request through the agecheck URL with a fixed DOB.
        if re.search('<h2>Please enter your birth date to continue:</h2>', webpage) is not None:
            videourl = self._AGECHECK_TEMPLATE % gameID
            self.report_age_confirmation()
            webpage = self._download_webpage(videourl, gameID)

        self.report_extraction(gameID)
        game_title = self._html_search_regex(r'<h2 class="pageheader">(.*?)</h2>',
                                             webpage, 'game title')

        # Three parallel scans of the page: movie JS objects, titles, thumbnails.
        urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
        mweb = re.finditer(urlRE, webpage)
        namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
        titles = re.finditer(namesRE, webpage)
        thumbsRE = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'
        thumbs = re.finditer(thumbsRE, webpage)

        videos = []
        for vid, vtitle, thumb in zip(mweb, titles, thumbs):
            video_id = vid.group('videoID')
            title = vtitle.group('videoName')
            video_url = vid.group('videoURL')
            video_thumb = thumb.group('thumbnail')
            if not video_url:
                raise ExtractorError(u'Cannot find video url for %s' % video_id)
            videos.append({
                'id': video_id,
                'url': video_url,
                'ext': 'flv',
                'title': unescapeHTML(title),
                'thumbnail': video_thumb
            })
        return [self.playlist_result(videos, gameID, game_title)]
class UstreamIE(InfoExtractor):
    """Information extractor for recorded ustream.tv videos."""
    _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
    IE_NAME = u'ustream'

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url)
        video_id = m.group('videoID')

        # The media URL is derived from the id; the page is only used for metadata.
        video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
        webpage = self._download_webpage(url, video_id)

        self.report_extraction(video_id)

        video_title = self._html_search_regex(r'data-title="(?P<title>.+)"',
            webpage, u'title')

        uploader = self._html_search_regex(r'data-content-type="channel".*?>(?P<uploader>.*?)</a>',
            webpage, u'uploader', fatal=False, flags=re.DOTALL)

        thumbnail = self._html_search_regex(r'<link rel="image_src" href="(?P<thumb>.*?)"',
            webpage, u'thumbnail', fatal=False)

        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'flv',
            'title': video_title,
            'uploader': uploader,
            'thumbnail': thumbnail,
        }
        return info
class WorldStarHipHopIE(InfoExtractor):
    """Information extractor for worldstarhiphop.com (and the candy mirror)."""
    _VALID_URL = r'https?://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
    IE_NAME = u'WorldStarHipHop'

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url)
        video_id = m.group('id')

        webpage_src = self._download_webpage(url, video_id)

        video_url = self._search_regex(r'so\.addVariable\("file","(.*?)"\)',
            webpage_src, u'video URL')

        # The file URL itself tells us the container format.
        if 'mp4' in video_url:
            ext = 'mp4'
        else:
            ext = 'flv'

        video_title = self._html_search_regex(r"<title>(.*)</title>",
            webpage_src, u'title')

        # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video.
        thumbnail = self._html_search_regex(r'rel="image_src" href="(.*)" />',
            webpage_src, u'thumbnail', fatal=False)

        if not thumbnail:
            _title = r"""candytitles.*>(.*)</span>"""
            mobj = re.search(_title, webpage_src)
            if mobj is not None:
                video_title = mobj.group(1)

        results = [{
            'id': video_id,
            'url': video_url,
            'title': video_title,
            'thumbnail': thumbnail,
            'ext': ext,
        }]
        return results
class RBMARadioIE(InfoExtractor):
    """Information extractor for rbmaradio.com shows."""
    _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url)
        video_id = m.group('videoID')

        webpage = self._download_webpage(url, video_id)

        # Show metadata lives in an inline "gon.show=..." JS assignment.
        json_data = self._search_regex(r'window\.gon.*?gon\.show=(.+?);$',
            webpage, u'json data', flags=re.MULTILINE)

        try:
            data = json.loads(json_data)
        except ValueError as e:
            raise ExtractorError(u'Invalid JSON: ' + str(e))

        # Append a bitrate selector to the Akamai stream URL.
        video_url = data['akamai_url'] + '&cbr=256'
        url_parts = compat_urllib_parse_urlparse(video_url)
        video_ext = url_parts.path.rpartition('.')[2]
        info = {
            'id': video_id,
            'url': video_url,
            'ext': video_ext,
            'title': data['title'],
            'description': data.get('teaser_text'),
            'location': data.get('country_of_origin'),
            'uploader': data.get('host', {}).get('name'),
            'uploader_id': data.get('host', {}).get('slug'),
            'thumbnail': data.get('image', {}).get('large_url_2x'),
            'duration': data.get('duration'),
        }
        return [info]
class YouPornIE(InfoExtractor):
    """Information extractor for youporn.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'

    def _print_formats(self, formats):
        """Print all available formats"""
        print(u'Available formats:')
        print(u'ext\t\tformat')
        print(u'---------------------------------')
        for format in formats:
            print(u'%s\t\t%s' % (format['ext'], format['format']))

    def _specific(self, req_format, formats):
        # Return the format dict matching the requested format id, or None.
        for x in formats:
            if(x["format"]==req_format):
                return x
        return None

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('videoid')

        # The site requires an age-verification cookie before serving the page.
        req = compat_urllib_request.Request(url)
        req.add_header('Cookie', 'age_verified=1')
        webpage = self._download_webpage(req, video_id)

        # Get JSON parameters
        json_params = self._search_regex(r'var currentVideo = new Video\((.*)\);', webpage, u'JSON parameters')
        try:
            params = json.loads(json_params)
        except ValueError:
            # BUGFIX: narrowed from a bare except, which also swallowed
            # KeyboardInterrupt/SystemExit.
            raise ExtractorError(u'Invalid JSON')

        self.report_extraction(video_id)
        try:
            video_title = params['title']
            upload_date = unified_strdate(params['release_date_f'])
            video_description = params['description']
            video_uploader = params['submitted_by']
            thumbnail = params['thumbnails'][0]['image']
        except KeyError:
            # BUGFIX: the original concatenated a str with the exception object
            # itself ('...' + sys.exc_info()[1]), which raises TypeError instead
            # of the intended ExtractorError.
            raise ExtractorError('Missing JSON parameter: ' + str(sys.exc_info()[1]))

        # Get all of the formats available
        DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
        download_list_html = self._search_regex(DOWNLOAD_LIST_RE,
            webpage, u'download list').strip()

        # Get all of the links from the page
        LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
        links = re.findall(LINK_RE, download_list_html)
        if not links:
            raise ExtractorError(u'ERROR: no known formats available for video')

        self.to_screen(u'Links found: %d' % len(links))

        formats = []
        for link in links:
            # A link looks like this:
            # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
            # A path looks like this:
            # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
            video_url = unescapeHTML( link )
            path = compat_urllib_parse_urlparse( video_url ).path
            extension = os.path.splitext( path )[1][1:]
            # The 5th path component encodes "<resolution>_<bitrate>_...".
            format = path.split('/')[4].split('_')[:2]
            format = "-".join( format )
            # title = u'%s-%s-%s' % (video_title, size, bitrate)

            formats.append({
                'id': video_id,
                'url': video_url,
                'uploader': video_uploader,
                'upload_date': upload_date,
                'title': video_title,
                'ext': extension,
                'format': format,
                'thumbnail': thumbnail,
                'description': video_description
            })

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        req_format = self._downloader.params.get('format', None)
        self.to_screen(u'Format: %s' % req_format)

        if req_format is None or req_format == 'best':
            return [formats[0]]
        elif req_format == 'worst':
            return [formats[-1]]
        elif req_format in ('-1', 'all'):
            return formats
        else:
            format = self._specific( req_format, formats )
            if format is None:
                raise ExtractorError(u'Requested format not available')
            return [format]
class PornotubeIE(InfoExtractor):
    """Information extractor for pornotube.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # The title comes straight from the URL path.
        video_id = mobj.group('videoid')
        video_title = mobj.group('title')

        # Get webpage content
        webpage = self._download_webpage(url, video_id)

        # Get the video URL
        VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
        video_url = self._search_regex(VIDEO_URL_RE, webpage, u'video url')
        video_url = compat_urllib_parse.unquote(video_url)

        #Get the uploaded date
        VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
        upload_date = self._html_search_regex(VIDEO_UPLOADED_RE, webpage, u'upload date', fatal=False)
        if upload_date: upload_date = unified_strdate(upload_date)

        info = {'id': video_id,
                'url': video_url,
                'uploader': None,
                'upload_date': upload_date,
                'title': video_title,
                'ext': 'flv',
                'format': 'flv'}

        return [info]
class YouJizzIE(InfoExtractor):
    """Information extractor for youjizz.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('videoid')

        # Get webpage content
        webpage = self._download_webpage(url, video_id)

        # Get the video title
        video_title = self._html_search_regex(r'<title>(?P<title>.*)</title>',
            webpage, u'title').strip()

        # Get the embed page; the numeric id there supersedes the slug from the URL.
        result = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
        if result is None:
            raise ExtractorError(u'ERROR: unable to extract embed page')

        embed_page_url = result.group(0).strip()
        video_id = result.group('videoid')

        webpage = self._download_webpage(embed_page_url, video_id)

        # Get the video URL from the embedded player setup.
        video_url = self._search_regex(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);',
            webpage, u'video URL')

        info = {'id': video_id,
                'url': video_url,
                'title': video_title,
                'ext': 'flv',
                'format': 'flv',
                'player_url': embed_page_url}

        return [info]
class EightTracksIE(InfoExtractor):
    """Information extractor for 8tracks.com mixes (one entry per track)."""
    _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        playlist_id = mobj.group('id')

        webpage = self._download_webpage(url, playlist_id)

        json_like = self._search_regex(r"PAGE.mix = (.*?);\n", webpage, u'trax information', flags=re.DOTALL)
        data = json.loads(json_like)

        # The play API wants a random session id; tracks are fetched one by one.
        session = str(random.randint(0, 1000000000))
        mix_id = data['id']
        track_count = data['tracks_count']
        first_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
        next_url = first_url
        res = []
        for i in itertools.count():
            api_json = self._download_webpage(next_url, playlist_id,
                note=u'Downloading song information %s/%s' % (str(i+1), track_count),
                errnote=u'Failed to download song information')
            api_data = json.loads(api_json)
            track_data = api_data[u'set']['track']
            info = {
                'id': track_data['id'],
                'url': track_data['track_file_stream_url'],
                'title': track_data['performer'] + u' - ' + track_data['name'],
                'raw_title': track_data['name'],
                'uploader_id': data['user']['login'],
                'ext': 'm4a',
            }
            res.append(info)
            # Stop once the API reports the final track of the set.
            if api_data['set']['at_last_track']:
                break
            next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
        return res
class KeekIE(InfoExtractor):
    """Information extractor for keek.com short videos."""
    _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
    IE_NAME = u'keek'

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url)
        video_id = m.group('videoID')

        # Media and thumbnail URLs are fully determined by the video id.
        video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
        thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id
        webpage = self._download_webpage(url, video_id)

        video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
            webpage, u'title')

        uploader = self._html_search_regex(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>',
            webpage, u'uploader', fatal=False)

        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': video_title,
            'thumbnail': thumbnail,
            'uploader': uploader
        }
        return [info]
class TEDIE(InfoExtractor):
    """Information extractor for ted.com talks and playlists."""
    _VALID_URL=r'''http://www\.ted\.com/
                   (
                        ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
                        |
                        ((?P<type_talk>talks)) # We have a simple talk
                   )
                   (/lang/(.*?))? # The url may contain the language
                   /(?P<name>\w+) # Here goes the name and then ".html"
                   '''

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overridden because _VALID_URL is a verbose-mode regex.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        m=re.match(self._VALID_URL, url, re.VERBOSE)
        if m.group('type_talk'):
            return [self._talk_info(url)]
        else:
            playlist_id=m.group('playlist_id')
            name=m.group('name')
            self.to_screen(u'Getting info of playlist %s: "%s"' % (playlist_id,name))
            return [self._playlist_videos_info(url,name,playlist_id)]

    def _playlist_videos_info(self,url,name,playlist_id=0):
        '''Returns the videos of the playlist'''
        video_RE=r'''
                     <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
                     ([.\s]*?)data-playlist_item_id="(\d+)"
                     ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
                     '''
        video_name_RE=r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
        webpage=self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
        m_videos=re.finditer(video_RE,webpage,re.VERBOSE)
        m_names=re.finditer(video_name_RE,webpage)

        playlist_title = self._html_search_regex(r'div class="headline">\s*?<h1>\s*?<span>(.*?)</span>',
                                                 webpage, 'playlist title')

        # Delegate each talk to this same extractor via url_result.
        playlist_entries = []
        for m_video, m_name in zip(m_videos,m_names):
            video_id=m_video.group('video_id')
            talk_url='http://www.ted.com%s' % m_name.group('talk_url')
            playlist_entries.append(self.url_result(talk_url, 'TED'))
        return self.playlist_result(playlist_entries, playlist_id = playlist_id, playlist_title = playlist_title)

    def _talk_info(self, url, video_id=0):
        """Return the video for the talk in the url"""
        m = re.match(self._VALID_URL, url,re.VERBOSE)
        video_name = m.group('name')
        webpage = self._download_webpage(url, video_id, 'Downloading \"%s\" page' % video_name)
        self.report_extraction(video_name)
        # If the url includes the language we get the title translated
        title = self._html_search_regex(r'<span id="altHeadline" >(?P<title>.*)</span>',
                                        webpage, 'title')
        json_data = self._search_regex(r'<script.*?>var talkDetails = ({.*?})</script>',
                                       webpage, 'json data')
        info = json.loads(json_data)
        desc = self._html_search_regex(r'<div class="talk-intro">.*?<p.*?>(.*?)</p>',
                                       webpage, 'description', flags = re.DOTALL)

        thumbnail = self._search_regex(r'</span>[\s.]*</div>[\s.]*<img src="(.*?)"',
                                       webpage, 'thumbnail')
        # The last htmlStream is the highest-quality one.
        info = {
            'id': video_id,
            'url': info['htmlStreams'][-1]['file'],
            'ext': 'mp4',
            'title': title,
            'thumbnail': thumbnail,
            'description': desc,
        }
        return info
class MySpassIE(InfoExtractor):
    """Information extractor for myspass.de (metadata comes from an XML API)."""
    _VALID_URL = r'http://www.myspass.de/.*'

    def _real_extract(self, url):
        META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'

        # video id is the last path element of the URL
        # usually there is a trailing slash, so also try the second but last
        url_path = compat_urllib_parse_urlparse(url).path
        url_parent_path, video_id = os.path.split(url_path)
        if not video_id:
            _, video_id = os.path.split(url_parent_path)

        # get metadata
        metadata_url = META_DATA_URL_TEMPLATE % video_id
        metadata_text = self._download_webpage(metadata_url, video_id)
        metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))

        # extract values from metadata
        url_flv_el = metadata.find('url_flv')
        if url_flv_el is None:
            raise ExtractorError(u'Unable to extract download url')
        video_url = url_flv_el.text
        extension = os.path.splitext(video_url)[1][1:]
        title_el = metadata.find('title')
        if title_el is None:
            raise ExtractorError(u'Unable to extract title')
        title = title_el.text
        format_id_el = metadata.find('format_id')
        if format_id_el is None:
            # No explicit format id in the metadata; fall back to the extension.
            format = extension
        else:
            format = format_id_el.text
        description_el = metadata.find('description')
        if description_el is not None:
            description = description_el.text
        else:
            description = None
        imagePreview_el = metadata.find('imagePreview')
        if imagePreview_el is not None:
            thumbnail = imagePreview_el.text
        else:
            thumbnail = None
        info = {
            'id': video_id,
            'url': video_url,
            'title': title,
            'ext': extension,
            'format': format,
            'thumbnail': thumbnail,
            'description': description
        }
        return info
class SpiegelIE(InfoExtractor):
    """Information extractor for spiegel.de videos (stream list in a per-id XML)."""
    _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url)
        video_id = m.group('videoID')

        webpage = self._download_webpage(url, video_id)

        video_title = self._html_search_regex(r'<div class="module-title">(.*?)</div>',
            webpage, u'title')

        xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
        xml_code = self._download_webpage(xml_url, video_id,
            note=u'Downloading XML', errnote=u'Failed to download XML')

        idoc = xml.etree.ElementTree.fromstring(xml_code)
        # The last <type> entry in the document is the best quality.
        last_type = idoc[-1]
        filename = last_type.findall('./filename')[0].text
        duration = float(last_type.findall('./duration')[0].text)

        video_url = 'http://video2.spiegel.de/flash/' + filename
        video_ext = filename.rpartition('.')[2]
        info = {
            'id': video_id,
            'url': video_url,
            'ext': video_ext,
            'title': video_title,
            'duration': duration,
        }
        return [info]
class LiveLeakIE(InfoExtractor):
    """Information extractor for liveleak.com view pages."""
    _VALID_URL = r'^(?:http?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
    IE_NAME = u'liveleak'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('video_id')

        webpage = self._download_webpage(url, video_id)

        video_url = self._search_regex(r'file: "(.*?)",',
            webpage, u'video URL')

        # Strip the site-name prefix that LiveLeak puts in og:title.
        video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
            webpage, u'title').replace('LiveLeak.com -', '').strip()

        video_description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
            webpage, u'description', fatal=False)

        video_uploader = self._html_search_regex(r'By:.*?(\w+)</a>',
            webpage, u'uploader', fatal=False)

        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': video_title,
            'description': video_description,
            'uploader': video_uploader
        }

        return [info]
class ARDIE(InfoExtractor):
    """Information extractor for the ARD/Das Erste Mediathek."""
    _VALID_URL = r'^(?:https?://)?(?:(?:www\.)?ardmediathek\.de|mediathek\.daserste\.de)/(?:.*/)(?P<video_id>[^/\?]+)(?:\?.*)?'
    _TITLE = r'<h1(?: class="boxTopHeadline")?>(?P<title>.*)</h1>'
    _MEDIA_STREAM = r'mediaCollection\.addMediaStream\((?P<media_type>\d+), (?P<quality>\d+), "(?P<rtmp_url>[^"]*)", "(?P<video_url>[^"]*)", "[^"]*"\)'

    def _real_extract(self, url):
        # determine video id from url
        m = re.match(self._VALID_URL, url)

        # A documentId query parameter takes precedence over the path element.
        numid = re.search(r'documentId=([0-9]+)', url)
        if numid:
            video_id = numid.group(1)
        else:
            video_id = m.group('video_id')

        # determine title and media streams from webpage
        html = self._download_webpage(url, video_id)
        title = re.search(self._TITLE, html).group('title')
        streams = [mo.groupdict() for mo in re.finditer(self._MEDIA_STREAM, html)]
        if not streams:
            # No streams at all means the page is FSK age-restricted.
            assert '"fsk"' in html
            raise ExtractorError(u'This video is only available after 8:00 pm')

        # choose default media type and highest quality for now
        stream = max([s for s in streams if int(s["media_type"]) == 0],
                     key=lambda s: int(s["quality"]))

        # there's two possibilities: RTMP stream or HTTP download
        info = {'id': video_id, 'title': title, 'ext': 'mp4'}
        if stream['rtmp_url']:
            self.to_screen(u'RTMP download detected')
            assert stream['video_url'].startswith('mp4:')
            info["url"] = stream["rtmp_url"]
            info["play_path"] = stream['video_url']
        else:
            assert stream["video_url"].endswith('.mp4')
            info["url"] = stream["video_url"]
        return [info]
class ZDFIE(InfoExtractor):
    """Information extractor for the ZDF Mediathek."""
    _VALID_URL = r'^http://www\.zdf\.de\/ZDFmediathek\/(.*beitrag\/video\/)(?P<video_id>[^/\?]+)(?:\?.*)?'
    _TITLE = r'<h1(?: class="beitragHeadline")?>(?P<title>.*)</h1>'
    _MEDIA_STREAM = r'<a href="(?P<video_url>.+(?P<media_type>.streaming).+/zdf/(?P<quality>[^\/]+)/[^"]*)".+class="play".+>'
    _MMS_STREAM = r'href="(?P<video_url>mms://[^"]*)"'
    _RTSP_STREAM = r'(?P<video_url>rtsp://[^"]*.mp4)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('video_id')

        html = self._download_webpage(url, video_id)
        streams = [m.groupdict() for m in re.finditer(self._MEDIA_STREAM, html)]
        # BUGFIX: a list comprehension is never None; test emptiness instead.
        if not streams:
            raise ExtractorError(u'No media url found.')

        # s['media_type'] == 'wstreaming' -> use 'Windows Media Player' and mms url
        # s['media_type'] == 'hstreaming' -> use 'Quicktime' and rtsp url
        # choose first/default media type and highest quality for now
        # BUGFIX: stream_ must be initialized, otherwise the guard below raised
        # UnboundLocalError instead of the intended ExtractorError when no
        # stream matched.
        stream_ = None
        for s in streams:  # find 300 - dsl1000mbit
            if s['quality'] == '300' and s['media_type'] == 'wstreaming':
                stream_ = s
                break
        for s in streams:  # find veryhigh - dsl2000mbit
            if s['quality'] == 'veryhigh' and s['media_type'] == 'wstreaming': # 'hstreaming' - rtsp is not working
                stream_ = s
                break
        if stream_ is None:
            raise ExtractorError(u'No stream found.')

        media_link = self._download_webpage(stream_['video_url'], video_id,'Get stream URL')

        self.report_extraction(video_id)
        mobj = re.search(self._TITLE, html)
        if mobj is None:
            raise ExtractorError(u'Cannot extract title')
        title = unescapeHTML(mobj.group('title'))

        # The ASX/metafile points either at an mms:// or an rtsp:// URL.
        mobj = re.search(self._MMS_STREAM, media_link)
        if mobj is None:
            mobj = re.search(self._RTSP_STREAM, media_link)
            if mobj is None:
                raise ExtractorError(u'Cannot extract mms:// or rtsp:// URL')
        mms_url = mobj.group('video_url')

        mobj = re.search('(.*)[.](?P<ext>[^.]+)', mms_url)
        if mobj is None:
            raise ExtractorError(u'Cannot extract extention')
        ext = mobj.group('ext')

        return [{'id': video_id,
                 'url': mms_url,
                 'title': title,
                 'ext': ext
                 }]
class TumblrIE(InfoExtractor):
    """Information extractor for tumblr.com video posts."""
    _VALID_URL = r'http://(?P<blog_name>.*?)\.tumblr\.com/((post)|(video))/(?P<id>\d*)/(.*?)'

    def _real_extract(self, url):
        m_url = re.match(self._VALID_URL, url)
        video_id = m_url.group('id')
        blog = m_url.group('blog_name')

        # Normalize to the canonical post URL before downloading.
        url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id)
        webpage = self._download_webpage(url, video_id)

        # The player markup is embedded in escaped JS (\x22 == '"').
        re_video = r'src=\\x22(?P<video_url>http://%s\.tumblr\.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P<ext>.*?)\\x22' % (blog, video_id)
        video = re.search(re_video, webpage)
        if video is None:
            raise ExtractorError(u'Unable to extract video')
        video_url = video.group('video_url')
        ext = video.group('ext')

        video_thumbnail = self._search_regex(r'posters(.*?)\[\\x22(?P<thumb>.*?)\\x22',
            webpage, u'thumbnail', fatal=False)  # We pick the first poster
        if video_thumbnail: video_thumbnail = video_thumbnail.replace('\\', '')

        # The only place where you can get a title, it's not complete,
        # but searching in other places doesn't work for all videos
        video_title = self._html_search_regex(r'<title>(?P<title>.*?)</title>',
            webpage, u'title', flags=re.DOTALL)

        return [{'id': video_id,
                 'url': video_url,
                 'title': video_title,
                 'thumbnail': video_thumbnail,
                 'ext': ext
                 }]
class BandcampIE(InfoExtractor):
    """Information extractor for free bandcamp.com tracks."""
    _VALID_URL = r'http://.*?\.bandcamp\.com/track/(?P<title>.*)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        title = mobj.group('title')
        webpage = self._download_webpage(url, title)
        # We get the link to the free download page
        m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage)
        if m_download is None:
            raise ExtractorError(u'No free songs found')

        download_link = m_download.group(1)
        id = re.search(r'var TralbumData = {(.*?)id: (?P<id>\d*?)$',
                       webpage, re.MULTILINE|re.DOTALL).group('id')

        download_webpage = self._download_webpage(download_link, id,
                                                  'Downloading free downloads page')
        # We get the dictionary of the track from some javascrip code
        info = re.search(r'items: (.*?),$',
                         download_webpage, re.MULTILINE).group(1)
        info = json.loads(info)[0]
        # We pick mp3-320 for now, until format selection can be easily implemented.
        mp3_info = info[u'downloads'][u'mp3-320']
        # If we try to use this url it says the link has expired
        initial_url = mp3_info[u'url']
        re_url = r'(?P<server>http://(.*?)\.bandcamp\.com)/download/track\?enc=mp3-320&fsig=(?P<fsig>.*?)&id=(?P<id>.*?)&ts=(?P<ts>.*)$'
        m_url = re.match(re_url, initial_url)
        #We build the url we will use to get the final track url
        # This url is build in Bandcamp in the script download_bunde_*.js
        request_url = '%s/statdownload/track?enc=mp3-320&fsig=%s&id=%s&ts=%s&.rand=665028774616&.vrs=1' % (m_url.group('server'), m_url.group('fsig'), id, m_url.group('ts'))
        final_url_webpage = self._download_webpage(request_url, id, 'Requesting download url')
        # If we could correctly generate the .rand field the url would be
        #in the "download_url" key
        final_url = re.search(r'"retry_url":"(.*?)"', final_url_webpage).group(1)

        track_info = {'id':id,
                      'title' : info[u'title'],
                      'ext' : 'mp3',
                      'url' : final_url,
                      'thumbnail' : info[u'thumb_url'],
                      'uploader' : info[u'artist']
                      }

        return [track_info]
class RedTubeIE(InfoExtractor):
    """Information Extractor for redtube"""
    _VALID_URL = r'(?:http://)?(?:www\.)?redtube\.com/(?P<id>[0-9]+)'

    def _real_extract(self,url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')
        video_extension = 'mp4'
        webpage = self._download_webpage(url, video_id)

        self.report_extraction(video_id)

        # The mp4 source is listed directly in the HTML5 player markup.
        video_url = self._html_search_regex(r'<source src="(.+?)" type="video/mp4">',
            webpage, u'video URL')

        video_title = self._html_search_regex('<h1 class="videoTitle slidePanelMovable">(.+?)</h1>',
            webpage, u'title')

        return [{
            'id': video_id,
            'url': video_url,
            'ext': video_extension,
            'title': video_title,
        }]
class InaIE(InfoExtractor):
    """Information Extractor for Ina.fr"""
    _VALID_URL = r'(?:http://)?(?:www\.)?ina\.fr/video/(?P<id>I[0-9]+)/.*'

    def _real_extract(self,url):
        mobj = re.match(self._VALID_URL, url)

        video_id = mobj.group('id')
        # Metadata comes from the player's MRSS feed, not the HTML page.
        mrss_url='http://player.ina.fr/notices/%s.mrss' % video_id
        video_extension = 'mp4'
        webpage = self._download_webpage(mrss_url, video_id)

        self.report_extraction(video_id)

        video_url = self._html_search_regex(r'<media:player url="(?P<mp4url>http://mp4.ina.fr/[^"]+\.mp4)',
            webpage, u'video URL')

        video_title = self._search_regex(r'<title><!\[CDATA\[(?P<titre>.*?)]]></title>',
            webpage, u'title')

        return [{
            'id': video_id,
            'url': video_url,
            'ext': video_extension,
            'title': video_title,
        }]
class HowcastIE(InfoExtractor):
    """Information Extractor for Howcast.com"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?howcast\.com/videos/(?P<id>\d+)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)

        video_id = mobj.group('id')
        # Normalize to the canonical video page URL.
        webpage_url = 'http://www.howcast.com/videos/' + video_id
        webpage = self._download_webpage(webpage_url, video_id)

        self.report_extraction(video_id)

        video_url = self._search_regex(r'\'?file\'?: "(http://mobile-media\.howcast\.com/[0-9]+\.mp4)',
            webpage, u'video URL')

        # Howcast's meta tags put content= before the property= attribute.
        video_title = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') property=\'og:title\'',
            webpage, u'title')

        video_description = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') name=\'description\'',
            webpage, u'description', fatal=False)

        thumbnail = self._html_search_regex(r'<meta content=\'(.+?)\' property=\'og:image\'',
            webpage, u'thumbnail', fatal=False)

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': video_title,
            'description': video_description,
            'thumbnail': thumbnail,
        }]
class VineIE(InfoExtractor):
    """Information Extractor for Vine.co"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?vine\.co/v/(?P<id>\w+)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)

        video_id = mobj.group('id')
        # Normalize to the canonical https URL.
        webpage_url = 'https://vine.co/v/' + video_id
        webpage = self._download_webpage(webpage_url, video_id)

        self.report_extraction(video_id)

        video_url = self._html_search_regex(r'<meta property="twitter:player:stream" content="(.+?)"',
            webpage, u'video URL')

        video_title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',
            webpage, u'title')

        thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)(\?.*?)?"',
            webpage, u'thumbnail', fatal=False)

        uploader = self._html_search_regex(r'<div class="user">.*?<h2>(.+?)</h2>',
            webpage, u'uploader', fatal=False, flags=re.DOTALL)

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': video_title,
            'thumbnail': thumbnail,
            'uploader': uploader,
        }]
class FlickrIE(InfoExtractor):
    """Information Extractor for Flickr videos"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?flickr\.com/photos/(?P<uploader_id>[\w\-_@]+)/(?P<id>\d+).*'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)

        video_id = mobj.group('id')
        video_uploader_id = mobj.group('uploader_id')
        webpage_url = 'http://www.flickr.com/photos/' + video_uploader_id + '/' + video_id
        webpage = self._download_webpage(webpage_url, video_id)

        # The photo secret gates access to the two XML playlist endpoints below.
        secret = self._search_regex(r"photo_secret: '(\w+)'", webpage, u'secret')

        first_url = 'https://secure.flickr.com/apps/video/video_mtl_xml.gne?v=x&photo_id=' + video_id + '&secret=' + secret + '&bitrate=700&target=_self'
        first_xml = self._download_webpage(first_url, video_id, 'Downloading first data webpage')

        node_id = self._html_search_regex(r'<Item id="id">(\d+-\d+)</Item>',
            first_xml, u'node_id')

        second_url = 'https://secure.flickr.com/video_playlist.gne?node_id=' + node_id + '&tech=flash&mode=playlist&bitrate=700&secret=' + secret + '&rd=video.yahoo.com&noad=1'
        second_xml = self._download_webpage(second_url, video_id, 'Downloading second data webpage')

        self.report_extraction(video_id)

        # Final media URL = streaming app prefix + unescaped full path.
        mobj = re.search(r'<STREAM APP="(.+?)" FULLPATH="(.+?)"', second_xml)
        if mobj is None:
            raise ExtractorError(u'Unable to extract video url')
        video_url = mobj.group(1) + unescapeHTML(mobj.group(2))

        video_title = self._html_search_regex(r'<meta property="og:title" content=(?:"([^"]+)"|\'([^\']+)\')',
            webpage, u'video title')

        video_description = self._html_search_regex(r'<meta property="og:description" content=(?:"([^"]+)"|\'([^\']+)\')',
            webpage, u'description', fatal=False)

        thumbnail = self._html_search_regex(r'<meta property="og:image" content=(?:"([^"]+)"|\'([^\']+)\')',
            webpage, u'thumbnail', fatal=False)

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': video_title,
            'description': video_description,
            'thumbnail': thumbnail,
            'uploader_id': video_uploader_id,
        }]
class TeamcocoIE(InfoExtractor):
    """Information extractor for teamcoco.com videos."""
    _VALID_URL = r'http://teamcoco\.com/video/(?P<url_title>.*)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        url_title = mobj.group('url_title')
        webpage = self._download_webpage(url, url_title)

        # The numeric id is only available inside the page markup.
        video_id = self._html_search_regex(r'<article class="video" data-id="(\d+?)"',
            webpage, u'video id')

        self.report_extraction(video_id)

        video_title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',
            webpage, u'title')

        thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)"',
            webpage, u'thumbnail', fatal=False)

        video_description = self._html_search_regex(r'<meta property="og:description" content="(.*?)"',
            webpage, u'description', fatal=False)

        # A second request to the CVP XML endpoint yields the media URL.
        data_url = 'http://teamcoco.com/cvp/2.0/%s.xml' % video_id
        data = self._download_webpage(data_url, video_id, 'Downloading data webpage')

        video_url = self._html_search_regex(r'<file type="high".*?>(.*?)</file>',
            data, u'video URL')

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': video_title,
            'thumbnail': thumbnail,
            'description': video_description,
        }]
class XHamsterIE(InfoExtractor):
    """Information Extractor for xHamster"""
    _VALID_URL = r'(?:http://)?(?:www.)?xhamster\.com/movies/(?P<id>[0-9]+)/.*\.html'

    def _real_extract(self, url):
        """Extract the direct media URL plus title/date/uploader for an xHamster movie."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')
        mrss_url = 'http://xhamster.com/movies/%s/.html' % video_id
        webpage = self._download_webpage(mrss_url, video_id)

        # The player config embeds 'srv' (server) and 'file'; an empty server
        # means 'file' is already a complete (percent-encoded) URL.
        mobj = re.search(r'\'srv\': \'(?P<server>[^\']*)\',\s*\'file\': \'(?P<file>[^\']+)\',', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract media URL')
        if len(mobj.group('server')) == 0:
            video_url = compat_urllib_parse.unquote(mobj.group('file'))
        else:
            video_url = mobj.group('server')+'/key='+mobj.group('file')
        video_extension = video_url.split('.')[-1]

        video_title = self._html_search_regex(r'<title>(?P<title>.+?) - xHamster\.com</title>',
            webpage, u'title')

        # Can't see the description anywhere in the UI
        # video_description = self._html_search_regex(r'<span>Description: </span>(?P<description>[^<]+)',
        #     webpage, u'description', fatal=False)
        # if video_description: video_description = unescapeHTML(video_description)

        # Upload date is exposed only as a tooltip hint; it is optional.
        mobj = re.search(r'hint=\'(?P<upload_date_Y>[0-9]{4})-(?P<upload_date_m>[0-9]{2})-(?P<upload_date_d>[0-9]{2}) [0-9]{2}:[0-9]{2}:[0-9]{2} [A-Z]{3,4}\'', webpage)
        if mobj:
            video_upload_date = mobj.group('upload_date_Y')+mobj.group('upload_date_m')+mobj.group('upload_date_d')
        else:
            video_upload_date = None
            self._downloader.report_warning(u'Unable to extract upload date')

        video_uploader_id = self._html_search_regex(r'<a href=\'/user/[^>]+>(?P<uploader_id>[^<]+)',
            webpage, u'uploader id', default=u'anonymous')

        video_thumbnail = self._search_regex(r'\'image\':\'(?P<thumbnail>[^\']+)\'',
            webpage, u'thumbnail', fatal=False)

        return [{
            'id':          video_id,
            'url':         video_url,
            'ext':         video_extension,
            'title':       video_title,
            # 'description': video_description,
            'upload_date': video_upload_date,
            'uploader_id': video_uploader_id,
            'thumbnail':   video_thumbnail
        }]
class HypemIE(InfoExtractor):
    """Information Extractor for hypem"""
    _VALID_URL = r'(?:http://)?(?:www\.)?hypem\.com/track/([^/]+)/([^/]+)'

    def _real_extract(self, url):
        """Extract the MP3 URL for a Hype Machine track.

        The track page embeds a JSON playlist; its first entry provides the
        real track id and the 'key' needed by the /serve/source endpoint,
        which must be called with the session cookie from the first request.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        track_id = mobj.group(1)

        # Cache-busting timestamp parameter.
        data = {'ax': 1, 'ts': time.time()}
        data_encoded = compat_urllib_parse.urlencode(data)
        complete_url = url + "?" + data_encoded
        request = compat_urllib_request.Request(complete_url)
        response, urlh = self._download_webpage_handle(request, track_id, u'Downloading webpage with the url')
        # The serve endpoint below authenticates via this cookie.
        cookie = urlh.headers.get('Set-Cookie', '')

        self.report_extraction(track_id)

        html_tracks = self._html_search_regex(r'<script type="application/json" id="displayList-data">(.*?)</script>',
            response, u'tracks', flags=re.MULTILINE|re.DOTALL).strip()
        try:
            track_list = json.loads(html_tracks)
            track = track_list[u'tracks'][0]
        except ValueError:
            raise ExtractorError(u'Hypemachine contained invalid JSON.')

        key = track[u"key"]
        track_id = track[u"id"]
        artist = track[u"artist"]
        title = track[u"song"]

        serve_url = "http://hypem.com/serve/source/%s/%s" % (compat_str(track_id), compat_str(key))
        request = compat_urllib_request.Request(serve_url, "", {'Content-Type': 'application/json'})
        request.add_header('cookie', cookie)
        song_data_json = self._download_webpage(request, track_id, u'Downloading metadata')
        try:
            song_data = json.loads(song_data_json)
        except ValueError:
            raise ExtractorError(u'Hypemachine contained invalid JSON.')
        final_url = song_data[u"url"]

        return [{
            'id':     track_id,
            'url':    final_url,
            'ext':    "mp3",
            'title':  title,
            'artist': artist,
        }]
class Vbox7IE(InfoExtractor):
    """Information Extractor for Vbox7"""
    _VALID_URL = r'(?:http://)?(?:www\.)?vbox7\.com/play:([^/]+)'

    def _real_extract(self, url):
        """Follow Vbox7's JS redirect, then query the play info endpoint for the FLV URL."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group(1)

        # The play page immediately redirects via window.location; follow it
        # manually since it is done in JavaScript, not HTTP.
        redirect_page, urlh = self._download_webpage_handle(url, video_id)
        new_location = self._search_regex(r'window\.location = \'(.*)\';', redirect_page, u'redirect location')
        redirect_url = urlh.geturl() + new_location
        webpage = self._download_webpage(redirect_url, video_id, u'Downloading redirect page')

        # Page title is "<video title>/..."; keep only the first segment.
        title = self._html_search_regex(r'<title>(.*)</title>',
            webpage, u'title').split('/')[0].strip()

        ext = "flv"
        info_url = "http://vbox7.com/play/magare.do"
        data = compat_urllib_parse.urlencode({'as3': '1', 'vid': video_id})
        info_request = compat_urllib_request.Request(info_url, data)
        info_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
        info_response = self._download_webpage(info_request, video_id, u'Downloading info webpage')
        if info_response is None:
            raise ExtractorError(u'Unable to extract the media url')
        # Response is an urlencoded pair "key=<media url>&key=<thumb url>".
        (final_url, thumbnail_url) = map(lambda x: x.split('=')[1], info_response.split('&'))

        return [{
            'id':        video_id,
            'url':       final_url,
            'ext':       ext,
            'title':     title,
            'thumbnail': thumbnail_url,
        }]
class GametrailersIE(InfoExtractor):
    """Information extractor for gametrailers.com (videos, reviews, full episodes)."""
    _VALID_URL = r'http://www.gametrailers.com/(?P<type>videos|reviews|full-episodes)/(?P<id>.*?)/(?P<title>.*)'

    def _real_extract(self, url):
        """Resolve a Gametrailers page to its best-quality stream via the MTV feeds."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('id')
        video_type = mobj.group('type')
        webpage = self._download_webpage(url, video_id)
        # Full episodes embed the mgid differently from regular videos/reviews.
        if video_type == 'full-episodes':
            mgid_re = r'data-video="(?P<mgid>mgid:.*?)"'
        else:
            mgid_re = r'data-contentId=\'(?P<mgid>mgid:.*?)\''
        mgid = self._search_regex(mgid_re, webpage, u'mgid')
        data = compat_urllib_parse.urlencode({'uri': mgid, 'acceptMethods': 'fms'})

        info_page = self._download_webpage('http://www.gametrailers.com/feeds/mrss?' + data,
                                           video_id, u'Downloading video info')
        links_webpage = self._download_webpage('http://www.gametrailers.com/feeds/mediagen/?' + data,
                                               video_id, u'Downloading video urls info')

        self.report_extraction(video_id)
        # VERBOSE pattern: layout whitespace in the pattern is ignored.
        info_re = r'''<title><!\[CDATA\[(?P<title>.*?)\]\]></title>.*
                      <description><!\[CDATA\[(?P<description>.*?)\]\]></description>.*
                      <image>.*
                        <url>(?P<thumb>.*?)</url>.*
                      </image>'''

        m_info = re.search(info_re, info_page, re.VERBOSE|re.DOTALL)
        if m_info is None:
            raise ExtractorError(u'Unable to extract video info')
        video_title = m_info.group('title')
        video_description = m_info.group('description')
        video_thumb = m_info.group('thumb')

        m_urls = list(re.finditer(r'<src>(?P<url>.*)</src>', links_webpage))
        # BUGFIX: was `raise ExtractError(u'Unable to extrat video url')` —
        # a NameError (no such class) plus a typo in the message.
        if not m_urls:
            raise ExtractorError(u'Unable to extract video url')
        # They are sorted from worst to best quality
        video_url = m_urls[-1].group('url')

        return {'url':         video_url,
                'id':          video_id,
                'title':       video_title,
                # Videos are actually flv not mp4
                'ext':         'flv',
                'thumbnail':   video_thumb,
                'description': video_description,
                }
# Factory for the ordered list of extractor instances. youtube-dl tries each
# extractor in turn against an input URL, so ordering encodes priority.
# NOTE(review): this excerpt shows only a few of the list entries; the full
# extractor list (and the closing of the docstring/list) is not visible here.
def gen_extractors():
    """ Return a list of an instance of every supported extractor.
    The order does matter; the first extractor matched is the one handling the URL.
    YoutubePlaylistIE(),
    StanfordOpenClassroomIE(),
    WorldStarHipHopIE(),
def get_info_extractor(ie_name):
    """Returns the info extractor class with the given ie_name"""
    # Extractor classes follow the "<Name>IE" naming convention; resolve the
    # class object from this module's namespace.
    class_name = '%sIE' % ie_name
    return globals()[class_name]