-#!/usr/bin/env python
-# -*- coding: utf-8 -*-
-
-from __future__ import absolute_import
-
import base64
import datetime
import itertools
from .extractor.ard import ARDIE
from .extractor.arte import ArteTvIE
+from .extractor.bliptv import BlipTVIE, BlipTVUserIE
+from .extractor.comedycentral import ComedyCentralIE
from .extractor.dailymotion import DailymotionIE
from .extractor.gametrailers import GametrailersIE
+from .extractor.generic import GenericIE
+from .extractor.googleplus import GooglePlusIE
+from .extractor.googlesearch import GoogleSearchIE
from .extractor.metacafe import MetacafeIE
+from .extractor.myvideo import MyVideoIE
from .extractor.statigram import StatigramIE
from .extractor.photobucket import PhotobucketIE
from .extractor.vimeo import VimeoIE
-from .extractor.yahoo import YahooIE
+from .extractor.yahoo import YahooIE, YahooSearchIE
from .extractor.youtube import YoutubeIE, YoutubePlaylistIE, YoutubeSearchIE, YoutubeUserIE, YoutubeChannelIE
from .extractor.zdf import ZDFIE
-class GenericIE(InfoExtractor):
- """Generic last-resort information extractor."""
- _VALID_URL = r'.*'
- IE_NAME = u'generic'
- def report_download_webpage(self, video_id):
- """Report webpage download."""
- if not self._downloader.params.get('test', False):
- self._downloader.report_warning(u'Falling back on generic information extractor.')
- super(GenericIE, self).report_download_webpage(video_id)
- def report_following_redirect(self, new_url):
- """Report information extraction."""
- self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)
-
- def _test_redirect(self, url):
- """Check if it is a redirect, like url shorteners, in case return the new url."""
- class HeadRequest(compat_urllib_request.Request):
- def get_method(self):
- return "HEAD"
-
- class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
- """
- Subclass the HTTPRedirectHandler to make it use our
- HeadRequest also on the redirected URL
- """
- def redirect_request(self, req, fp, code, msg, headers, newurl):
- if code in (301, 302, 303, 307):
- newurl = newurl.replace(' ', '%20')
- newheaders = dict((k,v) for k,v in req.headers.items()
- if k.lower() not in ("content-length", "content-type"))
- return HeadRequest(newurl,
- headers=newheaders,
- origin_req_host=req.get_origin_req_host(),
- unverifiable=True)
- else:
- raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)
-
- class HTTPMethodFallback(compat_urllib_request.BaseHandler):
- """
- Fallback to GET if HEAD is not allowed (405 HTTP error)
- """
- def http_error_405(self, req, fp, code, msg, headers):
- fp.read()
- fp.close()
-
- newheaders = dict((k,v) for k,v in req.headers.items()
- if k.lower() not in ("content-length", "content-type"))
- return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
- headers=newheaders,
- origin_req_host=req.get_origin_req_host(),
- unverifiable=True))
-
- # Build our opener
- opener = compat_urllib_request.OpenerDirector()
- for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
- HTTPMethodFallback, HEADRedirectHandler,
- compat_urllib_request.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
- opener.add_handler(handler())
-
- response = opener.open(HeadRequest(url))
- if response is None:
- raise ExtractorError(u'Invalid URL protocol')
- new_url = response.geturl()
-
- if url == new_url:
- return False
-
- self.report_following_redirect(new_url)
- return new_url
- def _real_extract(self, url):
- new_url = self._test_redirect(url)
- if new_url: return [self.url_result(new_url)]
- video_id = url.split('/')[-1]
- try:
- webpage = self._download_webpage(url, video_id)
- except ValueError as err:
- # since this is the last-resort InfoExtractor, if
- # this error is thrown, it'll be thrown here
- raise ExtractorError(u'Invalid URL: %s' % url)
-
- self.report_extraction(video_id)
- # Start with something easy: JW Player in SWFObject
- mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
- if mobj is None:
- # Broaden the search a little bit
- mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
- if mobj is None:
- # Broaden the search a little bit: JWPlayer JS loader
- mobj = re.search(r'[^A-Za-z0-9]?file:\s*["\'](http[^\'"&]*)', webpage)
- if mobj is None:
- # Try to find twitter cards info
- mobj = re.search(r'<meta (?:property|name)="twitter:player:stream" (?:content|value)="(.+?)"', webpage)
- if mobj is None:
- # We look for Open Graph info:
- # We have to match any number spaces between elements, some sites try to align them (eg.: statigr.am)
- m_video_type = re.search(r'<meta.*?property="og:video:type".*?content="video/(.*?)"', webpage)
- # We only look in og:video if the MIME type is a video, don't try if it's a Flash player:
- if m_video_type is not None:
- mobj = re.search(r'<meta.*?property="og:video".*?content="(.*?)"', webpage)
- if mobj is None:
- raise ExtractorError(u'Invalid URL: %s' % url)
-
- # It's possible that one of the regexes
- # matched, but returned an empty group:
- if mobj.group(1) is None:
- raise ExtractorError(u'Invalid URL: %s' % url)
-
- video_url = compat_urllib_parse.unquote(mobj.group(1))
- video_id = os.path.basename(video_url)
-
- # here's a fun little line of code for you:
- video_extension = os.path.splitext(video_id)[1][1:]
- video_id = os.path.splitext(video_id)[0]
-
- # it's tempting to parse this further, but you would
- # have to take into account all the variations like
- # Video Title - Site Name
- # Site Name | Video Title
- # Video Title - Tagline | Site Name
- # and so on and so forth; it's just not practical
- video_title = self._html_search_regex(r'<title>(.*)</title>',
- webpage, u'video title')
-
- # video uploader is domain name
- video_uploader = self._search_regex(r'(?:https?://)?([^/]*)/.*',
- url, u'video uploader')
-
- return [{
- 'id': video_id,
- 'url': video_url,
- 'uploader': video_uploader,
- 'upload_date': None,
- 'title': video_title,
- 'ext': video_extension,
- }]
-
-
-
-class GoogleSearchIE(SearchInfoExtractor):
- """Information Extractor for Google Video search queries."""
- _MORE_PAGES_INDICATOR = r'id="pnnext" class="pn"'
- _MAX_RESULTS = 1000
- IE_NAME = u'video.google:search'
- _SEARCH_KEY = 'gvsearch'
-
- def _get_n_results(self, query, n):
- """Get a specified number of results for a query"""
-
- res = {
- '_type': 'playlist',
- 'id': query,
- 'entries': []
- }
-
- for pagenum in itertools.count(1):
- result_url = u'http://www.google.com/search?tbm=vid&q=%s&start=%s&hl=en' % (compat_urllib_parse.quote_plus(query), pagenum*10)
- webpage = self._download_webpage(result_url, u'gvsearch:' + query,
- note='Downloading result page ' + str(pagenum))
-
- for mobj in re.finditer(r'<h3 class="r"><a href="([^"]+)"', webpage):
- e = {
- '_type': 'url',
- 'url': mobj.group(1)
- }
- res['entries'].append(e)
-
- if (pagenum * 10 > n) or not re.search(self._MORE_PAGES_INDICATOR, webpage):
- return res
-
-class YahooSearchIE(SearchInfoExtractor):
- """Information Extractor for Yahoo! Video search queries."""
-
- _MAX_RESULTS = 1000
- IE_NAME = u'screen.yahoo:search'
- _SEARCH_KEY = 'yvsearch'
-
- def _get_n_results(self, query, n):
- """Get a specified number of results for a query"""
-
- res = {
- '_type': 'playlist',
- 'id': query,
- 'entries': []
- }
- for pagenum in itertools.count(0):
- result_url = u'http://video.search.yahoo.com/search/?p=%s&fr=screen&o=js&gs=0&b=%d' % (compat_urllib_parse.quote_plus(query), pagenum * 30)
- webpage = self._download_webpage(result_url, query,
- note='Downloading results page '+str(pagenum+1))
- info = json.loads(webpage)
- m = info[u'm']
- results = info[u'results']
-
- for (i, r) in enumerate(results):
- if (pagenum * 30) +i >= n:
- break
- mobj = re.search(r'(?P<url>screen\.yahoo\.com/.*?-\d*?\.html)"', r)
- e = self.url_result('http://' + mobj.group('url'), 'Yahoo')
- res['entries'].append(e)
- if (pagenum * 30 +i >= n) or (m[u'last'] >= (m[u'total'] -1 )):
- break
-
- return res
-class BlipTVUserIE(InfoExtractor):
- """Information Extractor for blip.tv users."""
-
- _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
- _PAGE_SIZE = 12
- IE_NAME = u'blip.tv:user'
-
- def _real_extract(self, url):
- # Extract username
- mobj = re.match(self._VALID_URL, url)
- if mobj is None:
- raise ExtractorError(u'Invalid URL: %s' % url)
-
- username = mobj.group(1)
-
- page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'
-
- page = self._download_webpage(url, username, u'Downloading user page')
- mobj = re.search(r'data-users-id="([^"]+)"', page)
- page_base = page_base % mobj.group(1)
-
-
- # Download video ids using BlipTV Ajax calls. Result size per
- # query is limited (currently to 12 videos) so we need to query
- # page by page until there are no video ids - it means we got
- # all of them.
-
- video_ids = []
- pagenum = 1
-
- while True:
- url = page_base + "&page=" + str(pagenum)
- page = self._download_webpage(url, username,
- u'Downloading video ids from page %d' % pagenum)
-
- # Extract video identifiers
- ids_in_page = []
-
- for mobj in re.finditer(r'href="/([^"]+)"', page):
- if mobj.group(1) not in ids_in_page:
- ids_in_page.append(unescapeHTML(mobj.group(1)))
-
- video_ids.extend(ids_in_page)
-
- # A little optimization - if current page is not
- # "full", ie. does not contain PAGE_SIZE video ids then
- # we can assume that this page is the last one - there
- # are no more ids on further pages - no need to query
- # again.
-
- if len(ids_in_page) < self._PAGE_SIZE:
- break
-
- pagenum += 1
-
- urls = [u'http://blip.tv/%s' % video_id for video_id in video_ids]
- url_entries = [self.url_result(url, 'BlipTV') for url in urls]
- return [self.playlist_result(url_entries, playlist_title = username)]
-
class DepositFilesIE(InfoExtractor):
"""Information extractor for depositfiles.com"""
return [info]
-class BlipTVIE(InfoExtractor):
- """Information extractor for blip.tv"""
-
- _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv/((.+/)|(play/)|(api\.swf#))(.+)$'
- _URL_EXT = r'^.*\.([a-z0-9]+)$'
- IE_NAME = u'blip.tv'
-
- def report_direct_download(self, title):
- """Report information extraction."""
- self.to_screen(u'%s: Direct download detected' % title)
-
- def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- if mobj is None:
- raise ExtractorError(u'Invalid URL: %s' % url)
-
- # See https://github.com/rg3/youtube-dl/issues/857
- api_mobj = re.match(r'http://a\.blip\.tv/api\.swf#(?P<video_id>[\d\w]+)', url)
- if api_mobj is not None:
- url = 'http://blip.tv/play/g_%s' % api_mobj.group('video_id')
- urlp = compat_urllib_parse_urlparse(url)
- if urlp.path.startswith('/play/'):
- request = compat_urllib_request.Request(url)
- response = compat_urllib_request.urlopen(request)
- redirecturl = response.geturl()
- rurlp = compat_urllib_parse_urlparse(redirecturl)
- file_id = compat_parse_qs(rurlp.fragment)['file'][0].rpartition('/')[2]
- url = 'http://blip.tv/a/a-' + file_id
- return self._real_extract(url)
-
-
- if '?' in url:
- cchar = '&'
- else:
- cchar = '?'
- json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
- request = compat_urllib_request.Request(json_url)
- request.add_header('User-Agent', 'iTunes/10.6.1')
- self.report_extraction(mobj.group(1))
- info = None
- try:
- urlh = compat_urllib_request.urlopen(request)
- if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
- basename = url.split('/')[-1]
- title,ext = os.path.splitext(basename)
- title = title.decode('UTF-8')
- ext = ext.replace('.', '')
- self.report_direct_download(title)
- info = {
- 'id': title,
- 'url': url,
- 'uploader': None,
- 'upload_date': None,
- 'title': title,
- 'ext': ext,
- 'urlhandle': urlh
- }
- except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
- raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
- if info is None: # Regular URL
- try:
- json_code_bytes = urlh.read()
- json_code = json_code_bytes.decode('utf-8')
- except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
- raise ExtractorError(u'Unable to read video info webpage: %s' % compat_str(err))
-
- try:
- json_data = json.loads(json_code)
- if 'Post' in json_data:
- data = json_data['Post']
- else:
- data = json_data
-
- upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
- video_url = data['media']['url']
- umobj = re.match(self._URL_EXT, video_url)
- if umobj is None:
- raise ValueError('Can not determine filename extension')
- ext = umobj.group(1)
-
- info = {
- 'id': data['item_id'],
- 'url': video_url,
- 'uploader': data['display_name'],
- 'upload_date': upload_date,
- 'title': data['title'],
- 'ext': ext,
- 'format': data['media']['mimeType'],
- 'thumbnail': data['thumbnailUrl'],
- 'description': data['description'],
- 'player_url': data['embedUrl'],
- 'user_agent': 'iTunes/10.6.1',
- }
- except (ValueError,KeyError) as err:
- raise ExtractorError(u'Unable to parse video information: %s' % repr(err))
-
- return [info]
-
-
-class MyVideoIE(InfoExtractor):
- """Information Extractor for myvideo.de."""
-
- _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
- IE_NAME = u'myvideo'
-
- # Original Code from: https://github.com/dersphere/plugin.video.myvideo_de.git
- # Released into the Public Domain by Tristan Fischer on 2013-05-19
- # https://github.com/rg3/youtube-dl/pull/842
- def __rc4crypt(self,data, key):
- x = 0
- box = list(range(256))
- for i in list(range(256)):
- x = (x + box[i] + compat_ord(key[i % len(key)])) % 256
- box[i], box[x] = box[x], box[i]
- x = 0
- y = 0
- out = ''
- for char in data:
- x = (x + 1) % 256
- y = (y + box[x]) % 256
- box[x], box[y] = box[y], box[x]
- out += chr(compat_ord(char) ^ box[(box[x] + box[y]) % 256])
- return out
-
- def __md5(self,s):
- return hashlib.md5(s).hexdigest().encode()
- def _real_extract(self,url):
- mobj = re.match(self._VALID_URL, url)
- if mobj is None:
- raise ExtractorError(u'invalid URL: %s' % url)
- video_id = mobj.group(1)
- GK = (
- b'WXpnME1EZGhNRGhpTTJNM01XVmhOREU0WldNNVpHTTJOakpt'
- b'TW1FMU5tVTBNR05pWkRaa05XRXhNVFJoWVRVd1ptSXhaVEV3'
- b'TnpsbA0KTVRkbU1tSTRNdz09'
- )
-
- # Get video webpage
- webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
- webpage = self._download_webpage(webpage_url, video_id)
-
- mobj = re.search('source src=\'(.+?)[.]([^.]+)\'', webpage)
- if mobj is not None:
- self.report_extraction(video_id)
- video_url = mobj.group(1) + '.flv'
-
- video_title = self._html_search_regex('<title>([^<]+)</title>',
- webpage, u'title')
-
- video_ext = self._search_regex('[.](.+?)$', video_url, u'extension')
-
- return [{
- 'id': video_id,
- 'url': video_url,
- 'uploader': None,
- 'upload_date': None,
- 'title': video_title,
- 'ext': u'flv',
- }]
-
- # try encxml
- mobj = re.search('var flashvars={(.+?)}', webpage)
- if mobj is None:
- raise ExtractorError(u'Unable to extract video')
-
- params = {}
- encxml = ''
- sec = mobj.group(1)
- for (a, b) in re.findall('(.+?):\'(.+?)\',?', sec):
- if not a == '_encxml':
- params[a] = b
- else:
- encxml = compat_urllib_parse.unquote(b)
- if not params.get('domain'):
- params['domain'] = 'www.myvideo.de'
- xmldata_url = '%s?%s' % (encxml, compat_urllib_parse.urlencode(params))
- if 'flash_playertype=MTV' in xmldata_url:
- self._downloader.report_warning(u'avoiding MTV player')
- xmldata_url = (
- 'http://www.myvideo.de/dynamic/get_player_video_xml.php'
- '?flash_playertype=D&ID=%s&_countlimit=4&autorun=yes'
- ) % video_id
-
- # get enc data
- enc_data = self._download_webpage(xmldata_url, video_id).split('=')[1]
- enc_data_b = binascii.unhexlify(enc_data)
- sk = self.__md5(
- base64.b64decode(base64.b64decode(GK)) +
- self.__md5(
- str(video_id).encode('utf-8')
- )
- )
- dec_data = self.__rc4crypt(enc_data_b, sk)
-
- # extracting infos
- self.report_extraction(video_id)
-
- video_url = None
- mobj = re.search('connectionurl=\'(.*?)\'', dec_data)
- if mobj:
- video_url = compat_urllib_parse.unquote(mobj.group(1))
- if 'myvideo2flash' in video_url:
- self._downloader.report_warning(u'forcing RTMPT ...')
- video_url = video_url.replace('rtmpe://', 'rtmpt://')
-
- if not video_url:
- # extract non rtmp videos
- mobj = re.search('path=\'(http.*?)\' source=\'(.*?)\'', dec_data)
- if mobj is None:
- raise ExtractorError(u'unable to extract url')
- video_url = compat_urllib_parse.unquote(mobj.group(1)) + compat_urllib_parse.unquote(mobj.group(2))
-
- video_file = self._search_regex('source=\'(.*?)\'', dec_data, u'video file')
- video_file = compat_urllib_parse.unquote(video_file)
-
- if not video_file.endswith('f4m'):
- ppath, prefix = video_file.split('.')
- video_playpath = '%s:%s' % (prefix, ppath)
- video_hls_playlist = ''
- else:
- video_playpath = ''
- video_hls_playlist = (
- video_filepath + video_file
- ).replace('.f4m', '.m3u8')
-
- video_swfobj = self._search_regex('swfobject.embedSWF\(\'(.+?)\'', webpage, u'swfobj')
- video_swfobj = compat_urllib_parse.unquote(video_swfobj)
-
- video_title = self._html_search_regex("<h1(?: class='globalHd')?>(.*?)</h1>",
- webpage, u'title')
-
- return [{
- 'id': video_id,
- 'url': video_url,
- 'tc_url': video_url,
- 'uploader': None,
- 'upload_date': None,
- 'title': video_title,
- 'ext': u'flv',
- 'play_path': video_playpath,
- 'video_file': video_file,
- 'video_hls_playlist': video_hls_playlist,
- 'player_url': video_swfobj,
- }]
-
-
-class ComedyCentralIE(InfoExtractor):
- """Information extractor for The Daily Show and Colbert Report """
-
- # urls can be abbreviations like :thedailyshow or :colbert
- # urls for episodes like:
- # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
- # or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
- # or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
- _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
- |(https?://)?(www\.)?
- (?P<showname>thedailyshow|colbertnation)\.com/
- (full-episodes/(?P<episode>.*)|
- (?P<clip>
- (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
- |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
- $"""
-
- _available_formats = ['3500', '2200', '1700', '1200', '750', '400']
-
- _video_extensions = {
- '3500': 'mp4',
- '2200': 'mp4',
- '1700': 'mp4',
- '1200': 'mp4',
- '750': 'mp4',
- '400': 'mp4',
- }
- _video_dimensions = {
- '3500': '1280x720',
- '2200': '960x540',
- '1700': '768x432',
- '1200': '640x360',
- '750': '512x288',
- '400': '384x216',
- }
-
- @classmethod
- def suitable(cls, url):
- """Receives a URL and returns True if suitable for this IE."""
- return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
-
- def _print_formats(self, formats):
- print('Available formats:')
- for x in formats:
- print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))
-
-
- def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url, re.VERBOSE)
- if mobj is None:
- raise ExtractorError(u'Invalid URL: %s' % url)
-
- if mobj.group('shortname'):
- if mobj.group('shortname') in ('tds', 'thedailyshow'):
- url = u'http://www.thedailyshow.com/full-episodes/'
- else:
- url = u'http://www.colbertnation.com/full-episodes/'
- mobj = re.match(self._VALID_URL, url, re.VERBOSE)
- assert mobj is not None
-
- if mobj.group('clip'):
- if mobj.group('showname') == 'thedailyshow':
- epTitle = mobj.group('tdstitle')
- else:
- epTitle = mobj.group('cntitle')
- dlNewest = False
- else:
- dlNewest = not mobj.group('episode')
- if dlNewest:
- epTitle = mobj.group('showname')
- else:
- epTitle = mobj.group('episode')
-
- self.report_extraction(epTitle)
- webpage,htmlHandle = self._download_webpage_handle(url, epTitle)
- if dlNewest:
- url = htmlHandle.geturl()
- mobj = re.match(self._VALID_URL, url, re.VERBOSE)
- if mobj is None:
- raise ExtractorError(u'Invalid redirected URL: ' + url)
- if mobj.group('episode') == '':
- raise ExtractorError(u'Redirected URL is still not specific: ' + url)
- epTitle = mobj.group('episode')
-
- mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)
-
- if len(mMovieParams) == 0:
- # The Colbert Report embeds the information in a without
- # a URL prefix; so extract the alternate reference
- # and then add the URL prefix manually.
-
- altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
- if len(altMovieParams) == 0:
- raise ExtractorError(u'unable to find Flash URL in webpage ' + url)
- else:
- mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]
-
- uri = mMovieParams[0][1]
- indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
- indexXml = self._download_webpage(indexUrl, epTitle,
- u'Downloading show index',
- u'unable to download episode index')
-
- results = []
-
- idoc = xml.etree.ElementTree.fromstring(indexXml)
- itemEls = idoc.findall('.//item')
- for partNum,itemEl in enumerate(itemEls):
- mediaId = itemEl.findall('./guid')[0].text
- shortMediaId = mediaId.split(':')[-1]
- showId = mediaId.split(':')[-2].replace('.com', '')
- officialTitle = itemEl.findall('./title')[0].text
- officialDate = unified_strdate(itemEl.findall('./pubDate')[0].text)
-
- configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
- compat_urllib_parse.urlencode({'uri': mediaId}))
- configXml = self._download_webpage(configUrl, epTitle,
- u'Downloading configuration for %s' % shortMediaId)
-
- cdoc = xml.etree.ElementTree.fromstring(configXml)
- turls = []
- for rendition in cdoc.findall('.//rendition'):
- finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
- turls.append(finfo)
-
- if len(turls) == 0:
- self._downloader.report_error(u'unable to download ' + mediaId + ': No videos found')
- continue
-
- if self._downloader.params.get('listformats', None):
- self._print_formats([i[0] for i in turls])
- return
-
- # For now, just pick the highest bitrate
- format,rtmp_video_url = turls[-1]
-
- # Get the format arg from the arg stream
- req_format = self._downloader.params.get('format', None)
-
- # Select format if we can find one
- for f,v in turls:
- if f == req_format:
- format, rtmp_video_url = f, v
- break
-
- m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
- if not m:
- raise ExtractorError(u'Cannot transform RTMP url')
- base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
- video_url = base + m.group('finalid')
-
- effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
- info = {
- 'id': shortMediaId,
- 'url': video_url,
- 'uploader': showId,
- 'upload_date': officialDate,
- 'title': effTitle,
- 'ext': 'mp4',
- 'format': format,
- 'thumbnail': None,
- 'description': officialTitle,
- }
- results.append(info)
-
- return results
class EscapistIE(InfoExtractor):
}]
-class GooglePlusIE(InfoExtractor):
- """Information extractor for plus.google.com."""
-
- _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
- IE_NAME = u'plus.google'
-
- def _real_extract(self, url):
- # Extract id from URL
- mobj = re.match(self._VALID_URL, url)
- if mobj is None:
- raise ExtractorError(u'Invalid URL: %s' % url)
-
- post_url = mobj.group(0)
- video_id = mobj.group(1)
-
- video_extension = 'flv'
-
- # Step 1, Retrieve post webpage to extract further information
- webpage = self._download_webpage(post_url, video_id, u'Downloading entry webpage')
-
- self.report_extraction(video_id)
-
- # Extract update date
- upload_date = self._html_search_regex('title="Timestamp">(.*?)</a>',
- webpage, u'upload date', fatal=False)
- if upload_date:
- # Convert timestring to a format suitable for filename
- upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
- upload_date = upload_date.strftime('%Y%m%d')
-
- # Extract uploader
- uploader = self._html_search_regex(r'rel\="author".*?>(.*?)</a>',
- webpage, u'uploader', fatal=False)
-
- # Extract title
- # Get the first line for title
- video_title = self._html_search_regex(r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]',
- webpage, 'title', default=u'NA')
-
- # Step 2, Stimulate clicking the image box to launch video
- video_page = self._search_regex('"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]',
- webpage, u'video page URL')
- webpage = self._download_webpage(video_page, video_id, u'Downloading video page')
-
- # Extract video links on video page
- """Extract video links of all sizes"""
- pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
- mobj = re.findall(pattern, webpage)
- if len(mobj) == 0:
- raise ExtractorError(u'Unable to extract video links')
-
- # Sort in resolution
- links = sorted(mobj)
-
- # Choose the lowest of the sort, i.e. highest resolution
- video_url = links[-1]
- # Only get the url. The resolution part in the tuple has no use anymore
- video_url = video_url[-1]
- # Treat escaped \u0026 style hex
- try:
- video_url = video_url.decode("unicode_escape")
- except AttributeError: # Python 3
- video_url = bytes(video_url, 'ascii').decode('unicode-escape')
-
-
- return [{
- 'id': video_id,
- 'url': video_url,
- 'uploader': uploader,
- 'upload_date': upload_date,
- 'title': video_title,
- 'ext': video_extension,
- }]
class NBAIE(InfoExtractor):
_VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*?)(?:/index\.html)?(?:\?.*)?$'