10 import xml.etree.ElementTree
19 from .extractor.common import InfoExtractor, SearchInfoExtractor
21 from .extractor.ard import ARDIE
22 from .extractor.arte import ArteTvIE
23 from .extractor.dailymotion import DailymotionIE
24 from .extractor.gametrailers import GametrailersIE
25 from .extractor.generic import GenericIE
26 from .extractor.metacafe import MetacafeIE
27 from .extractor.statigram import StatigramIE
28 from .extractor.photobucket import PhotobucketIE
29 from .extractor.vimeo import VimeoIE
30 from .extractor.yahoo import YahooIE
31 from .extractor.youtube import YoutubeIE, YoutubePlaylistIE, YoutubeSearchIE, YoutubeUserIE, YoutubeChannelIE
32 from .extractor.zdf import ZDFIE
# NOTE(review): this chunk is an extraction-mangled paste — the leading number on
# each line is the original file line number, and the numbering is non-contiguous,
# so statements are missing inside this class. Code is left byte-identical;
# comments document only what is visible.
# Purpose: search-query extractor for Yahoo! screen video search ("yvsearch:N:query").
48 class YahooSearchIE(SearchInfoExtractor):
49     """Information Extractor for Yahoo! Video search queries."""
52     IE_NAME = u'screen.yahoo:search'
53     _SEARCH_KEY = 'yvsearch'
55     def _get_n_results(self, query, n):
56         """Get a specified number of results for a query"""
# Pages through the JSON search endpoint 30 results at a time until n results
# are collected. `res` (the playlist result dict) and `m` (the paging metadata,
# presumably info[u'm']) are initialized in elided lines — TODO confirm.
63         for pagenum in itertools.count(0):
64             result_url = u'http://video.search.yahoo.com/search/?p=%s&fr=screen&o=js&gs=0&b=%d' % (compat_urllib_parse.quote_plus(query), pagenum * 30)
65             webpage = self._download_webpage(result_url, query,
66                                              note='Downloading results page '+str(pagenum+1))
# The endpoint returns JSON, not HTML, despite the helper name.
67             info = json.loads(webpage)
69             results = info[u'results']
71             for (i, r) in enumerate(results):
# Stop once we have gathered n entries (body of this guard is elided).
72                 if (pagenum * 30) +i >= n:
74                 mobj = re.search(r'(?P<url>screen\.yahoo\.com/.*?-\d*?\.html)"', r)
# Each hit is delegated to the Yahoo extractor via a url_result entry.
75                 e = self.url_result('http://' + mobj.group('url'), 'Yahoo')
76                 res['entries'].append(e)
# Also stop when the server-reported last index reaches the total.
77                 if (pagenum * 30 +i >= n) or (m[u'last'] >= (m[u'total'] -1 )):
# NOTE(review): lines are elided in this chunk (non-contiguous embedded line
# numbers); code left byte-identical, comments describe visible logic only.
# Purpose: playlist extractor for a blip.tv user's full episode list.
83 class BlipTVUserIE(InfoExtractor):
84     """Information Extractor for blip.tv users."""
86     _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
88     IE_NAME = u'blip.tv:user'
90     def _real_extract(self, url):
92         mobj = re.match(self._VALID_URL, url)
# The `if mobj is None:` guard line is elided here — TODO confirm.
94             raise ExtractorError(u'Invalid URL: %s' % url)
96         username = mobj.group(1)
98         page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'
100         page = self._download_webpage(url, username, u'Downloading user page')
# The numeric users_id is scraped from the profile page markup.
101         mobj = re.search(r'data-users-id="([^"]+)"', page)
102         page_base = page_base % mobj.group(1)
105         # Download video ids using BlipTV Ajax calls. Result size per
106         # query is limited (currently to 12 videos) so we need to query
107         # page by page until there are no video ids - it means we got
# (pagination loop header and id-list initialization are elided)
114             url = page_base + "&page=" + str(pagenum)
115             page = self._download_webpage(url, username,
116                                           u'Downloading video ids from page %d' % pagenum)
118             # Extract video identifiers
121             for mobj in re.finditer(r'href="/([^"]+)"', page):
# De-duplicate ids within the page before collecting them.
122                 if mobj.group(1) not in ids_in_page:
123                     ids_in_page.append(unescapeHTML(mobj.group(1)))
125             video_ids.extend(ids_in_page)
127             # A little optimization - if current page is not
128             # "full", ie. does not contain PAGE_SIZE video ids then
129             # we can assume that this page is the last one - there
130             # are no more ids on further pages - no need to query
# `_PAGE_SIZE` is presumably a class attribute defined on an elided line.
133             if len(ids_in_page) < self._PAGE_SIZE:
138         urls = [u'http://blip.tv/%s' % video_id for video_id in video_ids]
# Delegate each collected id to the BlipTV single-video extractor.
139         url_entries = [self.url_result(url, 'BlipTV') for url in urls]
140         return [self.playlist_result(url_entries, playlist_title = username)]
# NOTE(review): elided lines inside this class (non-contiguous embedded
# numbering); code left byte-identical. Python-2-era code: the .decode('utf-8')
# calls below operate on byte strings.
143 class DepositFilesIE(InfoExtractor):
144     """Information extractor for depositfiles.com"""
146     _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
148     def _real_extract(self, url):
149         file_id = url.split('/')[-1]
150         # Rebuild url in english locale
151         url = 'http://depositfiles.com/en/files/' + file_id
153         # Retrieve file webpage with 'Free download' button pressed
# POSTing gateway_result=1 simulates pressing the "Free download" button.
154         free_download_indication = { 'gateway_result' : '1' }
155         request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
# (the opening `try:` line is elided here — TODO confirm)
157             self.report_download_webpage(file_id)
158             webpage = compat_urllib_request.urlopen(request).read()
159         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
160             raise ExtractorError(u'Unable to retrieve file webpage: %s' % compat_str(err))
162         # Search for the real file URL
163         mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
164         if (mobj is None) or (mobj.group(1) is None):
165             # Try to figure out reason of the error.
# The site reports restrictions (e.g. download limits) in a <strong> block.
166             mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
167             if (mobj is not None) and (mobj.group(1) is not None):
168                 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
169                 raise ExtractorError(u'%s' % restriction_message)
171                 raise ExtractorError(u'Unable to extract download URL from: %s' % url)
173         file_url = mobj.group(1)
# Extension taken from the URL path, without the leading dot.
174         file_extension = os.path.splitext(file_url)[1][1:]
176         # Search for file title
177         file_title = self._search_regex(r'<b title="(.*?)">', webpage, u'title')
# (the opening `return [{` of the result dict is elided)
180             'id':       file_id.decode('utf-8'),
181             'url':      file_url.decode('utf-8'),
185             'ext':      file_extension.decode('utf-8'),
# NOTE(review): elided lines inside this class (non-contiguous embedded
# numbering); code left byte-identical, comments describe visible logic only.
# Purpose: extractor for Facebook videos, with optional credential login.
189 class FacebookIE(InfoExtractor):
190     """Information Extractor for Facebook"""
192     _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
193     _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
194     _NETRC_MACHINE = 'facebook'
195     IE_NAME = u'facebook'
197     def report_login(self):
198         """Report attempt to log in."""
199         self.to_screen(u'Logging in')
# Optional login: credentials come from --username/--password or ~/.netrc.
201     def _real_initialize(self):
202         if self._downloader is None:
207         downloader_params = self._downloader.params
209         # Attempt to use provided username and password or .netrc data
210         if downloader_params.get('username', None) is not None:
211             useremail = downloader_params['username']
212             password = downloader_params['password']
213         elif downloader_params.get('usenetrc', False):
# (the `try:` and the credential-unpacking lines are elided here)
215                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
220                     raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
221             except (IOError, netrc.NetrcParseError) as err:
222                 self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
# No credentials available: skip login entirely (body elided).
225         if useremail is None:
234         request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
237             login_results = compat_urllib_request.urlopen(request).read()
# A login <form> still present in the response means authentication failed.
# NOTE(review): "exceded" is a typo in the runtime warning string below; left
# untouched because it is user-visible program output, not a comment.
238             if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
239                 self._downloader.report_warning(u'unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
241         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
242             self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
245     def _real_extract(self, url):
246         mobj = re.match(self._VALID_URL, url)
# (the `if mobj is None:` guard line is elided — TODO confirm)
248             raise ExtractorError(u'Invalid URL: %s' % url)
249         video_id = mobj.group('ID')
251         url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
252         webpage = self._download_webpage(url, video_id)
# Video parameters are embedded as JSON between two fixed JS snippets in the page.
254         BEFORE = '{swf.addParam(param[0], param[1]);});\n'
255         AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
256         m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
258             raise ExtractorError(u'Cannot parse data')
259         data = dict(json.loads(m.group(1)))
# 'params' is URL-quoted JSON holding the actual stream descriptors.
260         params_raw = compat_urllib_parse.unquote(data['params'])
261         params = json.loads(params_raw)
262         video_data = params['video_data'][0]
# Prefer the HD stream; fall back to SD (fallback branch partially elided).
263         video_url = video_data.get('hd_src')
265             video_url = video_data['sd_src']
267             raise ExtractorError(u'Cannot find video URL')
268         video_duration = int(video_data['video_duration'])
269         thumbnail = video_data['thumbnail_src']
271         video_title = self._html_search_regex('<h2 class="uiHeaderTitle">([^<]+)</h2>',
# (the result dict construction around these keys is elided)
276             'title': video_title,
279             'duration': video_duration,
280             'thumbnail': thumbnail,
# NOTE(review): elided lines inside this class (non-contiguous embedded
# numbering); code left byte-identical, comments describe visible logic only.
# Purpose: extractor for blip.tv single videos via the site's JSON API.
285 class BlipTVIE(InfoExtractor):
286     """Information extractor for blip.tv"""
288     _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv/((.+/)|(play/)|(api\.swf#))(.+)$'
289     _URL_EXT = r'^.*\.([a-z0-9]+)$'
292     def report_direct_download(self, title):
293         """Report information extraction."""
294         self.to_screen(u'%s: Direct download detected' % title)
296     def _real_extract(self, url):
297         mobj = re.match(self._VALID_URL, url)
# (the `if mobj is None:` guard line is elided — TODO confirm)
299             raise ExtractorError(u'Invalid URL: %s' % url)
301         # See https://github.com/rg3/youtube-dl/issues/857
# api.swf fragment URLs are rewritten to the /play/ form first.
302         api_mobj = re.match(r'http://a\.blip\.tv/api\.swf#(?P<video_id>[\d\w]+)', url)
303         if api_mobj is not None:
304             url = 'http://blip.tv/play/g_%s' % api_mobj.group('video_id')
305         urlp = compat_urllib_parse_urlparse(url)
# /play/ URLs redirect; the real file id lives in the redirect's fragment.
# The method then recurses once on the canonical /a/a-<id> URL.
306         if urlp.path.startswith('/play/'):
307             request = compat_urllib_request.Request(url)
308             response = compat_urllib_request.urlopen(request)
309             redirecturl = response.geturl()
310             rurlp = compat_urllib_parse_urlparse(redirecturl)
311             file_id = compat_parse_qs(rurlp.fragment)['file'][0].rpartition('/')[2]
312             url = 'http://blip.tv/a/a-' + file_id
313             return self._real_extract(url)
# (`cchar` selection — '&' vs '?' — is elided)
320         json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
321         request = compat_urllib_request.Request(json_url)
# blip.tv serves different (usable) data to the iTunes user agent.
322         request.add_header('User-Agent', 'iTunes/10.6.1')
323         self.report_extraction(mobj.group(1))
326             urlh = compat_urllib_request.urlopen(request)
# If the server answers with the media itself rather than JSON metadata,
# treat it as a direct download and derive title/ext from the URL basename.
327             if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
328                 basename = url.split('/')[-1]
329                 title,ext = os.path.splitext(basename)
330                 title = title.decode('UTF-8')
331                 ext = ext.replace('.', '')
332                 self.report_direct_download(title)
342         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
343             raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
344         if info is None: # Regular URL
346                 json_code_bytes = urlh.read()
347                 json_code = json_code_bytes.decode('utf-8')
348             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
349                 raise ExtractorError(u'Unable to read video info webpage: %s' % compat_str(err))
352                 json_data = json.loads(json_code)
# The API may wrap the payload in a 'Post' envelope.
353                 if 'Post' in json_data:
354                     data = json_data['Post']
# datestamp format example: "05-31-13 08:15PM" -> normalized to YYYYMMDD.
358                 upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
359                 video_url = data['media']['url']
360                 umobj = re.match(self._URL_EXT, video_url)
362                     raise ValueError('Can not determine filename extension')
# (the result dict construction around these keys is elided)
366                     'id': data['item_id'],
368                     'uploader': data['display_name'],
369                     'upload_date': upload_date,
370                     'title': data['title'],
372                     'format': data['media']['mimeType'],
373                     'thumbnail': data['thumbnailUrl'],
374                     'description': data['description'],
375                     'player_url': data['embedUrl'],
376                     'user_agent': 'iTunes/10.6.1',
378             except (ValueError,KeyError) as err:
379                 raise ExtractorError(u'Unable to parse video information: %s' % repr(err))
# NOTE(review): elided lines inside this class (non-contiguous embedded
# numbering); code left byte-identical, comments describe visible logic only.
# Purpose: extractor for myvideo.de; newer pages hide the stream URL in
# RC4-encrypted XML that is decrypted with a key derived from a GK constant.
384 class MyVideoIE(InfoExtractor):
385     """Information Extractor for myvideo.de."""
387     _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
390     # Original Code from: https://github.com/dersphere/plugin.video.myvideo_de.git
391     # Released into the Public Domain by Tristan Fischer on 2013-05-19
392     # https://github.com/rg3/youtube-dl/pull/842
# Plain RC4 (KSA + PRGA); initialization of x/y/out is partly elided.
393     def __rc4crypt(self,data, key):
395         box = list(range(256))
396         for i in list(range(256)):
397             x = (x + box[i] + compat_ord(key[i % len(key)])) % 256
398             box[i], box[x] = box[x], box[i]
404             y = (y + box[x]) % 256
405             box[x], box[y] = box[y], box[x]
406             out += chr(compat_ord(char) ^ box[(box[x] + box[y]) % 256])
# (enclosing md5 helper's def line is elided; returns hex digest as bytes)
410         return hashlib.md5(s).hexdigest().encode()
412     def _real_extract(self,url):
413         mobj = re.match(self._VALID_URL, url)
415             raise ExtractorError(u'invalid URL: %s' % url)
417         video_id = mobj.group(1)
# GK: base64-wrapped key material used to derive the RC4 key below.
420             b'WXpnME1EZGhNRGhpTTJNM01XVmhOREU0WldNNVpHTTJOakpt'
421             b'TW1FMU5tVTBNR05pWkRaa05XRXhNVFJoWVRVd1ptSXhaVEV3'
422             b'TnpsbA0KTVRkbU1tSTRNdz09'
426         webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
427         webpage = self._download_webpage(webpage_url, video_id)
# Legacy pages expose a plain <source> flv URL; handle that fast path first.
429         mobj = re.search('source src=\'(.+?)[.]([^.]+)\'', webpage)
431             self.report_extraction(video_id)
432             video_url = mobj.group(1) + '.flv'
434             video_title = self._html_search_regex('<title>([^<]+)</title>',
437             video_ext = self._search_regex('[.](.+?)$', video_url, u'extension')
# (result dict construction elided)
444                 'title':    video_title,
# Newer pages: flashvars JSON-ish blob carries an encrypted XML locator.
449         mobj = re.search('var flashvars={(.+?)}', webpage)
451             raise ExtractorError(u'Unable to extract video')
# Collect flashvars into params; '_encxml' is the encrypted-XML URL itself.
456         for (a, b) in re.findall('(.+?):\'(.+?)\',?', sec):
457             if not a == '_encxml':
460                 encxml = compat_urllib_parse.unquote(b)
461         if not params.get('domain'):
462             params['domain'] = 'www.myvideo.de'
463         xmldata_url = '%s?%s' % (encxml, compat_urllib_parse.urlencode(params))
# The MTV player variant is broken for us; re-request the D player XML.
464         if 'flash_playertype=MTV' in xmldata_url:
465             self._downloader.report_warning(u'avoiding MTV player')
467                 'http://www.myvideo.de/dynamic/get_player_video_xml.php'
468                 '?flash_playertype=D&ID=%s&_countlimit=4&autorun=yes'
# Response shape is "something=<hex>"; take the hex part after '='.
472         enc_data = self._download_webpage(xmldata_url, video_id).split('=')[1]
473         enc_data_b = binascii.unhexlify(enc_data)
# RC4 key = md5(double-base64-decoded GK + video id) — derivation partly elided.
475             base64.b64decode(base64.b64decode(GK)) +
477             str(video_id).encode('utf-8')
480         dec_data = self.__rc4crypt(enc_data_b, sk)
483         self.report_extraction(video_id)
# RTMP branch: connectionurl present in the decrypted XML.
486         mobj = re.search('connectionurl=\'(.*?)\'', dec_data)
488             video_url = compat_urllib_parse.unquote(mobj.group(1))
489             if 'myvideo2flash' in video_url:
490                 self._downloader.report_warning(u'forcing RTMPT ...')
491                 video_url = video_url.replace('rtmpe://', 'rtmpt://')
494             # extract non rtmp videos
495             mobj = re.search('path=\'(http.*?)\' source=\'(.*?)\'', dec_data)
497                 raise ExtractorError(u'unable to extract url')
498             video_url = compat_urllib_parse.unquote(mobj.group(1)) + compat_urllib_parse.unquote(mobj.group(2))
500         video_file = self._search_regex('source=\'(.*?)\'', dec_data, u'video file')
501         video_file = compat_urllib_parse.unquote(video_file)
# Non-f4m files become an RTMP play path "prefix:path"; f4m files become
# an HLS playlist URL by swapping the extension.
503         if not video_file.endswith('f4m'):
504             ppath, prefix = video_file.split('.')
505             video_playpath = '%s:%s' % (prefix, ppath)
506             video_hls_playlist = ''
509             video_hls_playlist = (
510                 video_filepath + video_file
511             ).replace('.f4m', '.m3u8')
513         video_swfobj = self._search_regex('swfobject.embedSWF\(\'(.+?)\'', webpage, u'swfobj')
514         video_swfobj = compat_urllib_parse.unquote(video_swfobj)
516         video_title = self._html_search_regex("<h1(?: class='globalHd')?>(.*?)</h1>",
# (result dict construction elided)
525             'title': video_title,
527             'play_path': video_playpath,
528             'video_file': video_file,
529             'video_hls_playlist': video_hls_playlist,
530             'player_url': video_swfobj,
# NOTE(review): elided lines inside this class (non-contiguous embedded
# numbering); code left byte-identical, comments describe visible logic only.
# Purpose: extractor for Daily Show / Colbert Report episodes and clips via
# the MTV services MRSS feed + mediagen config XML.
534 class ComedyCentralIE(InfoExtractor):
535     """Information extractor for The Daily Show and Colbert Report """
537     # urls can be abbreviations like :thedailyshow or :colbert
538     # urls for episodes like:
539     # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
540     # or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
541     # or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
542     _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
543                      |(https?://)?(www\.)?
544                          (?P<showname>thedailyshow|colbertnation)\.com/
545                         (full-episodes/(?P<episode>.*)|
547                           (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
548                           |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
# Bitrates the site offers, lowest preference last-to-first handled below.
551     _available_formats = ['3500', '2200', '1700', '1200', '750', '400']
553     _video_extensions = {
561     _video_dimensions = {
# suitable() is overridden because _VALID_URL needs re.VERBOSE.
571     def suitable(cls, url):
572         """Receives a URL and returns True if suitable for this IE."""
573         return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
575     def _print_formats(self, formats):
576         print('Available formats:')
578             print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))
581     def _real_extract(self, url):
582         mobj = re.match(self._VALID_URL, url, re.VERBOSE)
584             raise ExtractorError(u'Invalid URL: %s' % url)
# Shortname forms (":tds", ":colbert", ...) are expanded to the
# show's full-episodes page, then re-matched.
586         if mobj.group('shortname'):
587             if mobj.group('shortname') in ('tds', 'thedailyshow'):
588                 url = u'http://www.thedailyshow.com/full-episodes/'
590                 url = u'http://www.colbertnation.com/full-episodes/'
591             mobj = re.match(self._VALID_URL, url, re.VERBOSE)
592             assert mobj is not None
594         if mobj.group('clip'):
595             if mobj.group('showname') == 'thedailyshow':
596                 epTitle = mobj.group('tdstitle')
598                 epTitle = mobj.group('cntitle')
601             dlNewest = not mobj.group('episode')
603                 epTitle = mobj.group('showname')
605                 epTitle = mobj.group('episode')
607         self.report_extraction(epTitle)
# The page may redirect (e.g. to the newest episode); re-match the final URL.
608         webpage,htmlHandle = self._download_webpage_handle(url, epTitle)
610             url = htmlHandle.geturl()
611             mobj = re.match(self._VALID_URL, url, re.VERBOSE)
613                 raise ExtractorError(u'Invalid redirected URL: ' + url)
614             if mobj.group('episode') == '':
615                 raise ExtractorError(u'Redirected URL is still not specific: ' + url)
616             epTitle = mobj.group('episode')
# Locate the mgid-style media URI embedded in the page.
618         mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)
620         if len(mMovieParams) == 0:
621             # The Colbert Report embeds the information in a without
622             # a URL prefix; so extract the alternate reference
623             # and then add the URL prefix manually.
625             altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
626             if len(altMovieParams) == 0:
627                 raise ExtractorError(u'unable to find Flash URL in webpage ' + url)
629                 mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]
631         uri = mMovieParams[0][1]
632         indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
633         indexXml = self._download_webpage(indexUrl, epTitle,
634                                           u'Downloading show index',
635                                           u'unable to download episode index')
# Each <item> in the MRSS index is one part of the episode.
639         idoc = xml.etree.ElementTree.fromstring(indexXml)
640         itemEls = idoc.findall('.//item')
641         for partNum,itemEl in enumerate(itemEls):
642             mediaId = itemEl.findall('./guid')[0].text
643             shortMediaId = mediaId.split(':')[-1]
644             showId = mediaId.split(':')[-2].replace('.com', '')
645             officialTitle = itemEl.findall('./title')[0].text
646             officialDate = unified_strdate(itemEl.findall('./pubDate')[0].text)
648             configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
649                         compat_urllib_parse.urlencode({'uri': mediaId}))
650             configXml = self._download_webpage(configUrl, epTitle,
651                                                u'Downloading configuration for %s' % shortMediaId)
653             cdoc = xml.etree.ElementTree.fromstring(configXml)
# Build (bitrate, rtmp_url) pairs from each <rendition>; `turls`
# accumulation line is elided.
655             for rendition in cdoc.findall('.//rendition'):
656                 finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
660                 self._downloader.report_error(u'unable to download ' + mediaId + ': No videos found')
663             if self._downloader.params.get('listformats', None):
664                 self._print_formats([i[0] for i in turls])
667             # For now, just pick the highest bitrate
668             format,rtmp_video_url = turls[-1]
670             # Get the format arg from the arg stream
671             req_format = self._downloader.params.get('format', None)
673             # Select format if we can find one
676                     format, rtmp_video_url = f, v
# RTMP URLs are translated to a direct HTTP origin URL on llnwd.net.
679             m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
681                 raise ExtractorError(u'Cannot transform RTMP url')
682             base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
683             video_url = base + m.group('finalid')
685             effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
# (per-part info dict construction elided)
690                 'upload_date': officialDate,
695                 'description': officialTitle,
# NOTE(review): elided lines inside this class (non-contiguous embedded
# numbering); code left byte-identical, comments describe visible logic only.
702 class EscapistIE(InfoExtractor):
703     """Information extractor for The Escapist """
705     _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
706     IE_NAME = u'escapist'
708     def _real_extract(self, url):
709         mobj = re.match(self._VALID_URL, url)
711             raise ExtractorError(u'Invalid URL: %s' % url)
712         showName = mobj.group('showname')
713         videoId = mobj.group('episode')
715         self.report_extraction(videoId)
716         webpage = self._download_webpage(url, videoId)
717
718         videoDesc = self._html_search_regex('<meta name="description" content="([^"]*)"',
719             webpage, u'description', fatal=False)
721         imgUrl = self._html_search_regex('<meta property="og:image" content="([^"]*)"',
722             webpage, u'thumbnail', fatal=False)
724         playerUrl = self._html_search_regex('<meta property="og:video" content="([^"]*)"',
725             webpage, u'player url')
# NOTE(review): the error label u'player url' on the next call is a
# copy-paste slip — this regex extracts the *title* (runtime string left
# untouched in this documentation pass).
727         title = self._html_search_regex('<meta name="title" content="([^"]*)"',
728             webpage, u'player url').split(' : ')[-1]
# The player URL embeds a URL-quoted "config=<url>" query parameter.
730         configUrl = self._search_regex('config=(.*)$', playerUrl, u'config url')
731         configUrl = compat_urllib_parse.unquote(configUrl)
733         configJSON = self._download_webpage(configUrl, videoId,
734                                             u'Downloading configuration',
735                                             u'unable to download configuration')
737         # Technically, it's JavaScript, not JSON
# Crude normalization: swap single quotes for double quotes so json can parse it.
738         configJSON = configJSON.replace("'", '"')
741             config = json.loads(configJSON)
742         except (ValueError,) as err:
743             raise ExtractorError(u'Invalid JSON in configuration file: ' + compat_str(err))
745         playlist = config['playlist']
# Index 1 is presumably the actual video entry (0 being an ad/intro) — TODO confirm.
746         videoUrl = playlist[1]['url']
# (result dict construction elided)
751             'uploader': showName,
756             'description': videoDesc,
757             'player_url': playerUrl,
# NOTE(review): elided lines inside this class (non-contiguous embedded
# numbering); code left byte-identical, comments describe visible logic only.
# Purpose: extractor for collegehumor.com via moogaloop metadata XML + an
# Adobe HDS (f4m) manifest.
762 class CollegeHumorIE(InfoExtractor):
763     """Information extractor for collegehumor.com"""
766     _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
767     IE_NAME = u'collegehumor'
769     def report_manifest(self, video_id):
770         """Report information extraction."""
771         self.to_screen(u'%s: Downloading XML manifest' % video_id)
773     def _real_extract(self, url):
774         mobj = re.match(self._VALID_URL, url)
776             raise ExtractorError(u'Invalid URL: %s' % url)
777         video_id = mobj.group('videoid')
# (`info` dict initialization is elided)
785         self.report_extraction(video_id)
786         xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
788             metaXml = compat_urllib_request.urlopen(xmlUrl).read()
789         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
790             raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
792         mdoc = xml.etree.ElementTree.fromstring(metaXml)
# Pull title/description/thumbnail and the manifest URL from <video>.
794             videoNode = mdoc.findall('./video')[0]
795             info['description'] = videoNode.findall('./description')[0].text
796             info['title'] = videoNode.findall('./caption')[0].text
797             info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
798             manifest_url = videoNode.findall('./file')[0].text
800             raise ExtractorError(u'Invalid metadata XML file')
# hdcore query parameter is required by the HDS server.
802         manifest_url += '?hdcore=2.10.3'
803         self.report_manifest(video_id)
805             manifestXml = compat_urllib_request.urlopen(manifest_url).read()
806         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
807             raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
809         adoc = xml.etree.ElementTree.fromstring(manifestXml)
# f4m namespace-qualified lookups; IndexError means a malformed manifest.
811             media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
812             node_id = media_node.attrib['url']
813             video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
814         except IndexError as err:
815             raise ExtractorError(u'Invalid manifest file')
817         url_pr = compat_urllib_parse_urlparse(manifest_url)
# Construct the direct fragment URL from the manifest host and media node.
818         url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'
# NOTE(review): elided lines inside this class (non-contiguous embedded
# numbering); code left byte-identical, comments describe visible logic only.
825 class XVideosIE(InfoExtractor):
826     """Information extractor for xvideos.com"""
828     _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
831     def _real_extract(self, url):
832         mobj = re.match(self._VALID_URL, url)
834             raise ExtractorError(u'Invalid URL: %s' % url)
835         video_id = mobj.group(1)
837         webpage = self._download_webpage(url, video_id)
839         self.report_extraction(video_id)
# The stream URL is carried URL-quoted in the page's flv_url parameter.
842         video_url = compat_urllib_parse.unquote(self._search_regex(r'flv_url=(.+?)&',
843             webpage, u'video URL'))
# Title is the <title> text minus the trailing " - XVID..." suffix.
846         video_title = self._html_search_regex(r'<title>(.*?)\s+-\s+XVID',
849         # Extract video thumbnail
850         video_thumbnail = self._search_regex(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)',
851             webpage, u'thumbnail', fatal=False)
# (result dict construction elided)
858             'title': video_title,
860             'thumbnail': video_thumbnail,
# NOTE(review): elided lines inside this class (non-contiguous embedded
# numbering); code left byte-identical, comments describe visible logic only.
867 class SoundcloudIE(InfoExtractor):
868     """Information extractor for soundcloud.com
869        To access the media, the uid of the song and a stream token
870        must be extracted from the page source and the script must make
871        a request to media.soundcloud.com/crossdomain.xml. Then
872        the media can be grabbed by requesting from an url composed
873        of the stream token and uid
876     _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
877     IE_NAME = u'soundcloud'
879     def report_resolve(self, video_id):
880         """Report information extraction."""
881         self.to_screen(u'%s: Resolving id' % video_id)
883     def _real_extract(self, url):
884         mobj = re.match(self._VALID_URL, url)
886             raise ExtractorError(u'Invalid URL: %s' % url)
888         # extract uploader (which is in the url)
889         uploader = mobj.group(1)
890         # extract simple title (uploader + slug of song title)
891         slug_title =  mobj.group(2)
892         simple_title = uploader + u'-' + slug_title
893         full_title = '%s/%s' % (uploader, slug_title)
895         self.report_resolve(full_title)
# resolve.json maps the human-readable URL to the numeric track id.
# The client_id below is a hard-coded public API key.
897         url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
898         resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
899         info_json = self._download_webpage(resolv_url, full_title, u'Downloading info JSON')
901         info = json.loads(info_json)
902         video_id = info['id']
903         self.report_extraction(full_title)
# Second request: per-track stream catalogue; we take the 128kbps mp3.
905         streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
906         stream_json = self._download_webpage(streams_url, full_title,
907                                              u'Downloading stream definitions',
908                                              u'unable to download stream definitions')
910         streams = json.loads(stream_json)
911         mediaURL = streams['http_mp3_128_url']
912         upload_date = unified_strdate(info['created_at'])
# (result dict construction elided)
917             'uploader': info['user']['username'],
918             'upload_date': upload_date,
919             'title': info['title'],
921             'description': info['description'],
# NOTE(review): elided lines inside this class (non-contiguous embedded
# numbering); code left byte-identical, comments describe visible logic only.
# Same flow as SoundcloudIE, but resolves a /sets/ URL and iterates tracks.
924 class SoundcloudSetIE(InfoExtractor):
925     """Information extractor for soundcloud.com sets
926        To access the media, the uid of the song and a stream token
927        must be extracted from the page source and the script must make
928        a request to media.soundcloud.com/crossdomain.xml. Then
929        the media can be grabbed by requesting from an url composed
930        of the stream token and uid
933     _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/sets/([\w\d-]+)'
934     IE_NAME = u'soundcloud:set'
936     def report_resolve(self, video_id):
937         """Report information extraction."""
938         self.to_screen(u'%s: Resolving id' % video_id)
940     def _real_extract(self, url):
941         mobj = re.match(self._VALID_URL, url)
943             raise ExtractorError(u'Invalid URL: %s' % url)
945         # extract uploader (which is in the url)
946         uploader = mobj.group(1)
947         # extract simple title (uploader + slug of song title)
948         slug_title =  mobj.group(2)
949         simple_title = uploader + u'-' + slug_title
950         full_title = '%s/sets/%s' % (uploader, slug_title)
952         self.report_resolve(full_title)
# resolve.json maps the set URL to its metadata (including the track list).
954         url = 'http://soundcloud.com/%s/sets/%s' % (uploader, slug_title)
955         resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
956         info_json = self._download_webpage(resolv_url, full_title)
959         info = json.loads(info_json)
# API-level errors are reported per entry (guard line elided above).
961             for err in info['errors']:
962                 self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err['error_message']))
965         self.report_extraction(full_title)
# One streams request per track; collect 128kbps mp3 URLs for each.
966         for track in info['tracks']:
967             video_id = track['id']
969             streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
970             stream_json = self._download_webpage(streams_url, video_id, u'Downloading track info JSON')
972             self.report_extraction(video_id)
973             streams = json.loads(stream_json)
974             mediaURL = streams['http_mp3_128_url']
# (per-track result dict construction elided)
979                 'uploader': track['user']['username'],
980                 'upload_date':  unified_strdate(track['created_at']),
981                 'title': track['title'],
983                 'description': track['description'],
# NOTE(review): elided lines inside this class (non-contiguous embedded
# numbering); code left byte-identical, comments describe visible logic only.
988 class InfoQIE(InfoExtractor):
989     """Information extractor for infoq.com"""
990     _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
992     def _real_extract(self, url):
993         mobj = re.match(self._VALID_URL, url)
995             raise ExtractorError(u'Invalid URL: %s' % url)
997         webpage = self._download_webpage(url, video_id=url)
998         self.report_extraction(url)
# The real media path is base64-encoded in the page's jsclassref variable.
1001         mobj = re.search(r"jsclassref ?= ?'([^']*)'", webpage)
1003             raise ExtractorError(u'Unable to extract video url')
1004         real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
1005         video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id
1008         video_title = self._search_regex(r'contentTitle = "(.*?)";',
1011         # Extract description
1012         video_description = self._html_search_regex(r'<meta name="description" content="(.*)"(?:\s*/)?>',
1013             webpage, u'description', fatal=False)
# Derive the id and extension from the RTMP path's final component.
1015         video_filename = video_url.split('/')[-1]
1016         video_id, extension = video_filename.split('.')
# (result dict construction elided)
1022             'upload_date': None,
1023             'title': video_title,
1024             'ext': extension, # Extension is always(?) mp4, but seems to be flv
1026             'description': video_description,
# NOTE(review): elided lines inside this class (non-contiguous embedded
# numbering); code left byte-identical, comments describe visible logic only.
# Marked _WORKING = False: kept for reference until ported to the new API.
1031 class MixcloudIE(InfoExtractor):
1032     """Information extractor for www.mixcloud.com"""
1034     _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
1035     _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
1036     IE_NAME = u'mixcloud'
1038     def report_download_json(self, file_id):
1039         """Report JSON download."""
1040         self.to_screen(u'Downloading json')
1042     def get_urls(self, jsonData, fmt, bitrate='best'):
1043         """Get urls from 'audio_formats' section in json"""
# jsonData[fmt] is either {bitrate: [urls]} or a flat [urls] list; the
# TypeError branch handles the flat case.
1046             bitrate_list = jsonData[fmt]
1047             if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
1048                 bitrate = max(bitrate_list) # select highest
1050             url_list = jsonData[fmt][bitrate]
1051         except TypeError: # we have no bitrate info.
1052             url_list = jsonData[fmt]
1055     def check_urls(self, url_list):
1056         """Returns 1st active url from list"""
# Probes each candidate URL; network errors mean "try the next one".
1057         for url in url_list:
1059                 compat_urllib_request.urlopen(url)
1061             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1066     def _print_formats(self, formats):
1067         print('Available formats:')
1068         for fmt in formats.keys():
1069             for b in formats[fmt]:
1071                     ext = formats[fmt][b][0]
1072                     print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
1073                 except TypeError: # we have no bitrate info
1074                     ext = formats[fmt][0]
1075                     print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
1078     def _real_extract(self, url):
1079         mobj = re.match(self._VALID_URL, url)
1081             raise ExtractorError(u'Invalid URL: %s' % url)
1082         # extract uploader & filename from url
# Py2-era .decode('utf-8') on byte-string regex groups.
1083         uploader = mobj.group(1).decode('utf-8')
1084         file_id = uploader + "-" + mobj.group(2).decode('utf-8')
1086         # construct API request
1087         file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
1088         # retrieve .json file with links to files
1089         request = compat_urllib_request.Request(file_url)
1091             self.report_download_json(file_url)
1092             jsonData = compat_urllib_request.urlopen(request).read()
1093         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1094             raise ExtractorError(u'Unable to retrieve file: %s' % compat_str(err))
1097         json_data = json.loads(jsonData)
1098         player_url = json_data['player_swf_url']
1099         formats = dict(json_data['audio_formats'])
1101         req_format = self._downloader.params.get('format', None)
1104         if self._downloader.params.get('listformats', None):
1105             self._print_formats(formats)
# No explicit format requested: walk formats and take the first live URL.
1108         if req_format is None or req_format == 'best':
1109             for format_param in formats.keys():
1110                 url_list = self.get_urls(formats, format_param)
1112                 file_url = self.check_urls(url_list)
1113                 if file_url is not None:
# Explicit format requested: it must exist in the catalogue.
1116             if req_format not in formats:
1117                 raise ExtractorError(u'Format is not available')
1119             url_list = self.get_urls(formats, req_format)
1120             file_url = self.check_urls(url_list)
1121             format_param = req_format
# (result list/dict construction elided)
1124             'id': file_id.decode('utf-8'),
1125             'url': file_url.decode('utf-8'),
1126             'uploader': uploader.decode('utf-8'),
1127             'upload_date': None,
1128             'title': json_data['name'],
1129             'ext': file_url.split('.')[-1].decode('utf-8'),
1130             'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
1131             'thumbnail': json_data['thumbnail_url'],
1132             'description': json_data['description'],
1133             'player_url': player_url.decode('utf-8'),
class StanfordOpenClassroomIE(InfoExtractor):
    """Information extractor for Stanford's Open ClassRoom.

    Handles three URL shapes: a specific video page, a course page
    (expanded to references to its videos), and the site root (expanded
    to references to all courses).
    """

    _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
    IE_NAME = u'stanfordoc'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
            raise ExtractorError(u'Invalid URL: %s' % url)

        if mobj.group('course') and mobj.group('video'): # A specific video
            course = mobj.group('course')
            video = mobj.group('video')
                'id': course + '_' + video,
                'upload_date': None,
            self.report_extraction(info['id'])
            # Video metadata lives in a per-video XML file next to the media.
            baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
            xmlUrl = baseUrl + video + '.xml'
                metaXml = compat_urllib_request.urlopen(xmlUrl).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
            mdoc = xml.etree.ElementTree.fromstring(metaXml)
                info['title'] = mdoc.findall('./title')[0].text
                info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
                raise ExtractorError(u'Invalid metadata XML file')
            info['ext'] = info['url'].rpartition('.')[2]
        elif mobj.group('course'): # A course page
            course = mobj.group('course')
                'upload_date': None,

            coursepage = self._download_webpage(url, info['id'],
                                        note='Downloading course info page',
                                        errnote='Unable to download course info page')

            info['title'] = self._html_search_regex('<h1>([^<]+)</h1>', coursepage, 'title', default=info['id'])

            info['description'] = self._html_search_regex('<description>([^<]+)</description>',
                coursepage, u'description', fatal=False)

            # Collect links to the course's individual video pages.
            links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
            # Recursively extract every referenced video page.
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
                'id': 'Stanford OpenClassroom',
                'upload_date': None,

            self.report_download_webpage(info['id'])
            rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
                rootpage = compat_urllib_request.urlopen(rootURL).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to download course info page: ' + compat_str(err))

            info['title'] = info['id']

            # Collect links to every course page reachable from the root.
            links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),

            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
class MTVIE(InfoExtractor):
    """Information extractor for MTV.com"""

    _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
            raise ExtractorError(u'Invalid URL: %s' % url)
        if not mobj.group('proto'):
            # Scheme-less URLs are accepted by _VALID_URL; normalize them.
            url = 'http://' + url
        video_id = mobj.group('videoid')

        webpage = self._download_webpage(url, video_id)

        # Page <meta> tags carry song name, artist and the mtvn URI.
        song_name = self._html_search_regex(r'<meta name="mtv_vt" content="([^"]+)"/>',
            webpage, u'song name', fatal=False)

        video_title = self._html_search_regex(r'<meta name="mtv_an" content="([^"]+)"/>',

        mtvn_uri = self._html_search_regex(r'<meta name="mtvn_uri" content="([^"]+)"/>',
            webpage, u'mtvn_uri', fatal=False)

        content_id = self._search_regex(r'MTVN.Player.defaultPlaylistId = ([0-9]+);',
            webpage, u'content id', fatal=False)

        # mediaGen endpoint returns an XML document with the renditions.
        videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
        self.report_extraction(video_id)
        request = compat_urllib_request.Request(videogen_url)
            metadataXml = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to download video metadata: %s' % compat_str(err))

        mdoc = xml.etree.ElementTree.fromstring(metadataXml)
        renditions = mdoc.findall('.//rendition')

        # For now, always pick the highest quality.
        rendition = renditions[-1]

            _,_,ext = rendition.attrib['type'].partition('/')
            format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
            video_url = rendition.find('./src').text
            raise ExtractorError('Invalid rendition field.')

            'uploader': performer,
            'upload_date': None,
            'title': video_title,
class YoukuIE(InfoExtractor):
    """Information extractor for v.youku.com video pages."""

    _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'

        # Session id: current time in milliseconds plus two random parts.
        nowTime = int(time.time() * 1000)
        random1 = random.randint(1000,1998)
        random2 = random.randint(1000,9999)

        return "%d%d%d" %(nowTime,random1,random2)

    def _get_file_ID_mix_string(self, seed):
        # Shuffle the alphabet with a linear-congruential generator driven
        # by the server-provided seed; this reproduces Youku's obfuscation.
        source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
        for i in range(len(source)):
            seed = (seed * 211 + 30031 ) % 65536
            index = math.floor(seed / 65536 * len(source) )
            mixed.append(source[int(index)])
            source.remove(source[int(index)])
        #return ''.join(mixed)

    def _get_file_id(self, fileId, seed):
        # De-obfuscate the file id: each '*'-separated token is an index
        # into the seed-shuffled alphabet.
        mixed = self._get_file_ID_mix_string(seed)
        ids = fileId.split('*')
            realId.append(mixed[int(ch)])
        return ''.join(realId)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('ID')

        info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id

        jsondata = self._download_webpage(info_url, video_id)

        self.report_extraction(video_id)
            config = json.loads(jsondata)

            video_title = config['data'][0]['title']
            seed = config['data'][0]['seed']

            format = self._downloader.params.get('format', None)
            supported_format = list(config['data'][0]['streamfileids'].keys())

            # Map the requested quality onto Youku's format names.
            if format is None or format == 'best':
                if 'hd2' in supported_format:
            elif format == 'worst':

            fileid = config['data'][0]['streamfileids'][format]
            keys = [s['k'] for s in config['data'][0]['segs'][format]]
        except (UnicodeDecodeError, ValueError, KeyError):
            raise ExtractorError(u'Unable to extract info section')

        sid = self._gen_sid()
        fileid = self._get_file_id(fileid, seed)

        #column 8,9 of fileid represent the segment number
        #fileid[7:9] should be changed
        for index, key in enumerate(keys):
            # Each segment re-encodes its index into the file id.
            temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
            download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)

                'id': '%s_part%02d' % (video_id, index),
                'url': download_url,
                'upload_date': None,
                'title': video_title,

            files_info.append(info)
class XNXXIE(InfoExtractor):
    """Information extractor for xnxx.com"""

    _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
    # Regexes for the flv URL, page title and thumbnail embedded in the page.
    VIDEO_URL_RE = r'flv_url=(.*?)&'
    VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
    VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group(1)

        # Get webpage content
        webpage = self._download_webpage(url, video_id)

        # The flv URL is percent-encoded inside the page.
        video_url = self._search_regex(self.VIDEO_URL_RE,
            webpage, u'video URL')
        video_url = compat_urllib_parse.unquote(video_url)

        video_title = self._html_search_regex(self.VIDEO_TITLE_RE,

        video_thumbnail = self._search_regex(self.VIDEO_THUMB_RE,
            webpage, u'thumbnail', fatal=False)

            'upload_date': None,
            'title': video_title,
            'thumbnail': video_thumbnail,
            'description': None,
class GooglePlusIE(InfoExtractor):
    """Information extractor for plus.google.com."""

    _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
    IE_NAME = u'plus.google'

    def _real_extract(self, url):
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
            raise ExtractorError(u'Invalid URL: %s' % url)

        post_url = mobj.group(0)
        video_id = mobj.group(1)

        video_extension = 'flv'

        # Step 1, Retrieve post webpage to extract further information
        webpage = self._download_webpage(post_url, video_id, u'Downloading entry webpage')

        self.report_extraction(video_id)

        # Extract update date
        upload_date = self._html_search_regex('title="Timestamp">(.*?)</a>',
            webpage, u'upload date', fatal=False)
            # Convert timestring to a format suitable for filename
            upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
            upload_date = upload_date.strftime('%Y%m%d')

        uploader = self._html_search_regex(r'rel\="author".*?>(.*?)</a>',
            webpage, u'uploader', fatal=False)

        # Get the first line for title
        video_title = self._html_search_regex(r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]',
            webpage, 'title', default=u'NA')

        # Step 2, Stimulate clicking the image box to launch video
        video_page = self._search_regex('"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]',
            webpage, u'video page URL')
        webpage = self._download_webpage(video_page, video_id, u'Downloading video page')

        # Extract video links on video page
        """Extract video links of all sizes"""
        pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
        mobj = re.findall(pattern, webpage)
            raise ExtractorError(u'Unable to extract video links')

        # Sort in resolution
        links = sorted(mobj)

        # Choose the lowest of the sort, i.e. highest resolution
        video_url = links[-1]
        # Only get the url. The resolution part in the tuple has no use anymore
        video_url = video_url[-1]
        # Treat escaped \u0026 style hex
            video_url = video_url.decode("unicode_escape")
        except AttributeError: # Python 3
            video_url = bytes(video_url, 'ascii').decode('unicode-escape')

            'uploader': uploader,
            'upload_date': upload_date,
            'title': video_title,
            'ext': video_extension,
class NBAIE(InfoExtractor):
    """Information extractor for NBA.com video pages."""

    _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*?)(?:/index\.html)?(?:\?.*)?$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group(1)

        webpage = self._download_webpage(url, video_id)

        # The CDN URL can be derived directly from the page path.
        video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'

        shortened_video_id = video_id.rpartition('/')[2]
        title = self._html_search_regex(r'<meta property="og:title" content="(.*?)"',
            webpage, 'title', default=shortened_video_id).replace('NBA.com: ', '')

        # It isn't there in the HTML it returns to us
        # uploader_date = self._html_search_regex(r'<b>Date:</b> (.*?)</div>', webpage, 'upload_date', fatal=False)

        description = self._html_search_regex(r'<meta name="description" (?:content|value)="(.*?)" />', webpage, 'description', fatal=False)

            'id': shortened_video_id,
            # 'uploader_date': uploader_date,
            'description': description,
class JustinTVIE(InfoExtractor):
    """Information extractor for justin.tv and twitch.tv"""
    # TODO: One broadcast may be split into multiple videos. The key
    # 'broadcast_id' is the same for all parts, and 'broadcast_part'
    # starts at 1 and increases. Can we treat all parts as one video?

    _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
        (?P<channelid>[^/]+)|
        (?:(?:[^/]+)/b/(?P<videoid>[^/]+))|
        (?:(?:[^/]+)/c/(?P<chapterid>[^/]+))

    # Maximum number of entries returned per API page.
    _JUSTIN_PAGE_LIMIT = 100
    IE_NAME = u'justin.tv'

    def report_download_page(self, channel, offset):
        """Report attempt to download a single page of videos."""
        self.to_screen(u'%s: Downloading video information from %d to %d' %
                (channel, offset, offset + self._JUSTIN_PAGE_LIMIT))

    # Return count of items, list of *valid* items
    def _parse_page(self, url, video_id):
        webpage = self._download_webpage(url, video_id,
                                         u'Downloading video info JSON',
                                         u'unable to download video info JSON')

        response = json.loads(webpage)
        if type(response) != list:
            # An API error is returned as a dict instead of a list of clips.
            error_text = response.get('error', 'unknown error')
            raise ExtractorError(u'Justin.tv API: %s' % error_text)
        for clip in response:
            video_url = clip['video_file_url']
                video_extension = os.path.splitext(video_url)[1][1:]
                # start_time has the form YYYY-MM-DD...; strip the dashes.
                video_date = re.sub('-', '', clip['start_time'][:10])
                video_uploader_id = clip.get('user_id', clip.get('channel_id'))
                video_id = clip['id']
                video_title = clip.get('title', video_id)
                    'title': video_title,
                    'uploader': clip.get('channel_name', video_uploader_id),
                    'uploader_id': video_uploader_id,
                    'upload_date': video_date,
                    'ext': video_extension,
        return (len(response), info)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
            raise ExtractorError(u'invalid URL: %s' % url)

        api_base = 'http://api.justin.tv'

        if mobj.group('channelid'):
            video_id = mobj.group('channelid')
            api = api_base + '/channel/archives/%s.json' % video_id
        elif mobj.group('chapterid'):
            chapter_id = mobj.group('chapterid')

            webpage = self._download_webpage(url, chapter_id)
            m = re.search(r'PP\.archive_id = "([0-9]+)";', webpage)
                raise ExtractorError(u'Cannot find archive of a chapter')
            archive_id = m.group(1)

            api = api_base + '/broadcast/by_chapter/%s.xml' % chapter_id
            chapter_info_xml = self._download_webpage(api, chapter_id,
                                                      note=u'Downloading chapter information',
                                                      errnote=u'Chapter information download failed')
            doc = xml.etree.ElementTree.fromstring(chapter_info_xml)
            for a in doc.findall('.//archive'):
                if archive_id == a.find('./id').text:
                raise ExtractorError(u'Could not find chapter in chapter information')

            video_url = a.find('./video_file_url').text
            video_ext = video_url.rpartition('.')[2] or u'flv'

            # Newer kraken API provides the chapter's display metadata.
            chapter_api_url = u'https://api.twitch.tv/kraken/videos/c' + chapter_id
            chapter_info_json = self._download_webpage(chapter_api_url, u'c' + chapter_id,
                                   note='Downloading chapter metadata',
                                   errnote='Download of chapter metadata failed')
            chapter_info = json.loads(chapter_info_json)

            bracket_start = int(doc.find('.//bracket_start').text)
            bracket_end = int(doc.find('.//bracket_end').text)

            # TODO determine start (and probably fix up file)
            #  youtube-dl -v http://www.twitch.tv/firmbelief/c/1757457
            #video_url += u'?start=' + TODO:start_timestamp
            # bracket_start is 13290, but we want 51670615
            self._downloader.report_warning(u'Chapter detected, but we can just download the whole file. '
                                            u'Chapter starts at %s and ends at %s' % (formatSeconds(bracket_start), formatSeconds(bracket_end)))

                'id': u'c' + chapter_id,
                'title': chapter_info['title'],
                'thumbnail': chapter_info['preview'],
                'description': chapter_info['description'],
                'uploader': chapter_info['channel']['display_name'],
                'uploader_id': chapter_info['channel']['name'],
            video_id = mobj.group('videoid')
            api = api_base + '/broadcast/by_archive/%s.json' % video_id

        self.report_extraction(video_id)

        limit = self._JUSTIN_PAGE_LIMIT
            # Page through the API until a short (final) page is returned.
            self.report_download_page(video_id, offset)

            page_url = api + ('?offset=%d&limit=%d' % (offset, limit))

            page_count, page_info = self._parse_page(page_url, video_id)
            info.extend(page_info)

            if not paged or page_count != limit:
class FunnyOrDieIE(InfoExtractor):
    """Information extractor for funnyordie.com video pages."""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
            raise ExtractorError(u'invalid URL: %s' % url)

        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)

        # Media URL sits in the second <source> of the page's <video> tag.
        video_url = self._html_search_regex(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"',
            webpage, u'video URL', flags=re.DOTALL)

        # Prefer the player heading; fall back to the document title.
        title = self._html_search_regex((r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>",
            r'<title>(?P<title>[^<]+?)</title>'), webpage, 'title', flags=re.DOTALL)

        video_description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
            webpage, u'description', fatal=False, flags=re.DOTALL)

            'description': video_description,
class SteamIE(InfoExtractor):
    """Information extractor for store.steampowered.com game-trailer pages."""

    _VALID_URL = r"""http://store\.steampowered\.com/
                (?P<urltype>video|app)/ #If the page is only for videos or for a game
                (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
    _VIDEO_PAGE_TEMPLATE = 'http://store.steampowered.com/video/%s/'
    _AGECHECK_TEMPLATE = 'http://store.steampowered.com/agecheck/video/%s/?snr=1_agecheck_agecheck__age-gate&ageDay=1&ageMonth=January&ageYear=1970'

    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL uses verbose-mode whitespace, hence re.VERBOSE here.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url, re.VERBOSE)
        gameID = m.group('gameID')

        videourl = self._VIDEO_PAGE_TEMPLATE % gameID
        webpage = self._download_webpage(videourl, gameID)

        if re.search('<h2>Please enter your birth date to continue:</h2>', webpage) is not None:
            # Age-gated page: re-request through the age-check URL.
            videourl = self._AGECHECK_TEMPLATE % gameID
            self.report_age_confirmation()
            webpage = self._download_webpage(videourl, gameID)

        self.report_extraction(gameID)
        game_title = self._html_search_regex(r'<h2 class="pageheader">(.*?)</h2>',
                                             webpage, 'game title')

        # Movies, their names and their thumbnails appear in parallel lists
        # on the page; zip them together below.
        urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
        mweb = re.finditer(urlRE, webpage)
        namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
        titles = re.finditer(namesRE, webpage)
        thumbsRE = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'
        thumbs = re.finditer(thumbsRE, webpage)

        for vid,vtitle,thumb in zip(mweb,titles,thumbs):
            video_id = vid.group('videoID')
            title = vtitle.group('videoName')
            video_url = vid.group('videoURL')
            video_thumb = thumb.group('thumbnail')
                raise ExtractorError(u'Cannot find video url for %s' % video_id)
                'title': unescapeHTML(title),
                'thumbnail': video_thumb
        return [self.playlist_result(videos, gameID, game_title)]
class UstreamIE(InfoExtractor):
    """Information extractor for recorded ustream.tv videos."""

    _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
    IE_NAME = u'ustream'

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url)
        video_id = m.group('videoID')

        # The media file can be addressed directly on the CDN by id.
        video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
        webpage = self._download_webpage(url, video_id)

        self.report_extraction(video_id)

        video_title = self._html_search_regex(r'data-title="(?P<title>.+)"',

        uploader = self._html_search_regex(r'data-content-type="channel".*?>(?P<uploader>.*?)</a>',
            webpage, u'uploader', fatal=False, flags=re.DOTALL)

        thumbnail = self._html_search_regex(r'<link rel="image_src" href="(?P<thumb>.*?)"',
            webpage, u'thumbnail', fatal=False)

            'title': video_title,
            'uploader': uploader,
            'thumbnail': thumbnail,
class WorldStarHipHopIE(InfoExtractor):
    """Information extractor for worldstarhiphop.com / worldstarcandy.com."""

    _VALID_URL = r'https?://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
    IE_NAME = u'WorldStarHipHop'

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url)
        video_id = m.group('id')

        webpage_src = self._download_webpage(url, video_id)

        # The player config passes the file URL via so.addVariable().
        video_url = self._search_regex(r'so\.addVariable\("file","(.*?)"\)',
            webpage_src, u'video URL')

        if 'mp4' in video_url:

        video_title = self._html_search_regex(r"<title>(.*)</title>",
            webpage_src, u'title')

        # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video.
        thumbnail = self._html_search_regex(r'rel="image_src" href="(.*)" />',
            webpage_src, u'thumbnail', fatal=False)
            _title = r"""candytitles.*>(.*)</span>"""
            mobj = re.search(_title, webpage_src)
            if mobj is not None:
                video_title = mobj.group(1)

            'title' : video_title,
            'thumbnail' : thumbnail,
class RBMARadioIE(InfoExtractor):
    """Information extractor for rbmaradio.com show pages."""

    _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url)
        video_id = m.group('videoID')

        webpage = self._download_webpage(url, video_id)

        # Show metadata is embedded as a JSON assignment on window.gon.
        json_data = self._search_regex(r'window\.gon.*?gon\.show=(.+?);$',
            webpage, u'json data', flags=re.MULTILINE)

            data = json.loads(json_data)
        except ValueError as e:
            raise ExtractorError(u'Invalid JSON: ' + str(e))

        # '&cbr=256' requests the 256 kbit/s stream from the CDN.
        video_url = data['akamai_url'] + '&cbr=256'
        url_parts = compat_urllib_parse_urlparse(video_url)
        video_ext = url_parts.path.rpartition('.')[2]
            'title': data['title'],
            'description': data.get('teaser_text'),
            'location': data.get('country_of_origin'),
            'uploader': data.get('host', {}).get('name'),
            'uploader_id': data.get('host', {}).get('slug'),
            'thumbnail': data.get('image', {}).get('large_url_2x'),
            'duration': data.get('duration'),
class YouPornIE(InfoExtractor):
    """Information extractor for youporn.com."""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'

    def _print_formats(self, formats):
        """Print all available formats"""
        print(u'Available formats:')
        print(u'ext\t\tformat')
        print(u'---------------------------------')
        for format in formats:
            print(u'%s\t\t%s' % (format['ext'], format['format']))

    def _specific(self, req_format, formats):
        # Return the single format entry matching the requested format name.
            if(x["format"]==req_format):

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('videoid')

        # The site requires an age_verified cookie to serve the page.
        req = compat_urllib_request.Request(url)
        req.add_header('Cookie', 'age_verified=1')
        webpage = self._download_webpage(req, video_id)

        # Get JSON parameters
        json_params = self._search_regex(r'var currentVideo = new Video\((.*)\);', webpage, u'JSON parameters')
            params = json.loads(json_params)
            raise ExtractorError(u'Invalid JSON')

        self.report_extraction(video_id)
            video_title = params['title']
            upload_date = unified_strdate(params['release_date_f'])
            video_description = params['description']
            video_uploader = params['submitted_by']
            thumbnail = params['thumbnails'][0]['image']
            raise ExtractorError('Missing JSON parameter: ' + sys.exc_info()[1])

        # Get all of the formats available
        DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
        download_list_html = self._search_regex(DOWNLOAD_LIST_RE,
            webpage, u'download list').strip()

        # Get all of the links from the page
        LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
        links = re.findall(LINK_RE, download_list_html)
        if(len(links) == 0):
            raise ExtractorError(u'ERROR: no known formats available for video')

        self.to_screen(u'Links found: %d' % len(links))

            # A link looks like this:
            # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
            # A path looks like this:
            # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
            video_url = unescapeHTML( link )
            path = compat_urllib_parse_urlparse( video_url ).path
            extension = os.path.splitext( path )[1][1:]
            # Path segment 4 encodes "<size>_<bitrate>"; join them as the
            # human-readable format label.
            format = path.split('/')[4].split('_')[:2]
            format = "-".join( format )
            # title = u'%s-%s-%s' % (video_title, size, bitrate)

                'uploader': video_uploader,
                'upload_date': upload_date,
                'title': video_title,
                'thumbnail': thumbnail,
                'description': video_description

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)

        req_format = self._downloader.params.get('format', None)
        self.to_screen(u'Format: %s' % req_format)

        if req_format is None or req_format == 'best':
        elif req_format == 'worst':
            return [formats[-1]]
        elif req_format in ('-1', 'all'):
            format = self._specific( req_format, formats )
                raise ExtractorError(u'Requested format not available')
class PornotubeIE(InfoExtractor):
    """Information extractor for pornotube.com."""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('videoid')
        # The title comes straight from the URL path.
        video_title = mobj.group('title')

        # Get webpage content
        webpage = self._download_webpage(url, video_id)

        # The flv URL appears in the player's JavaScript config.
        VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
        video_url = self._search_regex(VIDEO_URL_RE, webpage, u'video url')
        video_url = compat_urllib_parse.unquote(video_url)

        #Get the uploaded date
        VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
        upload_date = self._html_search_regex(VIDEO_UPLOADED_RE, webpage, u'upload date', fatal=False)
        if upload_date: upload_date = unified_strdate(upload_date)

        info = {'id': video_id,
                'upload_date': upload_date,
                'title': video_title,
class YouJizzIE(InfoExtractor):
    """Information extractor for youjizz.com."""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('videoid')

        # Get webpage content
        webpage = self._download_webpage(url, video_id)

        # Get the video title
        video_title = self._html_search_regex(r'<title>(?P<title>.*)</title>',
            webpage, u'title').strip()

        # Get the embed page
        result = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
            raise ExtractorError(u'ERROR: unable to extract embed page')

        embed_page_url = result.group(0).strip()
        video_id = result.group('videoid')

        # The real media URL lives on the embed page, not the watch page.
        webpage = self._download_webpage(embed_page_url, video_id)

        video_url = self._search_regex(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);',
            webpage, u'video URL')

        info = {'id': video_id,
                'title': video_title,
                'player_url': embed_page_url}
class EightTracksIE(InfoExtractor):
    """Information extractor for 8tracks.com mixes (playlists of songs)."""

    _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
            raise ExtractorError(u'Invalid URL: %s' % url)
        playlist_id = mobj.group('id')

        webpage = self._download_webpage(url, playlist_id)

        # Mix metadata is embedded as a JS assignment to PAGE.mix.
        json_like = self._search_regex(r"PAGE.mix = (.*?);\n", webpage, u'trax information', flags=re.DOTALL)
        data = json.loads(json_like)

        # A random session id is required by the play API.
        session = str(random.randint(0, 1000000000))
        track_count = data['tracks_count']
        first_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
        next_url = first_url

        # Walk the mix one track at a time until the API signals the end.
        for i in itertools.count():
            api_json = self._download_webpage(next_url, playlist_id,
                note=u'Downloading song information %s/%s' % (str(i+1), track_count),
                errnote=u'Failed to download song information')
            api_data = json.loads(api_json)
            track_data = api_data[u'set']['track']
                'id': track_data['id'],
                'url': track_data['track_file_stream_url'],
                'title': track_data['performer'] + u' - ' + track_data['name'],
                'raw_title': track_data['name'],
                'uploader_id': data['user']['login'],
            if api_data['set']['at_last_track']:
            next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
class KeekIE(InfoExtractor):
    """Information extractor for keek.com short videos."""

    _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url)
        video_id = m.group('videoID')

        # Media and thumbnail URLs can be built directly from the id.
        video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
        thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id
        webpage = self._download_webpage(url, video_id)

        video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',

        uploader = self._html_search_regex(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>',
            webpage, u'uploader', fatal=False)

            'title': video_title,
            'thumbnail': thumbnail,
            'uploader': uploader
class TEDIE(InfoExtractor):
    """Information extractor for www.ted.com talks and playlists."""

    _VALID_URL=r'''http://www\.ted\.com/
                   ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
                   ((?P<type_talk>talks)) # We have a simple talk
                   (/lang/(.*?))? # The url may contain the language
                   /(?P<name>\w+) # Here goes the name and then ".html"

    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is a verbose-mode regex, hence re.VERBOSE here.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        m=re.match(self._VALID_URL, url, re.VERBOSE)
        if m.group('type_talk'):
            return [self._talk_info(url)]
            # Otherwise it's a playlist URL.
            playlist_id=m.group('playlist_id')
            name=m.group('name')
            self.to_screen(u'Getting info of playlist %s: "%s"' % (playlist_id,name))
            return [self._playlist_videos_info(url,name,playlist_id)]

    def _playlist_videos_info(self,url,name,playlist_id=0):
        '''Returns the videos of the playlist'''
        # Parallel regexes: one matches the talk <li> entries, the other
        # the talk titles/URLs; they are zipped together below.
             <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
             ([.\s]*?)data-playlist_item_id="(\d+)"
             ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
        video_name_RE=r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
        webpage=self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
        m_videos=re.finditer(video_RE,webpage,re.VERBOSE)
        m_names=re.finditer(video_name_RE,webpage)

        playlist_title = self._html_search_regex(r'div class="headline">\s*?<h1>\s*?<span>(.*?)</span>',
                                                 webpage, 'playlist title')

        playlist_entries = []
        for m_video, m_name in zip(m_videos,m_names):
            video_id=m_video.group('video_id')
            talk_url='http://www.ted.com%s' % m_name.group('talk_url')
            playlist_entries.append(self.url_result(talk_url, 'TED'))
        return self.playlist_result(playlist_entries, playlist_id = playlist_id, playlist_title = playlist_title)

    def _talk_info(self, url, video_id=0):
        """Return the video for the talk in the url"""
        m = re.match(self._VALID_URL, url,re.VERBOSE)
        video_name = m.group('name')
        webpage = self._download_webpage(url, video_id, 'Downloading \"%s\" page' % video_name)
        self.report_extraction(video_name)
        # If the url includes the language we get the title translated
        title = self._html_search_regex(r'<span id="altHeadline" >(?P<title>.*)</span>',
        json_data = self._search_regex(r'<script.*?>var talkDetails = ({.*?})</script>',
                                       webpage, 'json data')
        info = json.loads(json_data)
        desc = self._html_search_regex(r'<div class="talk-intro">.*?<p.*?>(.*?)</p>',
                                       webpage, 'description', flags = re.DOTALL)

        thumbnail = self._search_regex(r'</span>[\s.]*</div>[\s.]*<img src="(.*?)"',
                                       webpage, 'thumbnail')
            # Last htmlStream entry carries the best-quality file.
            'url': info['htmlStreams'][-1]['file'],
            'thumbnail': thumbnail,
            'description': desc,
2187 class MySpassIE(InfoExtractor):
# Extractor for myspass.de. The video id is taken from the URL path and
# all metadata comes from a site-provided XML endpoint.
# NOTE(review): listing is line-sampled; some lines (e.g. the branch that
# falls back to the parent path element, and the start of the returned
# info dict) are missing here.
2188 _VALID_URL = r'http://www.myspass.de/.*'
2190 def _real_extract(self, url):
2191 META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'
2193 # video id is the last path element of the URL
2194 # usually there is a trailing slash, so also try the second but last
2195 url_path = compat_urllib_parse_urlparse(url).path
2196 url_parent_path, video_id = os.path.split(url_path)
# Fallback (condition line missing from this listing): with a trailing
# slash the last element is empty, so use the parent path's last element.
2198 _, video_id = os.path.split(url_parent_path)
2201 metadata_url = META_DATA_URL_TEMPLATE % video_id
2202 metadata_text = self._download_webpage(metadata_url, video_id)
2203 metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))
2205 # extract values from metadata
# url_flv, title and format_id are mandatory; their absence aborts
# extraction with ExtractorError.
2206 url_flv_el = metadata.find('url_flv')
2207 if url_flv_el is None:
2208 raise ExtractorError(u'Unable to extract download url')
2209 video_url = url_flv_el.text
# File extension is derived from the media URL itself.
2210 extension = os.path.splitext(video_url)[1][1:]
2211 title_el = metadata.find('title')
2212 if title_el is None:
2213 raise ExtractorError(u'Unable to extract title')
2214 title = title_el.text
2215 format_id_el = metadata.find('format_id')
2216 if format_id_el is None:
2219 format = format_id_el.text
# description and imagePreview are optional; only set when present.
2220 description_el = metadata.find('description')
2221 if description_el is not None:
2222 description = description_el.text
2225 imagePreview_el = metadata.find('imagePreview')
2226 if imagePreview_el is not None:
2227 thumbnail = imagePreview_el.text
2236 'thumbnail': thumbnail,
2237 'description': description
2241 class SpiegelIE(InfoExtractor):
# Extractor for spiegel.de videos: the page gives the title, and a
# per-video XML document on video2.spiegel.de lists the available
# formats; the last format entry in the XML is used.
# NOTE(review): listing is line-sampled; the returned info dict is only
# partially visible here.
2242 _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'
2244 def _real_extract(self, url):
2245 m = re.match(self._VALID_URL, url)
2246 video_id = m.group('videoID')
2248 webpage = self._download_webpage(url, video_id)
2250 video_title = self._html_search_regex(r'<div class="module-title">(.*?)</div>',
2253 xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
2254 xml_code = self._download_webpage(xml_url, video_id,
2255 note=u'Downloading XML', errnote=u'Failed to download XML')
2257 idoc = xml.etree.ElementTree.fromstring(xml_code)
# idoc[-1]: the last <type> element — presumably the highest quality
# variant; TODO confirm against the XML's ordering.
2258 last_type = idoc[-1]
2259 filename = last_type.findall('./filename')[0].text
2260 duration = float(last_type.findall('./duration')[0].text)
2262 video_url = 'http://video2.spiegel.de/flash/' + filename
# Extension is whatever follows the final '.' of the filename.
2263 video_ext = filename.rpartition('.')[2]
2268 'title': video_title,
2269 'duration': duration,
2273 class LiveLeakIE(InfoExtractor):
# Extractor for liveleak.com: all fields are scraped from the watch page
# (media URL from an inline player config, the rest from og: meta tags).
# NOTE(review): listing is line-sampled; the `if mobj is None:` guard
# before the raise and the start of the returned info dict are missing.
2275 _VALID_URL = r'^(?:http?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
2276 IE_NAME = u'liveleak'
2278 def _real_extract(self, url):
2279 mobj = re.match(self._VALID_URL, url)
2281 raise ExtractorError(u'Invalid URL: %s' % url)
2283 video_id = mobj.group('video_id')
2285 webpage = self._download_webpage(url, video_id)
# Direct media URL comes from the JS player configuration.
2287 video_url = self._search_regex(r'file: "(.*?)",',
2288 webpage, u'video URL')
# og:title carries a 'LiveLeak.com -' prefix that is stripped off.
2290 video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
2291 webpage, u'title').replace('LiveLeak.com -', '').strip()
2293 video_description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
2294 webpage, u'description', fatal=False)
# Uploader is best-effort (fatal=False): may be None if the pattern
# does not match.
2296 video_uploader = self._html_search_regex(r'By:.*?(\w+)</a>',
2297 webpage, u'uploader', fatal=False)
2303 'title': video_title,
2304 'description': video_description,
2305 'uploader': video_uploader
2312 class TumblrIE(InfoExtractor):
# Extractor for Tumblr video posts. The URL is normalized to the
# canonical post URL before downloading; the media URL is embedded in
# the page with \x22-escaped quotes, hence the unusual regexes.
# NOTE(review): listing is line-sampled; the `if video is None:` guard
# before the raise and the tail of the returned dict are missing.
2313 _VALID_URL = r'http://(?P<blog_name>.*?)\.tumblr\.com/((post)|(video))/(?P<id>\d*)/(.*?)'
2315 def _real_extract(self, url):
2316 m_url = re.match(self._VALID_URL, url)
2317 video_id = m_url.group('id')
2318 blog = m_url.group('blog_name')
# Rebuild the canonical post URL from blog name + post id.
2320 url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id)
2321 webpage = self._download_webpage(url, video_id)
# \x22 is an escaped double quote inside the embedded player markup.
2323 re_video = r'src=\\x22(?P<video_url>http://%s\.tumblr\.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P<ext>.*?)\\x22' % (blog, video_id)
2324 video = re.search(re_video, webpage)
2326 raise ExtractorError(u'Unable to extract video')
2327 video_url = video.group('video_url')
2328 ext = video.group('ext')
2330 video_thumbnail = self._search_regex(r'posters(.*?)\[\\x22(?P<thumb>.*?)\\x22',
2331 webpage, u'thumbnail', fatal=False) # We pick the first poster
# Thumbnail URL is backslash-escaped in the page source; unescape it.
2332 if video_thumbnail: video_thumbnail = video_thumbnail.replace('\\', '')
2334 # The only place where you can get a title, it's not complete,
2335 # but searching in other places doesn't work for all videos
2336 video_title = self._html_search_regex(r'<title>(?P<title>.*?)</title>',
2337 webpage, u'title', flags=re.DOTALL)
2339 return [{'id': video_id,
2341 'title': video_title,
2342 'thumbnail': video_thumbnail,
2346 class BandcampIE(InfoExtractor):
# Extractor for free Bandcamp tracks. Flow: track page -> free-download
# page -> statdownload request that yields the final (retry) URL for the
# mp3-320 rendition. Tracks that are not free raise ExtractorError.
# NOTE(review): listing is line-sampled; parts of the final track_info
# dict are missing here.
2347 _VALID_URL = r'http://.*?\.bandcamp\.com/track/(?P<title>.*)'
2349 def _real_extract(self, url):
2350 mobj = re.match(self._VALID_URL, url)
2351 title = mobj.group('title')
2352 webpage = self._download_webpage(url, title)
2353 # We get the link to the free download page
2354 m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage)
2355 if m_download is None:
2356 raise ExtractorError(u'No free songs found')
2358 download_link = m_download.group(1)
# Numeric track id scraped from the embedded TralbumData JS object.
# (Shadows the builtin `id`; kept as-is to stay byte-identical.)
2359 id = re.search(r'var TralbumData = {(.*?)id: (?P<id>\d*?)$',
2360 webpage, re.MULTILINE|re.DOTALL).group('id')
2362 download_webpage = self._download_webpage(download_link, id,
2363 'Downloading free downloads page')
2364 # We get the dictionary of the track from some javascript code
2365 info = re.search(r'items: (.*?),$',
2366 download_webpage, re.MULTILINE).group(1)
2367 info = json.loads(info)[0]
2368 # We pick mp3-320 for now, until format selection can be easily implemented.
2369 mp3_info = info[u'downloads'][u'mp3-320']
2370 # If we try to use this url it says the link has expired
2371 initial_url = mp3_info[u'url']
2372 re_url = r'(?P<server>http://(.*?)\.bandcamp\.com)/download/track\?enc=mp3-320&fsig=(?P<fsig>.*?)&id=(?P<id>.*?)&ts=(?P<ts>.*)$'
2373 m_url = re.match(re_url, initial_url)
2374 #We build the url we will use to get the final track url
2375 # This url is built the same way Bandcamp's script download_bunde_*.js does
2376 request_url = '%s/statdownload/track?enc=mp3-320&fsig=%s&id=%s&ts=%s&.rand=665028774616&.vrs=1' % (m_url.group('server'), m_url.group('fsig'), id, m_url.group('ts'))
2377 final_url_webpage = self._download_webpage(request_url, id, 'Requesting download url')
2378 # If we could correctly generate the .rand field the url would be
2379 #in the "download_url" key
# With a hard-coded .rand the server answers with "retry_url" instead;
# that URL is usable as the final download link.
2380 final_url = re.search(r'"retry_url":"(.*?)"', final_url_webpage).group(1)
2382 track_info = {'id':id,
2383 'title' : info[u'title'],
2386 'thumbnail' : info[u'thumb_url'],
2387 'uploader' : info[u'artist']
2392 class RedTubeIE(InfoExtractor):
"""Information Extractor for redtube"""
# Scrapes the mp4 source URL and the page title from the watch page.
# NOTE(review): listing is line-sampled; the `if mobj is None:` guard
# and parts of the returned info dict are missing here.
2394 _VALID_URL = r'(?:http://)?(?:www\.)?redtube\.com/(?P<id>[0-9]+)'
2396 def _real_extract(self,url):
2397 mobj = re.match(self._VALID_URL, url)
2399 raise ExtractorError(u'Invalid URL: %s' % url)
2401 video_id = mobj.group('id')
# Extension is hard-coded: the site serves mp4 via a <source> tag.
2402 video_extension = 'mp4'
2403 webpage = self._download_webpage(url, video_id)
2405 self.report_extraction(video_id)
2407 video_url = self._html_search_regex(r'<source src="(.+?)" type="video/mp4">',
2408 webpage, u'video URL')
2410 video_title = self._html_search_regex('<h1 class="videoTitle slidePanelMovable">(.+?)</h1>',
2416 'ext': video_extension,
2417 'title': video_title,
2420 class InaIE(InfoExtractor):
"""Information Extractor for Ina.fr"""
# Metadata comes from the player's MRSS feed rather than the HTML page:
# the mp4 URL from <media:player> and the title from the CDATA <title>.
# NOTE(review): listing is line-sampled; parts of the returned info dict
# are missing here.
2422 _VALID_URL = r'(?:http://)?(?:www\.)?ina\.fr/video/(?P<id>I[0-9]+)/.*'
2424 def _real_extract(self,url):
2425 mobj = re.match(self._VALID_URL, url)
2427 video_id = mobj.group('id')
2428 mrss_url='http://player.ina.fr/notices/%s.mrss' % video_id
2429 video_extension = 'mp4'
2430 webpage = self._download_webpage(mrss_url, video_id)
2432 self.report_extraction(video_id)
2434 video_url = self._html_search_regex(r'<media:player url="(?P<mp4url>http://mp4.ina.fr/[^"]+\.mp4)',
2435 webpage, u'video URL')
2437 video_title = self._search_regex(r'<title><!\[CDATA\[(?P<titre>.*?)]]></title>',
2443 'ext': video_extension,
2444 'title': video_title,
2447 class HowcastIE(InfoExtractor):
"""Information Extractor for Howcast.com"""
# The URL is normalized to the canonical watch page; the mp4 URL comes
# from the embedded player config and title/description/thumbnail from
# meta tags. description and thumbnail are best-effort (fatal=False).
# NOTE(review): listing is line-sampled; parts of the returned info dict
# are missing here.
2449 _VALID_URL = r'(?:https?://)?(?:www\.)?howcast\.com/videos/(?P<id>\d+)'
2451 def _real_extract(self, url):
2452 mobj = re.match(self._VALID_URL, url)
2454 video_id = mobj.group('id')
2455 webpage_url = 'http://www.howcast.com/videos/' + video_id
2456 webpage = self._download_webpage(webpage_url, video_id)
2458 self.report_extraction(video_id)
# The file value may appear quoted as 'file' or file in the player JS.
2460 video_url = self._search_regex(r'\'?file\'?: "(http://mobile-media\.howcast\.com/[0-9]+\.mp4)',
2461 webpage, u'video URL')
# Meta content attribute may use either double or single quotes.
2463 video_title = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') property=\'og:title\'',
2466 video_description = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') name=\'description\'',
2467 webpage, u'description', fatal=False)
2469 thumbnail = self._html_search_regex(r'<meta content=\'(.+?)\' property=\'og:image\'',
2470 webpage, u'thumbnail', fatal=False)
2476 'title': video_title,
2477 'description': video_description,
2478 'thumbnail': thumbnail,
2481 class VineIE(InfoExtractor):
"""Information Extractor for Vine.co"""
# Everything is scraped from meta tags on the canonical https page:
# stream URL from twitter:player:stream, title/thumbnail from og: tags,
# uploader from the user block (best-effort).
# NOTE(review): listing is line-sampled; parts of the returned info dict
# are missing here.
2483 _VALID_URL = r'(?:https?://)?(?:www\.)?vine\.co/v/(?P<id>\w+)'
2485 def _real_extract(self, url):
2486 mobj = re.match(self._VALID_URL, url)
2488 video_id = mobj.group('id')
2489 webpage_url = 'https://vine.co/v/' + video_id
2490 webpage = self._download_webpage(webpage_url, video_id)
2492 self.report_extraction(video_id)
2494 video_url = self._html_search_regex(r'<meta property="twitter:player:stream" content="(.+?)"',
2495 webpage, u'video URL')
2497 video_title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',
# The optional query string after the image URL is stripped by the regex.
2500 thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)(\?.*?)?"',
2501 webpage, u'thumbnail', fatal=False)
2503 uploader = self._html_search_regex(r'<div class="user">.*?<h2>(.+?)</h2>',
2504 webpage, u'uploader', fatal=False, flags=re.DOTALL)
2510 'title': video_title,
2511 'thumbnail': thumbnail,
2512 'uploader': uploader,
2515 class FlickrIE(InfoExtractor):
"""Information Extractor for Flickr videos"""
# Three-step flow: (1) scrape the photo secret from the page, (2) fetch
# a first XML to obtain the node id, (3) fetch the playlist XML whose
# <STREAM> element gives APP + FULLPATH, concatenated into the media URL.
# NOTE(review): listing is line-sampled; the `if mobj is None:` guard
# and parts of the returned info dict are missing here.
2517 _VALID_URL = r'(?:https?://)?(?:www\.)?flickr\.com/photos/(?P<uploader_id>[\w\-_@]+)/(?P<id>\d+).*'
2519 def _real_extract(self, url):
2520 mobj = re.match(self._VALID_URL, url)
2522 video_id = mobj.group('id')
2523 video_uploader_id = mobj.group('uploader_id')
2524 webpage_url = 'http://www.flickr.com/photos/' + video_uploader_id + '/' + video_id
2525 webpage = self._download_webpage(webpage_url, video_id)
2527 secret = self._search_regex(r"photo_secret: '(\w+)'", webpage, u'secret')
2529 first_url = 'https://secure.flickr.com/apps/video/video_mtl_xml.gne?v=x&photo_id=' + video_id + '&secret=' + secret + '&bitrate=700&target=_self'
2530 first_xml = self._download_webpage(first_url, video_id, 'Downloading first data webpage')
2532 node_id = self._html_search_regex(r'<Item id="id">(\d+-\d+)</Item>',
2533 first_xml, u'node_id')
2535 second_url = 'https://secure.flickr.com/video_playlist.gne?node_id=' + node_id + '&tech=flash&mode=playlist&bitrate=700&secret=' + secret + '&rd=video.yahoo.com&noad=1'
2536 second_xml = self._download_webpage(second_url, video_id, 'Downloading second data webpage')
2538 self.report_extraction(video_id)
2540 mobj = re.search(r'<STREAM APP="(.+?)" FULLPATH="(.+?)"', second_xml)
2542 raise ExtractorError(u'Unable to extract video url')
# FULLPATH is HTML-escaped in the XML, hence the unescapeHTML call.
2543 video_url = mobj.group(1) + unescapeHTML(mobj.group(2))
2545 video_title = self._html_search_regex(r'<meta property="og:title" content=(?:"([^"]+)"|\'([^\']+)\')',
2546 webpage, u'video title')
2548 video_description = self._html_search_regex(r'<meta property="og:description" content=(?:"([^"]+)"|\'([^\']+)\')',
2549 webpage, u'description', fatal=False)
2551 thumbnail = self._html_search_regex(r'<meta property="og:image" content=(?:"([^"]+)"|\'([^\']+)\')',
2552 webpage, u'thumbnail', fatal=False)
2558 'title': video_title,
2559 'description': video_description,
2560 'thumbnail': thumbnail,
2561 'uploader_id': video_uploader_id,
2564 class TeamcocoIE(InfoExtractor):
# Extractor for teamcoco.com. The URL carries only a slug; the numeric
# video id is scraped from the page, then a per-id XML document on the
# cvp endpoint provides the actual media URL ("high" quality file).
# NOTE(review): listing is line-sampled; the `if mobj is None:` guard
# and parts of the returned info dict are missing here.
2565 _VALID_URL = r'http://teamcoco\.com/video/(?P<url_title>.*)'
2567 def _real_extract(self, url):
2568 mobj = re.match(self._VALID_URL, url)
2570 raise ExtractorError(u'Invalid URL: %s' % url)
2571 url_title = mobj.group('url_title')
2572 webpage = self._download_webpage(url, url_title)
2574 video_id = self._html_search_regex(r'<article class="video" data-id="(\d+?)"',
2575 webpage, u'video id')
2577 self.report_extraction(video_id)
2579 video_title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',
2582 thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)"',
2583 webpage, u'thumbnail', fatal=False)
2585 video_description = self._html_search_regex(r'<meta property="og:description" content="(.*?)"',
2586 webpage, u'description', fatal=False)
2588 data_url = 'http://teamcoco.com/cvp/2.0/%s.xml' % video_id
2589 data = self._download_webpage(data_url, video_id, 'Downloading data webpage')
2591 video_url = self._html_search_regex(r'<file type="high".*?>(.*?)</file>',
2598 'title': video_title,
2599 'thumbnail': thumbnail,
2600 'description': video_description,
2603 class XHamsterIE(InfoExtractor):
"""Information Extractor for xHamster"""
# Media URL is scraped from the player config ('srv' + 'file'); when the
# server field is empty the file field is already a complete
# URL-encoded URL. Upload date and uploader are best-effort.
# NOTE(review): listing is line-sampled; several guards (`if mobj is
# None:`) and the else/if branch keywords are missing here.
2605 _VALID_URL = r'(?:http://)?(?:www.)?xhamster\.com/movies/(?P<id>[0-9]+)/.*\.html'
2607 def _real_extract(self,url):
2608 mobj = re.match(self._VALID_URL, url)
2610 video_id = mobj.group('id')
2611 mrss_url = 'http://xhamster.com/movies/%s/.html' % video_id
2612 webpage = self._download_webpage(mrss_url, video_id)
2614 mobj = re.search(r'\'srv\': \'(?P<server>[^\']*)\',\s*\'file\': \'(?P<file>[^\']+)\',', webpage)
2616 raise ExtractorError(u'Unable to extract media URL')
2617 if len(mobj.group('server')) == 0:
# Empty server: 'file' is the full, percent-encoded media URL.
2618 video_url = compat_urllib_parse.unquote(mobj.group('file'))
# Otherwise build the URL as server + '/key=' + file token.
2620 video_url = mobj.group('server')+'/key='+mobj.group('file')
2621 video_extension = video_url.split('.')[-1]
2623 video_title = self._html_search_regex(r'<title>(?P<title>.+?) - xHamster\.com</title>',
2626 # Can't see the description anywhere in the UI
2627 # video_description = self._html_search_regex(r'<span>Description: </span>(?P<description>[^<]+)',
2628 # webpage, u'description', fatal=False)
2629 # if video_description: video_description = unescapeHTML(video_description)
# Upload date is parsed from a tooltip timestamp and flattened to
# YYYYMMDD; missing date only produces a warning, not an error.
2631 mobj = re.search(r'hint=\'(?P<upload_date_Y>[0-9]{4})-(?P<upload_date_m>[0-9]{2})-(?P<upload_date_d>[0-9]{2}) [0-9]{2}:[0-9]{2}:[0-9]{2} [A-Z]{3,4}\'', webpage)
2633 video_upload_date = mobj.group('upload_date_Y')+mobj.group('upload_date_m')+mobj.group('upload_date_d')
2635 video_upload_date = None
2636 self._downloader.report_warning(u'Unable to extract upload date')
2638 video_uploader_id = self._html_search_regex(r'<a href=\'/user/[^>]+>(?P<uploader_id>[^<]+)',
2639 webpage, u'uploader id', default=u'anonymous')
2641 video_thumbnail = self._search_regex(r'\'image\':\'(?P<thumbnail>[^\']+)\'',
2642 webpage, u'thumbnail', fatal=False)
2647 'ext': video_extension,
2648 'title': video_title,
2649 # 'description': video_description,
2650 'upload_date': video_upload_date,
2651 'uploader_id': video_uploader_id,
2652 'thumbnail': video_thumbnail
2655 class HypemIE(InfoExtractor):
"""Information Extractor for hypem"""
# Hype Machine flow: request the track page with ax/ts query params and
# keep the Set-Cookie header; parse the embedded displayList-data JSON
# for the first track; then call the /serve/source endpoint with that
# cookie to obtain the final audio URL.
# NOTE(review): listing is line-sampled; the try/except blocks around
# the json.loads calls, the assignment of `key`, and the returned info
# dict are missing here.
2657 _VALID_URL = r'(?:http://)?(?:www\.)?hypem\.com/track/([^/]+)/([^/]+)'
2659 def _real_extract(self, url):
2660 mobj = re.match(self._VALID_URL, url)
2662 raise ExtractorError(u'Invalid URL: %s' % url)
2663 track_id = mobj.group(1)
2665 data = { 'ax': 1, 'ts': time.time() }
2666 data_encoded = compat_urllib_parse.urlencode(data)
2667 complete_url = url + "?" + data_encoded
2668 request = compat_urllib_request.Request(complete_url)
2669 response, urlh = self._download_webpage_handle(request, track_id, u'Downloading webpage with the url')
# The session cookie is required by the /serve/source request below.
2670 cookie = urlh.headers.get('Set-Cookie', '')
2672 self.report_extraction(track_id)
2674 html_tracks = self._html_search_regex(r'<script type="application/json" id="displayList-data">(.*?)</script>',
2675 response, u'tracks', flags=re.MULTILINE|re.DOTALL).strip()
2677 track_list = json.loads(html_tracks)
# Only the first track of the display list is extracted.
2678 track = track_list[u'tracks'][0]
2680 raise ExtractorError(u'Hypemachine contained invalid JSON.')
2683 track_id = track[u"id"]
2684 artist = track[u"artist"]
2685 title = track[u"song"]
# `key` presumably comes from the track dict on a line missing from
# this listing — TODO confirm.
2687 serve_url = "http://hypem.com/serve/source/%s/%s" % (compat_str(track_id), compat_str(key))
2688 request = compat_urllib_request.Request(serve_url, "" , {'Content-Type': 'application/json'})
2689 request.add_header('cookie', cookie)
2690 song_data_json = self._download_webpage(request, track_id, u'Downloading metadata')
2692 song_data = json.loads(song_data_json)
2694 raise ExtractorError(u'Hypemachine contained invalid JSON.')
2695 final_url = song_data[u"url"]
2705 class Vbox7IE(InfoExtractor):
"""Information Extractor for Vbox7"""
# The play page issues a JS redirect (window.location); the real page is
# fetched from that location, and the media/thumbnail URLs come from a
# POST to the magare.do info endpoint, whose response is a small
# key=value&key=value string.
# NOTE(review): listing is line-sampled; the `if mobj is None:` guard
# and parts of the returned info dict are missing here.
2707 _VALID_URL = r'(?:http://)?(?:www\.)?vbox7\.com/play:([^/]+)'
2709 def _real_extract(self,url):
2710 mobj = re.match(self._VALID_URL, url)
2712 raise ExtractorError(u'Invalid URL: %s' % url)
2713 video_id = mobj.group(1)
2715 redirect_page, urlh = self._download_webpage_handle(url, video_id)
2716 new_location = self._search_regex(r'window\.location = \'(.*)\';', redirect_page, u'redirect location')
# The redirect target is relative; resolve it against the final URL of
# the first request.
2717 redirect_url = urlh.geturl() + new_location
2718 webpage = self._download_webpage(redirect_url, video_id, u'Downloading redirect page')
# Site titles look like "Title / extra"; keep only the first segment.
2720 title = self._html_search_regex(r'<title>(.*)</title>',
2721 webpage, u'title').split('/')[0].strip()
2724 info_url = "http://vbox7.com/play/magare.do"
2725 data = compat_urllib_parse.urlencode({'as3':'1','vid':video_id})
2726 info_request = compat_urllib_request.Request(info_url, data)
2727 info_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
2728 info_response = self._download_webpage(info_request, video_id, u'Downloading info webpage')
2729 if info_response is None:
2730 raise ExtractorError(u'Unable to extract the media url')
# Response format: "<k>=<media_url>&<k>=<thumb_url>"; take the values.
2731 (final_url, thumbnail_url) = map(lambda x: x.split('=')[1], info_response.split('&'))
2738 'thumbnail': thumbnail_url,
2742 def gen_extractors():
2743 """ Return a list of an instance of every supported extractor.
2744 The order does matter; the first extractor matched is the one handling the URL.
# NOTE(review): this listing is heavily line-sampled — only three of the
# many extractor instantiations are visible here; the full ordered list
# (and the closing bracket) exists in the original file.
2747 YoutubePlaylistIE(),
2772 StanfordOpenClassroomIE(),
2782 WorldStarHipHopIE(),
2812 def get_info_extractor(ie_name):
2813 """Returns the info extractor class with the given ie_name"""
# Looks the class up in this module's globals by naming convention:
# e.g. 'Youtube' -> YoutubeIE. Raises KeyError for unknown names.
2814 return globals()[ie_name+'IE']