2 # -*- coding: utf-8 -*-
4 from __future__ import absolute_import
15 import xml.etree.ElementTree
26 from .extractor.common import InfoExtractor, SearchInfoExtractor
# NOTE(review): this is a numbered excerpt with gaps — the opening of the
# _VALID_URL raw-string (original lines ~31-33) and most of the format
# extension/dimension tables (original lines ~59-88) are not visible here.
29 class YoutubeIE(InfoExtractor):
30 """Information extractor for youtube.com."""
# Verbose regex describing the many URL shapes YouTube uses; the last
# visible capture group is the video ID, and the trailing conditional
# group only applies when an earlier group matched.
34 (?:https?://)? # http(s):// (optional)
35 (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
36 tube\.majestyc\.net/) # the various hostnames, with wildcard subdomains
37 (?:.*?\#/)? # handle anchor (#/) redirect urls
38 (?: # the various things that can precede the ID:
39 (?:(?:v|embed|e)/) # v/ or embed/ or e/
40 |(?: # or the v= param in all its forms
41 (?:watch(?:_popup)?(?:\.php)?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
42 (?:\?|\#!?) # the params delimiter ? or # or #!
43 (?:.*?&)? # any other preceding param (like /?s=tuff&v=xxxx)
46 )? # optional -> youtube.com/xxxx is OK
47 )? # all until now is optional -> you can pass the naked ID
48 ([0-9A-Za-z_-]+) # here is it! the YouTube video ID
49 (?(1).+)? # if we found the ID, everything can follow
# Service endpoints used by _real_initialize (language, login, age gate).
51 _LANG_URL = r'https://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
52 _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
53 _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
54 _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
55 _NETRC_MACHINE = 'youtube'
# Format itags ranked best-first; the *_prefer_free variant ranks free
# (WebM) formats ahead of comparable non-free ones.
56 # Listed in order of quality
57 _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
58 _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
65 '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
def suitable(cls, url):
    """Return True if this extractor should handle *url*.

    Playlist URLs are deferred to YoutubePlaylistIE so the more specific
    extractor wins.
    """
    if YoutubePlaylistIE.suitable(url):
        return False
    matched = re.match(cls._VALID_URL, url, re.VERBOSE)
    return matched is not None
def report_lang(self):
    """Log that the interface language is about to be set."""
    message = u'Setting language'
    self.to_screen(message)
def report_login(self):
    """Log that a login attempt is starting."""
    message = u'Logging in'
    self.to_screen(message)
def report_video_webpage_download(self, video_id):
    """Log that the watch page for *video_id* is being downloaded."""
    message = u'%s: Downloading video webpage' % video_id
    self.to_screen(message)
def report_video_info_webpage_download(self, video_id):
    """Log that the get_video_info page for *video_id* is being downloaded."""
    message = u'%s: Downloading video info webpage' % video_id
    self.to_screen(message)
def report_video_subtitles_download(self, video_id):
    """Report attempt to check which subtitles are available.

    The previous docstring was a copy-paste of the video-info one and did
    not match what this method actually reports.
    """
    self.to_screen(u'%s: Checking available subtitles' % video_id)
def report_video_subtitles_request(self, video_id, sub_lang, format):
    """Report attempt to download one subtitle track (language + format).

    The previous docstring was a copy-paste of the video-info one.
    ``format`` shadows the builtin but is kept for interface compatibility.
    """
    self.to_screen(u'%s: Downloading video subtitles for %s.%s' % (video_id, sub_lang, format))
def report_video_subtitles_available(self, video_id, sub_lang_list):
    """Log the comma-separated language codes available for *video_id*."""
    langs = ",".join(sub_lang_list.keys())
    self.to_screen(u'%s: Available subtitles for video: %s' % (video_id, langs))
def report_information_extraction(self, video_id):
    """Log that metadata extraction for *video_id* is starting."""
    message = u'%s: Extracting video information' % video_id
    self.to_screen(message)
def report_unavailable_format(self, video_id, format):
    """Report that the requested format is not available for this video.

    The previous docstring ("Report extracted video URL.") described a
    different method. ``format`` shadows the builtin but is kept for
    interface compatibility.
    """
    self.to_screen(u'%s: Format %s not available' % (video_id, format))
def report_rtmp_download(self):
    """Log that the download will go over RTMP instead of HTTP."""
    message = u'RTMP download detected'
    self.to_screen(message)
# Static scrambling scheme for encrypted stream signatures.
# NOTE(review): this excerpt is missing the line that derives the two
# subkeys a and b from s (original line 139) and the final return
# statement (original line ~145); documented from visible code only.
137 def _decrypt_signature(s):
138 """Decrypt the key the two subkeys must have a length of 43"""
# Both halves must be exactly 43 chars or the fixed-position splices
# below would be meaningless / out of range.
140 if len(a) != 43 or len(b) != 43:
141 raise ExtractorError(u'Unable to decrypt signature, subkeys lengths not valid')
# Rebuild b by splicing single characters from fixed positions of a and
# b, then truncate to 40 characters.
142 b = ''.join([b[:8],a[0],b[9:18],b[-4],b[19:39], b[18]])[0:40]
# Join the two parts with '.' and reverse the whole string.
144 s_dec = '.'.join((a,b))[::-1]
# Query the timedtext list endpoint for the subtitle tracks of video_id.
# Returns a dict {lang_code: track_name} on success, or an
# (error_message, None) tuple on failure — callers distinguish the two
# with isinstance(..., tuple).
147 def _get_available_subtitles(self, video_id):
148 self.report_video_subtitles_download(video_id)
149 request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
# NOTE(review): the opening `try:` (original line 150) is not visible in
# this excerpt.
151 sub_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
152 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
153 return (u'unable to download video subtitles: %s' % compat_str(err), None)
# Scrape (name, lang_code) pairs out of the XML and key them by code.
154 sub_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
155 sub_lang_list = dict((l[1], l[0]) for l in sub_lang_list)
156 if not sub_lang_list:
157 return (u'video doesn\'t have subtitles', None)
# (the final `return sub_lang_list` — original lines 158-159 — is not
# visible in this excerpt)
def _list_available_subtitles(self, video_id):
    """Fetch the subtitle-language map for *video_id* and print it."""
    available = self._get_available_subtitles(video_id)
    # NOTE(review): on a download error _get_available_subtitles returns a
    # tuple, which report_video_subtitles_available would choke on
    # (it calls .keys()) — confirm callers only hit this on success.
    self.report_video_subtitles_available(video_id, available)
# Download a single subtitle track from the timedtext API.  Returns
# (error_message, None, None) on failure or (None, sub_lang, data) on
# success.
164 def _request_subtitle(self, sub_lang, sub_name, video_id, format):
# (docstring opening/closing — original lines 165-168 — partially
# missing from this excerpt)
167 (error_message, sub_lang, sub)
169 self.report_video_subtitles_request(video_id, sub_lang, format)
170 params = compat_urllib_parse.urlencode({
# (the query parameters dict — original lines 171-175 — is not visible)
176 url = 'http://www.youtube.com/api/timedtext?' + params
# NOTE(review): the opening `try:` (original line 177) is not visible.
178 sub = compat_urllib_request.urlopen(url).read().decode('utf-8')
179 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
180 return (u'unable to download video subtitles: %s' % compat_str(err), None, None)
# An empty response body means nothing was fetched for this combination
# (the guarding `if not sub:` — original line 181 — is not visible).
182 return (u'Did not fetch video subtitles', None, None)
183 return (None, sub_lang, sub)
185 def _request_automatic_caption(self, video_id, webpage):
186 """We need the webpage for getting the captions url, pass it as an
187 argument to speed up the process."""
# Language defaults to English when the user did not pick one.
188 sub_lang = self._downloader.params.get('subtitleslang') or 'en'
189 sub_format = self._downloader.params.get('subtitlesformat')
190 self.to_screen(u'%s: Looking for automatic captions' % video_id)
# The caption URL lives in the inline ytplayer.config JSON blob.
191 mobj = re.search(r';ytplayer.config = ({.*?});', webpage)
192 err_msg = u'Couldn\'t find automatic captions for "%s"' % sub_lang
# (the `if mobj is None:` guard — original line 193 — is not visible)
194 return [(err_msg, None, None)]
195 player_config = json.loads(mobj.group(1))
# NOTE(review): the `try:` that pairs with the trailing return at
# original line 211 (line 196) is not visible in this excerpt.
197 args = player_config[u'args']
198 caption_url = args[u'ttsurl']
199 timestamp = args[u'timestamp']
200 params = compat_urllib_parse.urlencode({
# (the request parameters — original lines 201-206 — are not visible)
207 subtitles_url = caption_url + '&' + params
208 sub = self._download_webpage(subtitles_url, video_id, u'Downloading automatic captions')
209 return [(None, sub_lang, sub)]
# Reached when the config JSON lacks the caption keys (the matching
# `except` — original line 210 — is not visible).
211 return [(err_msg, None, None)]
# Download the single best-matching subtitle track for video_id.
213 def _extract_subtitle(self, video_id):
# (docstring delimiters — original lines 214/217 — not visible here)
215 Return a list with a tuple:
216 [(error_message, sub_lang, sub)]
218 sub_lang_list = self._get_available_subtitles(video_id)
219 sub_format = self._downloader.params.get('subtitlesformat')
# A tuple return value signals a lookup error rather than a lang map.
220 if isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles
221 return [(sub_lang_list[0], None, None)]
# Language choice: user setting > English > first available.
222 if self._downloader.params.get('subtitleslang', False):
223 sub_lang = self._downloader.params.get('subtitleslang')
224 elif 'en' in sub_lang_list:
# (the `sub_lang = 'en'` branch and `else:` — original lines 225-226 —
# are not visible in this excerpt)
227 sub_lang = list(sub_lang_list.keys())[0]
228 if not sub_lang in sub_lang_list:
229 return [(u'no closed captions found in the specified language "%s"' % sub_lang, None, None)]
231 subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
# (the final `return [subtitle]` — original lines 232-233 — is not
# visible in this excerpt)
# Download every available subtitle track for video_id; returns a list of
# the same (error, lang, data) tuples produced by _request_subtitle.
234 def _extract_all_subtitles(self, video_id):
235 sub_lang_list = self._get_available_subtitles(video_id)
236 sub_format = self._downloader.params.get('subtitlesformat')
# A tuple return value signals a lookup error rather than a lang map.
237 if isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles
238 return [(sub_lang_list[0], None, None)]
# (the `subtitles = []` accumulator init — original line 239 — and the
# final `return subtitles` — line 243 — are not visible here)
240 for sub_lang in sub_lang_list:
241 subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
242 subtitles.append(subtitle)
# Print itag, container extension and dimensions for each format code.
245 def _print_formats(self, formats):
246 print('Available formats:')
# (the `for x in formats:` loop header — original line 247 — is not
# visible in this excerpt)
248 print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))
# One-time setup: resolve credentials, force the site language to English
# (so scraped strings are predictable), log in, and confirm the age gate.
# NOTE(review): numbered excerpt with gaps — several try:/else:/return
# lines and most of the login form dict are not visible here.
250 def _real_initialize(self):
251 if self._downloader is None:
256 downloader_params = self._downloader.params
# --- credential resolution: explicit options first, then ~/.netrc ---
258 # Attempt to use provided username and password or .netrc data
259 if downloader_params.get('username', None) is not None:
260 username = downloader_params['username']
261 password = downloader_params['password']
262 elif downloader_params.get('usenetrc', False):
264 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
269 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
270 except (IOError, netrc.NetrcParseError) as err:
271 self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
# --- force English UI; failure is only a warning, not fatal ---
275 request = compat_urllib_request.Request(self._LANG_URL)
278 compat_urllib_request.urlopen(request).read()
279 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
280 self._downloader.report_warning(u'unable to set language: %s' % compat_str(err))
283 # No authentication to be performed
# --- fetch the Google login page to harvest the GALX/dsh tokens ---
287 request = compat_urllib_request.Request(self._LOGIN_URL)
289 login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
290 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
291 self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err))
296 match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page)
298 galx = match.group(1)
300 match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page)
# --- login form fields (most of the dict literal is not visible) ---
306 u'continue': u'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
310 u'PersistentCookie': u'yes',
312 u'bgresponse': u'js_disabled',
313 u'checkConnection': u'',
314 u'checkedDomains': u'youtube',
320 u'signIn': u'Sign in',
322 u'service': u'youtube',
326 # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
328 login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
329 login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
330 request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
# --- POST the login; the login form re-appearing means bad credentials ---
333 login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
334 if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
335 self._downloader.report_warning(u'unable to log in: bad username or password')
337 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
338 self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
# --- confirm the age gate; unlike the steps above this one is fatal ---
344 'action_confirm': 'Confirm',
346 request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
348 self.report_age_confirmation()
349 age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
350 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
351 raise ExtractorError(u'Unable to confirm age: %s' % compat_str(err))
# Pull the video ID (second capture group of _VALID_URL) out of a URL.
353 def _extract_id(self, url):
354 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
# (the `if mobj is None:` guard — original line 355 — and the final
# `return video_id` — line 358 — are not visible in this excerpt)
356 raise ExtractorError(u'Invalid URL: %s' % url)
357 video_id = mobj.group(2)
# Main extraction pipeline: resolve redirects, download the watch page
# and get_video_info, scrape metadata/subtitles, pick formats, and build
# one result dict per chosen format.
# NOTE(review): numbered excerpt with gaps — many try:/else:/if lines and
# the result-dict boilerplate are not visible here.
360 def _real_extract(self, url):
361 # Extract original video URL from URL with redirection, like age verification, using next_url parameter
362 mobj = re.search(self._NEXT_URL_RE, url)
364 url = 'https://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
365 video_id = self._extract_id(url)
# --- download the watch page ---
368 self.report_video_webpage_download(video_id)
369 url = 'https://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
370 request = compat_urllib_request.Request(url)
372 video_webpage_bytes = compat_urllib_request.urlopen(request).read()
373 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
374 raise ExtractorError(u'Unable to download video webpage: %s' % compat_str(err))
376 video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')
378 # Attempt to extract SWF player URL
379 mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
# Un-escape the JSON backslash escaping in the matched URL.
381 player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
# --- try several el= variants of get_video_info until one has a token ---
386 self.report_video_info_webpage_download(video_id)
387 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
388 video_info_url = ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
389 % (video_id, el_type))
390 video_info_webpage = self._download_webpage(video_info_url, video_id,
392 errnote='unable to download video info webpage')
393 video_info = compat_parse_qs(video_info_webpage)
394 if 'token' in video_info:
396 if 'token' not in video_info:
397 if 'reason' in video_info:
398 raise ExtractorError(u'YouTube said: %s' % video_info['reason'][0])
400 raise ExtractorError(u'"token" parameter not in video info for unknown reason')
402 # Check for "rental" videos
403 if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
404 raise ExtractorError(u'"rental" videos not supported')
406 # Start extracting information
407 self.report_information_extraction(video_id)
# --- uploader (required) and uploader id (best-effort) ---
410 if 'author' not in video_info:
411 raise ExtractorError(u'Unable to extract uploader name')
412 video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])
415 video_uploader_id = None
416 mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
418 video_uploader_id = mobj.group(1)
420 self._downloader.report_warning(u'unable to extract uploader nickname')
# --- title (required), thumbnail and upload date (best-effort) ---
423 if 'title' not in video_info:
424 raise ExtractorError(u'Unable to extract video title')
425 video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])
428 if 'thumbnail_url' not in video_info:
429 self._downloader.report_warning(u'unable to extract video thumbnail')
431 else: # don't panic if we can't find it
432 video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])
436 mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
# Normalize separators/whitespace before handing to unified_strdate.
438 upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
439 upload_date = unified_strdate(upload_date)
# --- description: page element first, <meta> fallback, else empty ---
442 video_description = get_element_by_id("eow-description", video_webpage)
443 if video_description:
444 video_description = clean_html(video_description)
446 fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
448 video_description = unescapeHTML(fd_mobj.group(1))
450 video_description = u''
# --- subtitles, per user options; automatic captions as fallback ---
453 video_subtitles = None
455 if self._downloader.params.get('writesubtitles', False):
456 video_subtitles = self._extract_subtitle(video_id)
458 (sub_error, sub_lang, sub) = video_subtitles[0]
460 # We try with the automatic captions
461 video_subtitles = self._request_automatic_caption(video_id, video_webpage)
462 (sub_error_auto, sub_lang, sub) = video_subtitles[0]
466 # We report the original error
467 self._downloader.report_warning(sub_error)
469 if self._downloader.params.get('allsubtitles', False):
470 video_subtitles = self._extract_all_subtitles(video_id)
471 for video_subtitle in video_subtitles:
472 (sub_error, sub_lang, sub) = video_subtitle
474 self._downloader.report_warning(sub_error)
476 if self._downloader.params.get('listsubtitles', False):
477 sub_lang_list = self._list_available_subtitles(video_id)
480 if 'length_seconds' not in video_info:
481 self._downloader.report_warning(u'unable to extract video duration')
484 video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])
487 video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])
489 # Decide which formats to download
490 req_format = self._downloader.params.get('format', None)
493 mobj = re.search(r';ytplayer.config = ({.*?});', video_webpage)
494 info = json.loads(mobj.group(1))
# BUG(review): `or 'dashmpd'` is a non-empty string literal, so this
# condition is ALWAYS true — almost certainly meant `'dashmpd' in args`.
496 if args.get('ptk','') == 'vevo' or 'dashmpd':
497 # Vevo videos with encrypted signatures
498 self.to_screen(u'%s: Vevo video detected.' % video_id)
499 video_info['url_encoded_fmt_stream_map'] = [args['url_encoded_fmt_stream_map']]
# --- build the itag -> download URL map ---
503 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
504 self.report_rtmp_download()
505 video_url_list = [(None, video_info['conn'][0])]
506 elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
508 for url_data_str in video_info['url_encoded_fmt_stream_map'][0].split(','):
509 url_data = compat_parse_qs(url_data_str)
510 if 'itag' in url_data and 'url' in url_data:
511 url = url_data['url'][0]
# Plain signatures come as 'sig'; encrypted ones as 's' and need
# _decrypt_signature before use.
512 if 'sig' in url_data:
513 url += '&signature=' + url_data['sig'][0]
514 elif 's' in url_data:
515 signature = self._decrypt_signature(url_data['s'][0])
516 url += '&signature=' + signature
517 if 'ratebypass' not in url:
518 url += '&ratebypass=yes'
519 url_map[url_data['itag'][0]] = url
# --- apply user format preferences/limits to the ranked itag list ---
521 format_limit = self._downloader.params.get('format_limit', None)
522 available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
523 if format_limit is not None and format_limit in available_formats:
524 format_list = available_formats[available_formats.index(format_limit):]
526 format_list = available_formats
527 existing_formats = [x for x in format_list if x in url_map]
528 if len(existing_formats) == 0:
529 raise ExtractorError(u'no known formats available for video')
530 if self._downloader.params.get('listformats', None):
531 self._print_formats(existing_formats)
533 if req_format is None or req_format == 'best':
534 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
535 elif req_format == 'worst':
536 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
537 elif req_format in ('-1', 'all'):
538 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
540 # Specific formats. We pick the first in a slash-delimeted sequence.
541 # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
542 req_formats = req_format.split('/')
543 video_url_list = None
544 for rf in req_formats:
546 video_url_list = [(rf, url_map[rf])]
548 if video_url_list is None:
549 raise ExtractorError(u'requested format not available')
551 raise ExtractorError(u'no conn or url_encoded_fmt_stream_map information found in video info')
# --- assemble one result dict per selected format (surrounding
# boilerplate, original lines 552-562 and 575-577, not visible here) ---
554 for format_param, video_real_url in video_url_list:
556 video_extension = self._video_extensions.get(format_param, 'flv')
558 video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
559 self._video_dimensions.get(format_param, '???'))
563 'url': video_real_url,
564 'uploader': video_uploader,
565 'uploader_id': video_uploader_id,
566 'upload_date': upload_date,
567 'title': video_title,
568 'ext': video_extension,
569 'format': video_format,
570 'thumbnail': video_thumbnail,
571 'description': video_description,
572 'player_url': player_url,
573 'subtitles': video_subtitles,
574 'duration': video_duration
579 class MetacafeIE(InfoExtractor):
580 """Information Extractor for metacafe.com."""
# Capture group 1 is the video ID, group 2 the slugified title.
582 _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
# Endpoints used by _real_initialize to bypass the family filter.
583 _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
584 _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
585 IE_NAME = u'metacafe'
def report_disclaimer(self):
    """Log that the family-filter disclaimer page is being fetched."""
    message = u'Retrieving disclaimer'
    self.to_screen(message)
# Fetch the disclaimer page and POST the over-18 confirmation so later
# requests are not blocked by the family filter.
# NOTE(review): numbered excerpt with gaps — the try: lines and most of
# the disclaimer_form dict are not visible here.
591 def _real_initialize(self):
592 # Retrieve disclaimer
593 request = compat_urllib_request.Request(self._DISCLAIMER)
595 self.report_disclaimer()
596 disclaimer = compat_urllib_request.urlopen(request).read()
597 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
598 raise ExtractorError(u'Unable to retrieve disclaimer: %s' % compat_str(err))
# Confirm age (form fields partly not visible in this excerpt).
603 'submit': "Continue - I'm over 18",
605 request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
607 self.report_age_confirmation()
608 disclaimer = compat_urllib_request.urlopen(request).read()
609 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
610 raise ExtractorError(u'Unable to confirm age: %s' % compat_str(err))
# Extract a Metacafe video: delegate yt- prefixed IDs to YouTube, else
# scrape mediaURL/gdaKey (or the flashvars mediaData JSON) from the page.
# NOTE(review): numbered excerpt with gaps — several `if mobj is None:`
# guards, an else: branch, and the final return boilerplate are missing.
612 def _real_extract(self, url):
613 # Extract id and simplified title from URL
614 mobj = re.match(self._VALID_URL, url)
616 raise ExtractorError(u'Invalid URL: %s' % url)
618 video_id = mobj.group(1)
620 # Check if video comes from YouTube
621 mobj2 = re.match(r'^yt-(.*)$', video_id)
622 if mobj2 is not None:
623 return [self.url_result('http://www.youtube.com/watch?v=%s' % mobj2.group(1), 'Youtube')]
625 # Retrieve video webpage to extract further information
626 webpage = self._download_webpage('http://www.metacafe.com/watch/%s/' % video_id, video_id)
628 # Extract URL, uploader and title from webpage
629 self.report_extraction(video_id)
630 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
632 mediaURL = compat_urllib_parse.unquote(mobj.group(1))
# Container extension is taken from the URL's last three characters.
633 video_extension = mediaURL[-3:]
635 # Extract gdaKey if available
636 mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
640 gdaKey = mobj.group(1)
641 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
# Fallback path: parse the flashvars blob for mediaData JSON.
643 mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
645 raise ExtractorError(u'Unable to extract media URL')
646 vardict = compat_parse_qs(mobj.group(1))
647 if 'mediaData' not in vardict:
648 raise ExtractorError(u'Unable to extract media URL')
649 mobj = re.search(r'"mediaURL":"(?P<mediaURL>http.*?)",(.*?)"key":"(?P<key>.*?)"', vardict['mediaData'][0])
651 raise ExtractorError(u'Unable to extract media URL')
652 mediaURL = mobj.group('mediaURL').replace('\\/', '/')
653 video_extension = mediaURL[-3:]
654 video_url = '%s?__gda__=%s' % (mediaURL, mobj.group('key'))
656 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
658 raise ExtractorError(u'Unable to extract title')
# NOTE(review): .decode('utf-8') on these values is a Python-2 idiom —
# on Python 3 `str` has no .decode; confirm the intended runtime.
659 video_title = mobj.group(1).decode('utf-8')
661 mobj = re.search(r'submitter=(.*?);', webpage)
663 raise ExtractorError(u'Unable to extract uploader nickname')
664 video_uploader = mobj.group(1)
667 'id': video_id.decode('utf-8'),
668 'url': video_url.decode('utf-8'),
669 'uploader': video_uploader.decode('utf-8'),
671 'title': video_title,
672 'ext': video_extension.decode('utf-8'),
# NOTE(review): numbered excerpt with gaps — `if mobj is None:` guards,
# the quality-selection loop body (original lines 704-708), and parts of
# the final return dict are not visible here.
675 class DailymotionIE(InfoExtractor):
676 """Information Extractor for Dailymotion"""
678 _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
679 IE_NAME = u'dailymotion'
681 def _real_extract(self, url):
682 # Extract id and simplified title from URL
683 mobj = re.match(self._VALID_URL, url)
685 raise ExtractorError(u'Invalid URL: %s' % url)
# IDs look like "x2fg_slug?params"; keep only the bare ID part.
687 video_id = mobj.group(1).split('_')[0].split('?')[0]
689 video_extension = 'mp4'
691 # Retrieve video webpage to extract further information
# The cookie disables the family filter so age-gated pages load.
692 request = compat_urllib_request.Request(url)
693 request.add_header('Cookie', 'family_filter=off')
694 webpage = self._download_webpage(request, video_id)
696 # Extract URL, uploader and title from webpage
697 self.report_extraction(video_id)
698 mobj = re.search(r'\s*var flashvars = (.*)', webpage)
700 raise ExtractorError(u'Unable to extract media URL')
701 flashvars = compat_urllib_parse.unquote(mobj.group(1))
# Pick the best quality key present in flashvars; the list is ordered
# best-first (the loop body that sets max_quality is not visible here).
703 for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
706 self.to_screen(u'Using %s' % key)
709 raise ExtractorError(u'Unable to extract video URL')
711 mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
713 raise ExtractorError(u'Unable to extract video URL')
715 video_url = compat_urllib_parse.unquote(mobj.group(1)).replace('\\/', '/')
717 # TODO: support choosing qualities
719 mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
721 raise ExtractorError(u'Unable to extract title')
722 video_title = unescapeHTML(mobj.group('title'))
724 video_uploader = None
725 video_uploader = self._search_regex([r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>',
726 # Looking for official user
727 r'<(?:span|a) .*?rel="author".*?>([^<]+?)</'],
728 webpage, 'video uploader')
730 video_upload_date = None
# Page shows DD-MM-YYYY; reassemble as YYYYMMDD.
731 mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
733 video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)
738 'uploader': video_uploader,
739 'upload_date': video_upload_date,
740 'title': video_title,
741 'ext': video_extension,
# NOTE(review): numbered excerpt with gaps — guards, try: lines and
# parts of both return dicts are not visible here.
746 class PhotobucketIE(InfoExtractor):
747 """Information extractor for photobucket.com."""
748 # TODO: the original _VALID_URL was:
749 # r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
750 # Check if it's necessary to keep the old extracion process
751 _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*(([\?\&]current=)|_)(?P<id>.*)\.(?P<ext>(flv)|(mp4))'
752 IE_NAME = u'photobucket'
754 def _real_extract(self, url):
755 # Extract id from URL
756 mobj = re.match(self._VALID_URL, url)
758 raise ExtractorError(u'Invalid URL: %s' % url)
760 video_id = mobj.group('id')
# Extension comes straight from the URL suffix (flv or mp4).
762 video_extension = mobj.group('ext')
764 # Retrieve video webpage to extract further information
765 webpage = self._download_webpage(url, video_id)
767 # Extract URL, uploader, and title from webpage
768 self.report_extraction(video_id)
# Preferred path: media info embedded as JSON in the page javascript.
769 # We try first by looking the javascript code:
770 mobj = re.search(r'Pb\.Data\.Shared\.put\(Pb\.Data\.Shared\.MEDIA, (?P<json>.*?)\);', webpage)
772 info = json.loads(mobj.group('json'))
775 'url': info[u'downloadUrl'],
776 'uploader': info[u'username'],
777 'upload_date': datetime.date.fromtimestamp(info[u'creationDate']).strftime('%Y%m%d'),
778 'title': info[u'title'],
779 'ext': video_extension,
780 'thumbnail': info[u'thumbUrl'],
# Fallback path: scrape the <link rel="video_src"> tag and <title>.
783 # We try looking in other parts of the webpage
784 video_url = self._search_regex(r'<link rel="video_src" href=".*\?file=([^"]+)" />',
785 webpage, u'video URL')
787 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
789 raise ExtractorError(u'Unable to extract title')
# NOTE(review): .decode('utf-8') on these values is a Python-2 idiom —
# on Python 3 `str` has no .decode; confirm the intended runtime.
790 video_title = mobj.group(1).decode('utf-8')
791 video_uploader = mobj.group(2).decode('utf-8')
794 'id': video_id.decode('utf-8'),
795 'url': video_url.decode('utf-8'),
796 'uploader': video_uploader,
798 'title': video_title,
799 'ext': video_extension.decode('utf-8'),
# NOTE(review): numbered excerpt with gaps — `if ... is None:` guards,
# the `meta = ...` lookup (original line ~853) and the final return dict
# are not visible here.
803 class YahooIE(InfoExtractor):
804 """Information extractor for screen.yahoo.com."""
805 _VALID_URL = r'http://screen\.yahoo\.com/.*?-(?P<id>\d*?)\.html'
807 def _real_extract(self, url):
808 mobj = re.match(self._VALID_URL, url)
810 raise ExtractorError(u'Invalid URL: %s' % url)
811 video_id = mobj.group('id')
812 webpage = self._download_webpage(url, video_id)
# Two extraction paths: pages that define a YUI CONTENT_ID use the YQL
# JSON API (below); otherwise fall back to the legacy mrss REST API.
813 m_id = re.search(r'YUI\.namespace\("Media"\)\.CONTENT_ID = "(?P<new_id>.+?)";', webpage)
816 # TODO: Check which url parameters are required
817 info_url = 'http://cosmos.bcst.yahoo.com/rest/v2/pops;lmsoverride=1;outputformat=mrss;cb=974419660;id=%s;rd=news.yahoo.com;datacontext=mdb;lg=KCa2IihxG3qE60vQ7HtyUy' % video_id
818 webpage = self._download_webpage(info_url, video_id, u'Downloading info webpage')
# Verbose regex over the mrss XML payload.
819 info_re = r'''<title><!\[CDATA\[(?P<title>.*?)\]\]></title>.*
820 <description><!\[CDATA\[(?P<description>.*?)\]\]></description>.*
821 <media:pubStart><!\[CDATA\[(?P<date>.*?)\ .*\]\]></media:pubStart>.*
822 <media:content\ medium="image"\ url="(?P<thumb>.*?)"\ name="LARGETHUMB"
824 self.report_extraction(video_id)
825 m_info = re.search(info_re, webpage, re.VERBOSE|re.DOTALL)
827 raise ExtractorError(u'Unable to extract video info')
828 video_title = m_info.group('title')
829 video_description = m_info.group('description')
830 video_thumb = m_info.group('thumb')
831 video_date = m_info.group('date')
# Normalize the mm/dd/yyyy date to YYYYMMDD.
832 video_date = datetime.datetime.strptime(video_date, '%m/%d/%Y').strftime('%Y%m%d')
834 # TODO: Find a way to get mp4 videos
835 rest_url = 'http://cosmos.bcst.yahoo.com/rest/v2/pops;element=stream;outputformat=mrss;id=%s;lmsoverride=1;bw=375;dynamicstream=1;cb=83521105;tech=flv,mp4;rd=news.yahoo.com;datacontext=mdb;lg=KCa2IihxG3qE60vQ7HtyUy' % video_id
836 webpage = self._download_webpage(rest_url, video_id, u'Downloading video url webpage')
837 m_rest = re.search(r'<media:content url="(?P<url>.*?)" path="(?P<path>.*?)"', webpage)
838 video_url = m_rest.group('url')
839 video_path = m_rest.group('path')
841 raise ExtractorError(u'Unable to extract video url')
843 else: # We have to use a different method if another id is defined
844 long_id = m_id.group('new_id')
# YQL query (URL-encoded) selecting the stream records for long_id.
845 info_url = 'http://video.query.yahoo.com/v1/public/yql?q=SELECT%20*%20FROM%20yahoo.media.video.streams%20WHERE%20id%3D%22' + long_id + '%22%20AND%20format%3D%22mp4%2Cflv%22%20AND%20protocol%3D%22rtmp%2Chttp%22%20AND%20plrs%3D%2286Gj0vCaSzV_Iuf6hNylf2%22%20AND%20acctid%3D%22389%22%20AND%20plidl%3D%22%22%20AND%20pspid%3D%22792700001%22%20AND%20offnetwork%3D%22false%22%20AND%20site%3D%22ivy%22%20AND%20lang%3D%22en-US%22%20AND%20region%3D%22US%22%20AND%20override%3D%22none%22%3B&env=prod&format=json&callback=YUI.Env.JSONP.yui_3_8_1_1_1368368376830_335'
846 webpage = self._download_webpage(info_url, video_id, u'Downloading info json')
# Strip the JSONP callback wrapper before parsing.
847 json_str = re.search(r'YUI.Env.JSONP.yui.*?\((.*?)\);', webpage).group(1)
848 info = json.loads(json_str)
849 res = info[u'query'][u'results'][u'mediaObj'][0]
850 stream = res[u'streams'][0]
851 video_path = stream[u'path']
852 video_url = stream[u'host']
# (the `meta = res[...]` lookup feeding the lines below is not visible)
854 video_title = meta[u'title']
855 video_description = meta[u'description']
856 video_thumb = meta[u'thumbnail']
857 video_date = None # I can't find it
862 'play_path': video_path,
864 'description': video_description,
865 'thumbnail': video_thumb,
866 'upload_date': video_date,
871 class VimeoIE(InfoExtractor):
872 """Information extractor for vimeo.com."""
874 # _VALID_URL matches Vimeo URLs
# Named groups: proto (scheme present?), pro (vimeopro.com), direct_link
# (player redirect URL), id (the numeric video ID).
875 _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|player)\.)?vimeo(?P<pro>pro)?\.com/(?:(?:(?:groups|album)/[^/]+)|(?:.*?)/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)'
# POST the user-supplied password (with the xsrft token scraped from the
# page) to <url>/password so a protected video becomes accessible.
# NOTE(review): numbered excerpt with gaps — the `if password is None:`
# guard, the token dict closing, the else: branch and the errnote
# argument are not visible here.
878 def _verify_video_password(self, url, video_id, webpage):
879 password = self._downloader.params.get('password', None)
881 raise ExtractorError(u'This video is protected by a password, use the --password option')
# CSRF token embedded in the page javascript.
882 token = re.search(r'xsrft: \'(.*?)\'', webpage).group(1)
883 data = compat_urllib_parse.urlencode({'password': password,
885 # I didn't manage to use the password with https
886 if url.startswith('https'):
887 pass_url = url.replace('https','http')
890 password_request = compat_urllib_request.Request(pass_url+'/password', data)
891 password_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
892 password_request.add_header('Cookie', 'xsrft=%s' % token)
893 pass_web = self._download_webpage(password_request, video_id,
894 u'Verifying the password',
# NOTE(review): gapped listing — the embedded original line numbers jump
# (899→901, 904, 928→930, ...), so guard lines such as `if mobj is None:`
# and the final `return [{...}]` opener are elided from this view.
# Purpose: extract a single Vimeo video: parse the clip id from the URL,
# download the watch page, pull the inline player config JSON, and build
# the play_redirect download URL from signature/timestamp/quality/codec.
897 def _real_extract(self, url, new_video=True):
898 # Extract ID from URL
899 mobj = re.match(self._VALID_URL, url)
901 raise ExtractorError(u'Invalid URL: %s' % url)
903 video_id = mobj.group('id')
904 if not mobj.group('proto'):
905 url = 'https://' + url
906 if mobj.group('direct_link') or mobj.group('pro'):
907 url = 'https://vimeo.com/' + video_id
909 # Retrieve video webpage to extract further information
910 request = compat_urllib_request.Request(url, None, std_headers)
911 webpage = self._download_webpage(request, video_id)
913 # Now we begin extracting as much information as we can from what we
914 # retrieved. First we extract the information common to all extractors,
915 # and latter we extract those that are Vimeo specific.
916 self.report_extraction(video_id)
918 # Extract the config JSON
# The player config is embedded inline in the page between
# " = {config:" and ",assets:"; json.loads raises ValueError if malformed.
920 config = webpage.split(' = {config:')[1].split(',assets:')[0]
921 config = json.loads(config)
923 if re.search('The creator of this video has not given you permission to embed it on this domain.', webpage):
924 raise ExtractorError(u'The author has restricted the access to this video, try with the "--referer" option')
# Password-protected video: verify the password, then retry extraction once.
926 if re.search('If so please provide the correct password.', webpage):
927 self._verify_video_password(url, video_id, webpage)
928 return self._real_extract(url)
930 raise ExtractorError(u'Unable to extract info section')
933 video_title = config["video"]["title"]
935 # Extract uploader and uploader_id
936 video_uploader = config["video"]["owner"]["name"]
937 video_uploader_id = config["video"]["owner"]["url"].split('/')[-1] if config["video"]["owner"]["url"] else None
939 # Extract video thumbnail
940 video_thumbnail = config["video"]["thumbnail"]
942 # Extract video description
943 video_description = get_element_by_attribute("itemprop", "description", webpage)
944 if video_description: video_description = clean_html(video_description)
945 else: video_description = u''
947 # Extract upload date
948 video_upload_date = None
949 mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
# Concatenate YYYY + MM + DD into the YYYYMMDD form used by youtube-dl.
951 video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)
953 # Vimeo specific: extract request signature and timestamp
954 sig = config['request']['signature']
955 timestamp = config['request']['timestamp']
957 # Vimeo specific: extract video codec and quality information
958 # First consider quality, then codecs, then take everything
959 # TODO bind to format param
960 codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
961 files = { 'hd': [], 'sd': [], 'other': []}
# Bucket each available codec by its best quality tier; codec order above
# encodes preference (h264 first).
962 for codec_name, codec_extension in codecs:
963 if codec_name in config["video"]["files"]:
964 if 'hd' in config["video"]["files"][codec_name]:
965 files['hd'].append((codec_name, codec_extension, 'hd'))
966 elif 'sd' in config["video"]["files"][codec_name]:
967 files['sd'].append((codec_name, codec_extension, 'sd'))
969 files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))
# Pick the first entry of the best non-empty tier (hd > sd > other).
971 for quality in ('hd', 'sd', 'other'):
972 if len(files[quality]) > 0:
973 video_quality = files[quality][0][2]
974 video_codec = files[quality][0][0]
975 video_extension = files[quality][0][1]
976 self.to_screen(u'%s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
979 raise ExtractorError(u'No known codec found')
981 video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
982 %(video_id, sig, timestamp, video_quality, video_codec.upper())
# (elided) returns a one-element list with the info dict below.
987 'uploader': video_uploader,
988 'uploader_id': video_uploader_id,
989 'upload_date': video_upload_date,
990 'title': video_title,
991 'ext': video_extension,
992 'thumbnail': video_thumbnail,
993 'description': video_description,
# NOTE(review): gapped listing — `try:` headers, `return` statements and the
# grep target URLs passed to grep_webpage are elided (line numbers jump).
997 class ArteTvIE(InfoExtractor):
998 """arte.tv information extractor."""
1000 _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
1001 _LIVE_URL = r'index-[0-9]+\.html$'
1003 IE_NAME = u'arte.tv'
# Download a page, wrapping network errors in ExtractorError.
1005 def fetch_webpage(self, url):
1006 request = compat_urllib_request.Request(url)
1008 self.report_download_webpage(url)
1009 webpage = compat_urllib_request.urlopen(request).read()
1010 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1011 raise ExtractorError(u'Unable to retrieve video webpage: %s' % compat_str(err))
1012 except ValueError as err:
1013 raise ExtractorError(u'Invalid URL: %s' % url)
# Fetch `url`, apply `regex`, and build a dict from matchTuples, each a
# (group_index, key, error_message) triple; raises if a group is missing.
1016 def grep_webpage(self, url, regex, regexFlags, matchTuples):
1017 page = self.fetch_webpage(url)
1018 mobj = re.search(regex, page, regexFlags)
1022 raise ExtractorError(u'Invalid URL: %s' % url)
1024 for (i, key, err) in matchTuples:
1025 if mobj.group(i) is None:
1026 raise ExtractorError(err)
1028 info[key] = mobj.group(i)
# Live-stream path: locate the videothek JS, then the SWF player config.
1032 def extractLiveStream(self, url):
1033 video_lang = url.split('/')[-4]
1034 info = self.grep_webpage(
1036 r'src="(.*?/videothek_js.*?\.js)',
1039 (1, 'url', u'Invalid URL: %s' % url)
1042 http_host = url.split('/')[2]
1043 next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
1044 info = self.grep_webpage(
1046 r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
1047 '(http://.*?\.swf).*?' +
1051 (1, 'path', u'could not extract video path: %s' % url),
1052 (2, 'player', u'could not extract video player: %s' % url),
1053 (3, 'url', u'could not extract video url: %s' % url)
1056 video_url = u'%s/%s' % (info.get('url'), info.get('path'))
# "Plus 7" catch-up path: follow videorefFileUrl -> per-language <video>
# ref -> final XML with id/name/date/hd url.
1058 def extractPlus7Stream(self, url):
1059 video_lang = url.split('/')[-3]
1060 info = self.grep_webpage(
1062 r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
1065 (1, 'url', u'Invalid URL: %s' % url)
1068 next_url = compat_urllib_parse.unquote(info.get('url'))
1069 info = self.grep_webpage(
1071 r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
1074 (1, 'url', u'Could not find <video> tag: %s' % url)
1077 next_url = compat_urllib_parse.unquote(info.get('url'))
1079 info = self.grep_webpage(
1081 r'<video id="(.*?)".*?>.*?' +
1082 '<name>(.*?)</name>.*?' +
1083 '<dateVideo>(.*?)</dateVideo>.*?' +
1084 '<url quality="hd">(.*?)</url>',
1087 (1, 'id', u'could not extract video id: %s' % url),
1088 (2, 'title', u'could not extract video title: %s' % url),
1089 (3, 'date', u'could not extract video date: %s' % url),
1090 (4, 'url', u'could not extract video url: %s' % url)
# (elided) returns the info dict below.
1095 'id': info.get('id'),
1096 'url': compat_urllib_parse.unquote(info.get('url')),
1097 'uploader': u'arte.tv',
1098 'upload_date': unified_strdate(info.get('date')),
1099 'title': info.get('title').decode('utf-8'),
# Dispatch: URLs matching _LIVE_URL go to the live path, others to Plus 7.
1105 def _real_extract(self, url):
1106 video_id = url.split('/')[-1]
1107 self.report_extraction(video_id)
1109 if re.search(self._LIVE_URL, video_id) is not None:
1110 self.extractLiveStream(url)
1113 info = self.extractPlus7Stream(url)
# NOTE(review): gapped listing — `if mobj is None:` guards between the
# successive fallback regexes and several return/raise lines are elided.
1118 class GenericIE(InfoExtractor):
1119 """Generic last-resort information extractor."""
1122 IE_NAME = u'generic'
1124 def report_download_webpage(self, video_id):
1125 """Report webpage download."""
# Warn (except under --test) that we had to fall back to the generic IE.
1126 if not self._downloader.params.get('test', False):
1127 self._downloader.report_warning(u'Falling back on generic information extractor.')
1128 super(GenericIE, self).report_download_webpage(video_id)
1130 def report_following_redirect(self, new_url):
1131 """Report information extraction."""
1132 self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)
# Resolve URL-shortener style redirects using HEAD requests only, so the
# target body is never downloaded twice. Returns the final URL (or a falsy
# value when no redirect happened — elided here).
1134 def _test_redirect(self, url):
1135 """Check if it is a redirect, like url shorteners, in case return the new url."""
1136 class HeadRequest(compat_urllib_request.Request):
1137 def get_method(self):
1140 class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
1142 Subclass the HTTPRedirectHandler to make it use our
1143 HeadRequest also on the redirected URL
1145 def redirect_request(self, req, fp, code, msg, headers, newurl):
1146 if code in (301, 302, 303, 307):
1147 newurl = newurl.replace(' ', '%20')
# Drop body-related headers; a HEAD request carries no entity body.
1148 newheaders = dict((k,v) for k,v in req.headers.items()
1149 if k.lower() not in ("content-length", "content-type"))
1150 return HeadRequest(newurl,
1152 origin_req_host=req.get_origin_req_host(),
1155 raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)
1157 class HTTPMethodFallback(compat_urllib_request.BaseHandler):
1159 Fallback to GET if HEAD is not allowed (405 HTTP error)
1161 def http_error_405(self, req, fp, code, msg, headers):
1165 newheaders = dict((k,v) for k,v in req.headers.items()
1166 if k.lower() not in ("content-length", "content-type"))
1167 return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
1169 origin_req_host=req.get_origin_req_host(),
# Build a bare opener with just the handlers we need (order matters:
# the 405 fallback must run before the redirect handler).
1173 opener = compat_urllib_request.OpenerDirector()
1174 for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
1175 HTTPMethodFallback, HEADRedirectHandler,
1176 compat_urllib_request.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
1177 opener.add_handler(handler())
1179 response = opener.open(HeadRequest(url))
1180 if response is None:
1181 raise ExtractorError(u'Invalid URL protocol')
1182 new_url = response.geturl()
1187 self.report_following_redirect(new_url)
1190 def _real_extract(self, url):
1191 new_url = self._test_redirect(url)
1192 if new_url: return [self.url_result(new_url)]
1194 video_id = url.split('/')[-1]
1196 webpage = self._download_webpage(url, video_id)
1197 except ValueError as err:
1198 # since this is the last-resort InfoExtractor, if
1199 # this error is thrown, it'll be thrown here
1200 raise ExtractorError(u'Invalid URL: %s' % url)
1202 self.report_extraction(video_id)
# Cascade of increasingly generic patterns; each subsequent search runs
# only when the previous one found nothing (guards elided).
1203 # Start with something easy: JW Player in SWFObject
1204 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1206 # Broaden the search a little bit
1207 mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1209 # Broaden the search a little bit: JWPlayer JS loader
1210 mobj = re.search(r'[^A-Za-z0-9]?file:\s*["\'](http[^\'"&]*)', webpage)
1212 # Try to find twitter cards info
1213 mobj = re.search(r'<meta (?:property|name)="twitter:player:stream" (?:content|value)="(.+?)"', webpage)
1215 # We look for Open Graph info:
1216 # We have to match any number spaces between elements, some sites try to align them (eg.: statigr.am)
1217 m_video_type = re.search(r'<meta.*?property="og:video:type".*?content="video/(.*?)"', webpage)
1218 # We only look in og:video if the MIME type is a video, don't try if it's a Flash player:
1219 if m_video_type is not None:
1220 mobj = re.search(r'<meta.*?property="og:video".*?content="(.*?)"', webpage)
1222 raise ExtractorError(u'Invalid URL: %s' % url)
1224 # It's possible that one of the regexes
1225 # matched, but returned an empty group:
1226 if mobj.group(1) is None:
1227 raise ExtractorError(u'Invalid URL: %s' % url)
1229 video_url = compat_urllib_parse.unquote(mobj.group(1))
1230 video_id = os.path.basename(video_url)
1232 # here's a fun little line of code for you:
1233 video_extension = os.path.splitext(video_id)[1][1:]
1234 video_id = os.path.splitext(video_id)[0]
1236 # it's tempting to parse this further, but you would
1237 # have to take into account all the variations like
1238 # Video Title - Site Name
1239 # Site Name | Video Title
1240 # Video Title - Tagline | Site Name
1241 # and so on and so forth; it's just not practical
1242 video_title = self._html_search_regex(r'<title>(.*)</title>',
1243 webpage, u'video title')
1245 # video uploader is domain name
1246 video_uploader = self._search_regex(r'(?:https?://)?([^/]*)/.*',
1247 url, u'video uploader')
# (elided) returns a one-element list with the info dict below.
1252 'uploader': video_uploader,
1253 'upload_date': None,
1254 'title': video_title,
1255 'ext': video_extension,
# NOTE(review): gapped listing — initialisation of video_ids/pagenum/limit
# and the try:/pagenum increment lines are elided (1271→1277, 1280→1282).
1259 class YoutubeSearchIE(SearchInfoExtractor):
1260 """Information Extractor for YouTube search queries."""
# GData v2 search feed; %i is the 1-based start-index, page size is 50.
1261 _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
1263 IE_NAME = u'youtube:search'
1264 _SEARCH_KEY = 'ytsearch'
1266 def report_download_page(self, query, pagenum):
1267 """Report attempt to download search page with given number."""
1268 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
1270 def _get_n_results(self, query, n):
1271 """Get a specified number of results for a query"""
# Page through the API 50 results at a time until n ids are collected
# or the feed is exhausted.
1277 while (50 * pagenum) < limit:
1278 self.report_download_page(query, pagenum+1)
1279 result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
1280 request = compat_urllib_request.Request(result_url)
1282 data = compat_urllib_request.urlopen(request).read().decode('utf-8')
1283 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1284 raise ExtractorError(u'Unable to download API page: %s' % compat_str(err))
1285 api_response = json.loads(data)['data']
1287 if not 'items' in api_response:
1288 raise ExtractorError(u'[youtube] No video results')
1290 new_ids = list(video['id'] for video in api_response['items'])
1291 video_ids += new_ids
# Cap at whichever is smaller: requested n or total results reported.
1293 limit = min(n, api_response['totalItems'])
1296 if len(video_ids) > n:
1297 video_ids = video_ids[:n]
1298 videos = [self.url_result('http://www.youtube.com/watch?v=%s' % id, 'Youtube') for id in video_ids]
1299 return self.playlist_result(videos, query)
# NOTE(review): gapped listing — the `res` dict opener/closer and the final
# return are elided; `res['entries']` is appended to below.
1302 class GoogleSearchIE(SearchInfoExtractor):
1303 """Information Extractor for Google Video search queries."""
# Presence of the "next" pagination button marks more result pages.
1304 _MORE_PAGES_INDICATOR = r'id="pnnext" class="pn"'
1306 IE_NAME = u'video.google:search'
1307 _SEARCH_KEY = 'gvsearch'
1309 def _get_n_results(self, query, n):
1310 """Get a specified number of results for a query"""
1313 '_type': 'playlist',
# Scrape Google Video search result pages (10 hits per page) until n
# entries are collected or no further pages exist.
1318 for pagenum in itertools.count(1):
1319 result_url = u'http://www.google.com/search?tbm=vid&q=%s&start=%s&hl=en' % (compat_urllib_parse.quote_plus(query), pagenum*10)
1320 webpage = self._download_webpage(result_url, u'gvsearch:' + query,
1321 note='Downloading result page ' + str(pagenum))
1323 for mobj in re.finditer(r'<h3 class="r"><a href="([^"]+)"', webpage):
1326 'url': mobj.group(1)
1328 res['entries'].append(e)
1330 if (pagenum * 10 > n) or not re.search(self._MORE_PAGES_INDICATOR, webpage):
# NOTE(review): gapped listing — `res` initialisation, the binding of `m`
# (presumably info[u'm'], pagination metadata) and break/return lines are
# elided; line 823 references `m` without a visible assignment.
1333 class YahooSearchIE(SearchInfoExtractor):
1334 """Information Extractor for Yahoo! Video search queries."""
1337 IE_NAME = u'screen.yahoo:search'
1338 _SEARCH_KEY = 'yvsearch'
1340 def _get_n_results(self, query, n):
1341 """Get a specified number of results for a query"""
1344 '_type': 'playlist',
# Yahoo returns JSON result pages of 30 hits; paginate via the b= offset.
1348 for pagenum in itertools.count(0):
1349 result_url = u'http://video.search.yahoo.com/search/?p=%s&fr=screen&o=js&gs=0&b=%d' % (compat_urllib_parse.quote_plus(query), pagenum * 30)
1350 webpage = self._download_webpage(result_url, query,
1351 note='Downloading results page '+str(pagenum+1))
1352 info = json.loads(webpage)
1354 results = info[u'results']
1356 for (i, r) in enumerate(results):
1357 if (pagenum * 30) +i >= n:
1359 mobj = re.search(r'(?P<url>screen\.yahoo\.com/.*?-\d*?\.html)"', r)
1360 e = self.url_result('http://' + mobj.group('url'), 'Yahoo')
1361 res['entries'].append(e)
# Stop when n results are reached or the last result page was consumed.
1362 if (pagenum * 30 +i >= n) or (m[u'last'] >= (m[u'total'] -1 )):
# NOTE(review): gapped listing — parts of the verbose _VALID_URL pattern,
# the `if mobj is None:` guard, videos/page_num initialisation and the
# pagination loop header are elided (line numbers jump).
1368 class YoutubePlaylistIE(InfoExtractor):
1369 """Information Extractor for YouTube playlists."""
1371 _VALID_URL = r"""(?:
1376 (?:course|view_play_list|my_playlists|artist|playlist|watch)
1377 \? (?:.*?&)*? (?:p|a|list)=
1380 ((?:PL|EC|UU)?[0-9A-Za-z-_]{10,})
1383 ((?:PL|EC|UU)[0-9A-Za-z-_]{10,})
# GData v2 playlist feed; safeSearch=none so no entries are filtered out.
1385 _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json&safeSearch=none'
1387 IE_NAME = u'youtube:playlist'
1390 def suitable(cls, url):
1391 """Receives a URL and returns True if suitable for this IE."""
1392 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
1394 def _real_extract(self, url):
1395 # Extract playlist id
1396 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
1398 raise ExtractorError(u'Invalid URL: %s' % url)
1400 # Download playlist videos from API
# Either capture group may have matched, depending on the URL form.
1401 playlist_id = mobj.group(1) or mobj.group(2)
1406 url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, self._MAX_RESULTS * (page_num - 1) + 1)
1407 page = self._download_webpage(url, playlist_id, u'Downloading page #%s' % page_num)
1410 response = json.loads(page)
1411 except ValueError as err:
1412 raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))
1414 if 'feed' not in response:
1415 raise ExtractorError(u'Got a malformed response from YouTube API')
1416 playlist_title = response['feed']['title']['$t']
1417 if 'entry' not in response['feed']:
1418 # Number of videos is a multiple of self._MAX_RESULTS
# Collect (position, watch-url) pairs; skip entries without player info
# (e.g. deleted/private videos).
1421 for entry in response['feed']['entry']:
1422 index = entry['yt$position']['$t']
1423 if 'media$group' in entry and 'media$player' in entry['media$group']:
1424 videos.append((index, entry['media$group']['media$player']['url']))
1426 if len(response['feed']['entry']) < self._MAX_RESULTS:
# Sort by playlist position, then keep only the URLs.
1430 videos = [v[1] for v in sorted(videos)]
1432 url_results = [self.url_result(url, 'Youtube') for url in videos]
1433 return [self.playlist_result(url_results, playlist_id, playlist_title)]
# NOTE(review): gapped listing — ids_in_page initialisation/return in
# extract_videos_from_page, the invalid-URL guard, video_ids/pagenum
# initialisation and the `while True:` loop header are elided.
1436 class YoutubeChannelIE(InfoExtractor):
1437 """Information Extractor for YouTube channels."""
1439 _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
1440 _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
# This CSS class on the page signals that a "load more" page exists.
1441 _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
1442 _MORE_PAGES_URL = 'http://www.youtube.com/channel_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
1443 IE_NAME = u'youtube:channel'
# Scrape unique video ids from the watch?v= links on a channel page.
1445 def extract_videos_from_page(self, page):
1447 for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
1448 if mobj.group(1) not in ids_in_page:
1449 ids_in_page.append(mobj.group(1))
1452 def _real_extract(self, url):
1453 # Extract channel id
1454 mobj = re.match(self._VALID_URL, url)
1456 raise ExtractorError(u'Invalid URL: %s' % url)
1458 # Download channel page
1459 channel_id = mobj.group(1)
1463 url = self._TEMPLATE_URL % (channel_id, pagenum)
1464 page = self._download_webpage(url, channel_id,
1465 u'Downloading page #%s' % pagenum)
1467 # Extract video identifiers
1468 ids_in_page = self.extract_videos_from_page(page)
1469 video_ids.extend(ids_in_page)
1471 # Download any subsequent channel pages using the json-based channel_ajax query
1472 if self._MORE_PAGES_INDICATOR in page:
1474 pagenum = pagenum + 1
1476 url = self._MORE_PAGES_URL % (pagenum, channel_id)
1477 page = self._download_webpage(url, channel_id,
1478 u'Downloading page #%s' % pagenum)
# Ajax endpoint returns JSON whose content_html holds the next batch.
1480 page = json.loads(page)
1482 ids_in_page = self.extract_videos_from_page(page['content_html'])
1483 video_ids.extend(ids_in_page)
1485 if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']:
1488 self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))
1490 urls = ['http://www.youtube.com/watch?v=%s' % id for id in video_ids]
1491 url_entries = [self.url_result(url, 'Youtube') for url in urls]
1492 return [self.playlist_result(url_entries, channel_id)]
# NOTE(review): gapped listing — the invalid-URL guard, the pagination loop
# header and ids_in_page initialisation are elided (line numbers jump).
1495 class YoutubeUserIE(InfoExtractor):
1496 """Information Extractor for YouTube users."""
1498 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
1499 _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
# GData caps uploads feeds at 50 entries per request.
1500 _GDATA_PAGE_SIZE = 50
1501 _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
1502 _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
1503 IE_NAME = u'youtube:user'
1505 def _real_extract(self, url):
1507 mobj = re.match(self._VALID_URL, url)
1509 raise ExtractorError(u'Invalid URL: %s' % url)
1511 username = mobj.group(1)
1513 # Download video ids using YouTube Data API. Result size per
1514 # query is limited (currently to 50 videos) so we need to query
1515 # page by page until there are no video ids - it means we got
# GData start-index is 1-based.
1522 start_index = pagenum * self._GDATA_PAGE_SIZE + 1
1524 gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
1525 page = self._download_webpage(gdata_url, username,
1526 u'Downloading video ids from %d to %d' % (start_index, start_index + self._GDATA_PAGE_SIZE))
1528 # Extract video identifiers
1531 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1532 if mobj.group(1) not in ids_in_page:
1533 ids_in_page.append(mobj.group(1))
1535 video_ids.extend(ids_in_page)
1537 # A little optimization - if current page is not
1538 # "full", ie. does not contain PAGE_SIZE video ids then
1539 # we can assume that this page is the last one - there
1540 # are no more ids on further pages - no need to query
1543 if len(ids_in_page) < self._GDATA_PAGE_SIZE:
1548 urls = ['http://www.youtube.com/watch?v=%s' % video_id for video_id in video_ids]
1549 url_results = [self.url_result(url, 'Youtube') for url in urls]
1550 return [self.playlist_result(url_results, playlist_title = username)]
# NOTE(review): gapped listing — _PAGE_SIZE definition, the invalid-URL
# guard, the pagination loop header and ids_in_page initialisation are
# elided; _PAGE_SIZE is referenced at original line 1603.
1553 class BlipTVUserIE(InfoExtractor):
1554 """Information Extractor for blip.tv users."""
1556 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
1558 IE_NAME = u'blip.tv:user'
1560 def _real_extract(self, url):
1562 mobj = re.match(self._VALID_URL, url)
1564 raise ExtractorError(u'Invalid URL: %s' % url)
1566 username = mobj.group(1)
# Mobile Ajax endpoint; the numeric users_id is scraped from the user page.
1568 page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'
1570 page = self._download_webpage(url, username, u'Downloading user page')
1571 mobj = re.search(r'data-users-id="([^"]+)"', page)
1572 page_base = page_base % mobj.group(1)
1575 # Download video ids using BlipTV Ajax calls. Result size per
1576 # query is limited (currently to 12 videos) so we need to query
1577 # page by page until there are no video ids - it means we got
1584 url = page_base + "&page=" + str(pagenum)
1585 page = self._download_webpage(url, username,
1586 u'Downloading video ids from page %d' % pagenum)
1588 # Extract video identifiers
1591 for mobj in re.finditer(r'href="/([^"]+)"', page):
1592 if mobj.group(1) not in ids_in_page:
1593 ids_in_page.append(unescapeHTML(mobj.group(1)))
1595 video_ids.extend(ids_in_page)
1597 # A little optimization - if current page is not
1598 # "full", ie. does not contain PAGE_SIZE video ids then
1599 # we can assume that this page is the last one - there
1600 # are no more ids on further pages - no need to query
1603 if len(ids_in_page) < self._PAGE_SIZE:
1608 urls = [u'http://blip.tv/%s' % video_id for video_id in video_ids]
1609 url_entries = [self.url_result(url, 'BlipTV') for url in urls]
1610 return [self.playlist_result(url_entries, playlist_title = username)]
# NOTE(review): gapped listing — `try:` header before the urlopen and the
# return-dict opener are elided. The .decode('utf-8') calls on file_id and
# file_url suggest Python 2 era byte strings — confirm before reuse.
1613 class DepositFilesIE(InfoExtractor):
1614 """Information extractor for depositfiles.com"""
1616 _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
1618 def _real_extract(self, url):
1619 file_id = url.split('/')[-1]
1620 # Rebuild url in english locale
1621 url = 'http://depositfiles.com/en/files/' + file_id
1623 # Retrieve file webpage with 'Free download' button pressed
# POSTing gateway_result=1 simulates pressing the "Free download" button.
1624 free_download_indication = { 'gateway_result' : '1' }
1625 request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
1627 self.report_download_webpage(file_id)
1628 webpage = compat_urllib_request.urlopen(request).read()
1629 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1630 raise ExtractorError(u'Unable to retrieve file webpage: %s' % compat_str(err))
1632 # Search for the real file URL
1633 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
1634 if (mobj is None) or (mobj.group(1) is None):
1635 # Try to figure out reason of the error.
# Depositfiles shows restriction notices in a <strong>Attention...</strong>
# block; surface that text to the user when present.
1636 mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
1637 if (mobj is not None) and (mobj.group(1) is not None):
1638 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
1639 raise ExtractorError(u'%s' % restriction_message)
1641 raise ExtractorError(u'Unable to extract download URL from: %s' % url)
1643 file_url = mobj.group(1)
1644 file_extension = os.path.splitext(file_url)[1][1:]
1646 # Search for file title
1647 file_title = self._search_regex(r'<b title="(.*?)">', webpage, u'title')
1650 'id': file_id.decode('utf-8'),
1651 'url': file_url.decode('utf-8'),
1653 'upload_date': None,
1654 'title': file_title,
1655 'ext': file_extension.decode('utf-8'),
# NOTE(review): gapped listing — the login_form construction, several
# guards (`if m is None:`, `if not video_url:`) and return statements are
# elided (line numbers jump).
1659 class FacebookIE(InfoExtractor):
1660 """Information Extractor for Facebook"""
1662 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
1663 _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
1664 _NETRC_MACHINE = 'facebook'
1665 IE_NAME = u'facebook'
1667 def report_login(self):
1668 """Report attempt to log in."""
1669 self.to_screen(u'Logging in')
# Optional login using --username/--password or the 'facebook' .netrc
# machine; failures only warn, extraction proceeds unauthenticated.
1671 def _real_initialize(self):
1672 if self._downloader is None:
1677 downloader_params = self._downloader.params
1679 # Attempt to use provided username and password or .netrc data
1680 if downloader_params.get('username', None) is not None:
1681 useremail = downloader_params['username']
1682 password = downloader_params['password']
1683 elif downloader_params.get('usenetrc', False):
1685 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
1686 if info is not None:
1690 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
1691 except (IOError, netrc.NetrcParseError) as err:
1692 self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
1695 if useremail is None:
1704 request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
1707 login_results = compat_urllib_request.urlopen(request).read()
# A login <form> still present in the response means authentication failed.
1708 if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
1709 self._downloader.report_warning(u'unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
1711 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1712 self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
1715 def _real_extract(self, url):
1716 mobj = re.match(self._VALID_URL, url)
1718 raise ExtractorError(u'Invalid URL: %s' % url)
1719 video_id = mobj.group('ID')
1721 url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
1722 webpage = self._download_webpage(url, video_id)
# The player parameters sit in inline JS between these two SWF setup
# snippets; extract and parse them as JSON.
1724 BEFORE = '{swf.addParam(param[0], param[1]);});\n'
1725 AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
1726 m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
1728 raise ExtractorError(u'Cannot parse data')
1729 data = dict(json.loads(m.group(1)))
1730 params_raw = compat_urllib_parse.unquote(data['params'])
1731 params = json.loads(params_raw)
1732 video_data = params['video_data'][0]
# Prefer the HD source, falling back to SD (fallback guard elided).
1733 video_url = video_data.get('hd_src')
1735 video_url = video_data['sd_src']
1737 raise ExtractorError(u'Cannot find video URL')
1738 video_duration = int(video_data['video_duration'])
1739 thumbnail = video_data['thumbnail_src']
1741 video_title = self._html_search_regex('<h2 class="uiHeaderTitle">([^<]+)</h2>',
# (elided) returns the info dict below.
1746 'title': video_title,
1749 'duration': video_duration,
1750 'thumbnail': thumbnail,
# NOTE(review): gapped listing — `try:` headers, the direct-download info
# dict, the `info = None` initialisation and several guards are elided.
1755 class BlipTVIE(InfoExtractor):
1756 """Information extractor for blip.tv"""
1758 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv/((.+/)|(play/)|(api\.swf#))(.+)$'
1759 _URL_EXT = r'^.*\.([a-z0-9]+)$'
1760 IE_NAME = u'blip.tv'
1762 def report_direct_download(self, title):
1763 """Report information extraction."""
1764 self.to_screen(u'%s: Direct download detected' % title)
1766 def _real_extract(self, url):
1767 mobj = re.match(self._VALID_URL, url)
1769 raise ExtractorError(u'Invalid URL: %s' % url)
1771 # See https://github.com/rg3/youtube-dl/issues/857
# api.swf#ID URLs are rewritten to /play/g_ID, which then redirects to a
# URL whose fragment carries the real file path — recurse once on that.
1772 api_mobj = re.match(r'http://a\.blip\.tv/api\.swf#(?P<video_id>[\d\w]+)', url)
1773 if api_mobj is not None:
1774 url = 'http://blip.tv/play/g_%s' % api_mobj.group('video_id')
1775 urlp = compat_urllib_parse_urlparse(url)
1776 if urlp.path.startswith('/play/'):
1777 request = compat_urllib_request.Request(url)
1778 response = compat_urllib_request.urlopen(request)
1779 redirecturl = response.geturl()
1780 rurlp = compat_urllib_parse_urlparse(redirecturl)
1781 file_id = compat_parse_qs(rurlp.fragment)['file'][0].rpartition('/')[2]
1782 url = 'http://blip.tv/a/a-' + file_id
1783 return self._real_extract(url)
# Ask for the JSON metadata variant of the page; the iTunes User-Agent
# is required by blip.tv (also echoed in the returned info dict).
1790 json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
1791 request = compat_urllib_request.Request(json_url)
1792 request.add_header('User-Agent', 'iTunes/10.6.1')
1793 self.report_extraction(mobj.group(1))
1796 urlh = compat_urllib_request.urlopen(request)
1797 if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
1798 basename = url.split('/')[-1]
1799 title,ext = os.path.splitext(basename)
1800 title = title.decode('UTF-8')
1801 ext = ext.replace('.', '')
1802 self.report_direct_download(title)
1807 'upload_date': None,
1812 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1813 raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
1814 if info is None: # Regular URL
1816 json_code_bytes = urlh.read()
1817 json_code = json_code_bytes.decode('utf-8')
1818 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1819 raise ExtractorError(u'Unable to read video info webpage: %s' % compat_str(err))
1822 json_data = json.loads(json_code)
1823 if 'Post' in json_data:
1824 data = json_data['Post']
# blip.tv datestamps look like '05-14-13 09:30AM'; normalise to YYYYMMDD.
1828 upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
1829 video_url = data['media']['url']
1830 umobj = re.match(self._URL_EXT, video_url)
1832 raise ValueError('Can not determine filename extension')
1833 ext = umobj.group(1)
1836 'id': data['item_id'],
1838 'uploader': data['display_name'],
1839 'upload_date': upload_date,
1840 'title': data['title'],
1842 'format': data['media']['mimeType'],
1843 'thumbnail': data['thumbnailUrl'],
1844 'description': data['description'],
1845 'player_url': data['embedUrl'],
1846 'user_agent': 'iTunes/10.6.1',
1848 except (ValueError,KeyError) as err:
1849 raise ExtractorError(u'Unable to parse video information: %s' % repr(err))
# NOTE(review): gapped listing — the rc4 x/y/out initialisation, the __md5
# helper's def line, the GK assignment opener, params/sec initialisation
# and several guards/returns are elided (line numbers jump).
1854 class MyVideoIE(InfoExtractor):
1855 """Information Extractor for myvideo.de."""
1857 _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
1858 IE_NAME = u'myvideo'
1860 # Original Code from: https://github.com/dersphere/plugin.video.myvideo_de.git
1861 # Released into the Public Domain by Tristan Fischer on 2013-05-19
1862 # https://github.com/rg3/youtube-dl/pull/842
# Standard RC4 stream cipher (KSA below, PRGA follows in elided lines);
# used to decrypt the player XML returned by myvideo.de.
1863 def __rc4crypt(self,data, key):
1865 box = list(range(256))
1866 for i in list(range(256)):
1867 x = (x + box[i] + compat_ord(key[i % len(key)])) % 256
1868 box[i], box[x] = box[x], box[i]
1874 y = (y + box[x]) % 256
1875 box[x], box[y] = box[y], box[x]
1876 out += chr(compat_ord(char) ^ box[(box[x] + box[y]) % 256])
# (part of an elided __md5 helper) hex digest of s, returned as bytes.
1880 return hashlib.md5(s).hexdigest().encode()
1882 def _real_extract(self,url):
1883 mobj = re.match(self._VALID_URL, url)
1885 raise ExtractorError(u'invalid URL: %s' % url)
1887 video_id = mobj.group(1)
# GK: doubly base64-encoded key material mixed into the RC4 key below.
1890 b'WXpnME1EZGhNRGhpTTJNM01XVmhOREU0WldNNVpHTTJOakpt'
1891 b'TW1FMU5tVTBNR05pWkRaa05XRXhNVFJoWVRVd1ptSXhaVEV3'
1892 b'TnpsbA0KTVRkbU1tSTRNdz09'
1896 webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
1897 webpage = self._download_webpage(webpage_url, video_id)
# Easy case: a plain <source src='...'> element — treat it as a flv URL.
1899 mobj = re.search('source src=\'(.+?)[.]([^.]+)\'', webpage)
1900 if mobj is not None:
1901 self.report_extraction(video_id)
1902 video_url = mobj.group(1) + '.flv'
1904 video_title = self._html_search_regex('<title>([^<]+)</title>',
1907 video_ext = self._search_regex('[.](.+?)$', video_url, u'extension')
1913 'upload_date': None,
1914 'title': video_title,
# Hard case: parameters live in the flashvars JS object; collect them
# (except _encxml, which is the encrypted-XML endpoint itself).
1919 mobj = re.search('var flashvars={(.+?)}', webpage)
1921 raise ExtractorError(u'Unable to extract video')
1926 for (a, b) in re.findall('(.+?):\'(.+?)\',?', sec):
1927 if not a == '_encxml':
1930 encxml = compat_urllib_parse.unquote(b)
1931 if not params.get('domain'):
1932 params['domain'] = 'www.myvideo.de'
1933 xmldata_url = '%s?%s' % (encxml, compat_urllib_parse.urlencode(params))
1934 if 'flash_playertype=MTV' in xmldata_url:
1935 self._downloader.report_warning(u'avoiding MTV player')
1937 'http://www.myvideo.de/dynamic/get_player_video_xml.php'
1938 '?flash_playertype=D&ID=%s&_countlimit=4&autorun=yes'
# The endpoint answers 'something=<hex>'; unhexlify then RC4-decrypt with
# a key derived from GK and the video id.
1942 enc_data = self._download_webpage(xmldata_url, video_id).split('=')[1]
1943 enc_data_b = binascii.unhexlify(enc_data)
1945 base64.b64decode(base64.b64decode(GK)) +
1947 str(video_id).encode('utf-8')
1950 dec_data = self.__rc4crypt(enc_data_b, sk)
1953 self.report_extraction(video_id)
# rtmp case: connectionurl present in the decrypted XML.
1956 mobj = re.search('connectionurl=\'(.*?)\'', dec_data)
1958 video_url = compat_urllib_parse.unquote(mobj.group(1))
1959 if 'myvideo2flash' in video_url:
1960 self._downloader.report_warning(u'forcing RTMPT ...')
1961 video_url = video_url.replace('rtmpe://', 'rtmpt://')
1964 # extract non rtmp videos
1965 mobj = re.search('path=\'(http.*?)\' source=\'(.*?)\'', dec_data)
1967 raise ExtractorError(u'unable to extract url')
1968 video_url = compat_urllib_parse.unquote(mobj.group(1)) + compat_urllib_parse.unquote(mobj.group(2))
1970 video_file = self._search_regex('source=\'(.*?)\'', dec_data, u'video file')
1971 video_file = compat_urllib_parse.unquote(video_file)
# .f4m sources map to an HLS playlist; others to an rtmp play path.
1973 if not video_file.endswith('f4m'):
1974 ppath, prefix = video_file.split('.')
1975 video_playpath = '%s:%s' % (prefix, ppath)
1976 video_hls_playlist = ''
1979 video_hls_playlist = (
1980 video_filepath + video_file
1981 ).replace('.f4m', '.m3u8')
1983 video_swfobj = self._search_regex('swfobject.embedSWF\(\'(.+?)\'', webpage, u'swfobj')
1984 video_swfobj = compat_urllib_parse.unquote(video_swfobj)
1986 video_title = self._html_search_regex("<h1(?: class='globalHd')?>(.*?)</h1>",
# (elided) returns the info dict below.
1992 'tc_url': video_url,
1994 'upload_date': None,
1995 'title': video_title,
1997 'play_path': video_playpath,
1998 'video_file': video_file,
1999 'video_hls_playlist': video_hls_playlist,
2000 'player_url': video_swfobj,
# ComedyCentralIE: extracts Daily Show / Colbert Report episodes and clips via
# the MTV Networks feeds (shadow.comedycentral.com MRSS index + mediaGen config),
# then rewrites the RTMP rendition URL onto an HTTP CDN base.
# NOTE(review): the embedded original line numbers below are non-contiguous
# (e.g. 2005->2007, 2052->2054, 2143->2146), so this listing is an elided
# excerpt — guards such as `if mobj is None:`, the format-dict bodies, and
# several loop/try headers are missing. Verify any logic change against the
# complete source file.
2004 class ComedyCentralIE(InfoExtractor):
2005 """Information extractor for The Daily Show and Colbert Report """
2007 # urls can be abbreviations like :thedailyshow or :colbert
2008 # urls for episodes like:
2009 # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
2010 # or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
2011 # or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
2012 _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
2013 |(https?://)?(www\.)?
2014 (?P<showname>thedailyshow|colbertnation)\.com/
2015 (full-episodes/(?P<episode>.*)|
2017 (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
2018 |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
# Bitrates listed worst-to-best; mappings to extensions/dimensions are elided here.
2021 _available_formats = ['3500', '2200', '1700', '1200', '750', '400']
2023 _video_extensions = {
2031 _video_dimensions = {
# Overrides the base suitable() because _VALID_URL needs re.VERBOSE
# (presumably decorated with @classmethod in the full source — elided here).
2041 def suitable(cls, url):
2042 """Receives a URL and returns True if suitable for this IE."""
2043 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
2045 def _print_formats(self, formats):
2046 print('Available formats:')
2048 print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))
2051 def _real_extract(self, url):
2052 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2054 raise ExtractorError(u'Invalid URL: %s' % url)
# Shorthand forms like ":tds" redirect to the show's full-episodes page.
2056 if mobj.group('shortname'):
2057 if mobj.group('shortname') in ('tds', 'thedailyshow'):
2058 url = u'http://www.thedailyshow.com/full-episodes/'
2060 url = u'http://www.colbertnation.com/full-episodes/'
2061 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2062 assert mobj is not None
2064 if mobj.group('clip'):
2065 if mobj.group('showname') == 'thedailyshow':
2066 epTitle = mobj.group('tdstitle')
2068 epTitle = mobj.group('cntitle')
2071 dlNewest = not mobj.group('episode')
2073 epTitle = mobj.group('showname')
2075 epTitle = mobj.group('episode')
2077 self.report_extraction(epTitle)
2078 webpage,htmlHandle = self._download_webpage_handle(url, epTitle)
# The "newest episode" page redirects; re-match the final URL to get the episode.
2080 url = htmlHandle.geturl()
2081 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2083 raise ExtractorError(u'Invalid redirected URL: ' + url)
2084 if mobj.group('episode') == '':
2085 raise ExtractorError(u'Redirected URL is still not specific: ' + url)
2086 epTitle = mobj.group('episode')
# Locate the mtvnservices Flash URL that carries the media URI.
2088 mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)
2090 if len(mMovieParams) == 0:
2091 # The Colbert Report embeds the information in a without
2092 # a URL prefix; so extract the alternate reference
2093 # and then add the URL prefix manually.
2095 altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
2096 if len(altMovieParams) == 0:
2097 raise ExtractorError(u'unable to find Flash URL in webpage ' + url)
2099 mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]
2101 uri = mMovieParams[0][1]
2102 indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
2103 indexXml = self._download_webpage(indexUrl, epTitle,
2104 u'Downloading show index',
2105 u'unable to download episode index')
2109 idoc = xml.etree.ElementTree.fromstring(indexXml)
2110 itemEls = idoc.findall('.//item')
# One MRSS <item> per episode part; each part becomes its own info dict.
2111 for partNum,itemEl in enumerate(itemEls):
2112 mediaId = itemEl.findall('./guid')[0].text
2113 shortMediaId = mediaId.split(':')[-1]
2114 showId = mediaId.split(':')[-2].replace('.com', '')
2115 officialTitle = itemEl.findall('./title')[0].text
2116 officialDate = unified_strdate(itemEl.findall('./pubDate')[0].text)
2118 configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
2119 compat_urllib_parse.urlencode({'uri': mediaId}))
2120 configXml = self._download_webpage(configUrl, epTitle,
2121 u'Downloading configuration for %s' % shortMediaId)
2123 cdoc = xml.etree.ElementTree.fromstring(configXml)
2125 for rendition in cdoc.findall('.//rendition'):
2126 finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
2130 self._downloader.report_error(u'unable to download ' + mediaId + ': No videos found')
2133 if self._downloader.params.get('listformats', None):
2134 self._print_formats([i[0] for i in turls])
2137 # For now, just pick the highest bitrate
2138 format,rtmp_video_url = turls[-1]
2140 # Get the format arg from the arg stream
2141 req_format = self._downloader.params.get('format', None)
2143 # Select format if we can find one
2146 format, rtmp_video_url = f, v
# Translate the RTMP rendition path onto the Limelight HTTP mirror.
2149 m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
2151 raise ExtractorError(u'Cannot transform RTMP url')
2152 base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
2153 video_url = base + m.group('finalid')
2155 effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
2160 'upload_date': officialDate,
2165 'description': officialTitle,
2167 results.append(info)
# EscapistIE: scrapes escapistmagazine.com video pages — pulls metadata from
# <meta> tags, then downloads the player's config (JavaScript masquerading as
# JSON) to obtain the real media URL.
# NOTE(review): embedded line numbers jump (2179->2181, 2208->2211), so this
# excerpt is missing statements such as the `if mobj is None:` guard and the
# `try:` before json.loads. Verify against the full source before editing logic.
2172 class EscapistIE(InfoExtractor):
2173 """Information extractor for The Escapist """
2175 _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
2176 IE_NAME = u'escapist'
2178 def _real_extract(self, url):
2179 mobj = re.match(self._VALID_URL, url)
2181 raise ExtractorError(u'Invalid URL: %s' % url)
2182 showName = mobj.group('showname')
2183 videoId = mobj.group('episode')
2185 self.report_extraction(videoId)
2186 webpage = self._download_webpage(url, videoId)
2188 videoDesc = self._html_search_regex('<meta name="description" content="([^"]*)"',
2189 webpage, u'description', fatal=False)
2191 imgUrl = self._html_search_regex('<meta property="og:image" content="([^"]*)"',
2192 webpage, u'thumbnail', fatal=False)
2194 playerUrl = self._html_search_regex('<meta property="og:video" content="([^"]*)"',
2195 webpage, u'player url')
# NOTE(review): this extracts the *title*, but the error label passed below is
# u'player url' — looks like a copy-paste slip from the search above; only the
# error message is affected. Fix in the full source.
2197 title = self._html_search_regex('<meta name="title" content="([^"]*)"',
2198 webpage, u'player url').split(' : ')[-1]
2200 configUrl = self._search_regex('config=(.*)$', playerUrl, u'config url')
2201 configUrl = compat_urllib_parse.unquote(configUrl)
2203 configJSON = self._download_webpage(configUrl, videoId,
2204 u'Downloading configuration',
2205 u'unable to download configuration')
2207 # Technically, it's JavaScript, not JSON
# Naive quote swap to make the JS object parseable as JSON; breaks if the
# payload contains apostrophes inside strings.
2208 configJSON = configJSON.replace("'", '"')
2211 config = json.loads(configJSON)
2212 except (ValueError,) as err:
2213 raise ExtractorError(u'Invalid JSON in configuration file: ' + compat_str(err))
2215 playlist = config['playlist']
# Index 1 of the playlist holds the actual video entry (index 0 presumably an
# intro/ad — TODO confirm against a live config).
2216 videoUrl = playlist[1]['url']
2221 'uploader': showName,
2222 'upload_date': None,
2225 'thumbnail': imgUrl,
2226 'description': videoDesc,
2227 'player_url': playerUrl,
# CollegeHumorIE: fetches the moogaloop metadata XML for a video id, then the
# Adobe HDS (f4m) manifest, and synthesizes a direct segment URL from it.
# NOTE(review): embedded line numbers jump (2244->2246, 2256->2258), so this
# excerpt omits statements (the `if mobj is None:` guard, `try:` lines, the
# `info` dict literal). Verify against the full source before editing logic.
2232 class CollegeHumorIE(InfoExtractor):
2233 """Information extractor for collegehumor.com"""
2236 _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
2237 IE_NAME = u'collegehumor'
2239 def report_manifest(self, video_id):
2240 """Report information extraction."""
2241 self.to_screen(u'%s: Downloading XML manifest' % video_id)
2243 def _real_extract(self, url):
2244 mobj = re.match(self._VALID_URL, url)
2246 raise ExtractorError(u'Invalid URL: %s' % url)
2247 video_id = mobj.group('videoid')
2252 'upload_date': None,
2255 self.report_extraction(video_id)
2256 xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
2258 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
2259 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2260 raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
2262 mdoc = xml.etree.ElementTree.fromstring(metaXml)
2264 videoNode = mdoc.findall('./video')[0]
2265 info['description'] = videoNode.findall('./description')[0].text
2266 info['title'] = videoNode.findall('./caption')[0].text
2267 info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
2268 manifest_url = videoNode.findall('./file')[0].text
2270 raise ExtractorError(u'Invalid metadata XML file')
2272 manifest_url += '?hdcore=2.10.3'
2273 self.report_manifest(video_id)
2275 manifestXml = compat_urllib_request.urlopen(manifest_url).read()
# NOTE(review): this error message says 'video info XML' although the download
# here is the f4m *manifest* — presumably copy-pasted from above; consider
# correcting the message in the full source.
2276 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2277 raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
2279 adoc = xml.etree.ElementTree.fromstring(manifestXml)
# f4m manifest elements live in the Adobe f4m 1.0 XML namespace.
2281 media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
2282 node_id = media_node.attrib['url']
2283 video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
2284 except IndexError as err:
2285 raise ExtractorError(u'Invalid manifest file')
2287 url_pr = compat_urllib_parse_urlparse(manifest_url)
# Build a direct fragment URL from the manifest host + media node id.
2288 url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'
# XVideosIE: scrapes an xvideos.com watch page for the flv_url parameter,
# page <title>, and thumbnail path.
# NOTE(review): embedded line numbers jump (2302->2304, 2321->2327), so the
# `if mobj is None:` guard and the start of the returned info dict are elided
# from this excerpt.
2295 class XVideosIE(InfoExtractor):
2296 """Information extractor for xvideos.com"""
2298 _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
2299 IE_NAME = u'xvideos'
2301 def _real_extract(self, url):
2302 mobj = re.match(self._VALID_URL, url)
2304 raise ExtractorError(u'Invalid URL: %s' % url)
2305 video_id = mobj.group(1)
2307 webpage = self._download_webpage(url, video_id)
2309 self.report_extraction(video_id)
# The flash URL is percent-encoded inside the page; unquote after matching.
2312 video_url = compat_urllib_parse.unquote(self._search_regex(r'flv_url=(.+?)&',
2313 webpage, u'video URL'))
2316 video_title = self._html_search_regex(r'<title>(.*?)\s+-\s+XVID',
2319 # Extract video thumbnail
2320 video_thumbnail = self._search_regex(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)',
2321 webpage, u'thumbnail', fatal=False)
2327 'upload_date': None,
2328 'title': video_title,
2330 'thumbnail': video_thumbnail,
2331 'description': None,
# SoundcloudIE: resolves a soundcloud.com track URL via the public resolve.json
# API, then reads the track's stream definitions to get the MP3 URL.
# NOTE(review): embedded line numbers jump (2354->2356, 2389->2391), so the
# `if mobj is None:` guard and parts of the returned dict are elided here.
# NOTE(review): the client_id below is hard-coded; it is service-issued and may
# be revoked at any time.
2337 class SoundcloudIE(InfoExtractor):
2338 """Information extractor for soundcloud.com
2339 To access the media, the uid of the song and a stream token
2340 must be extracted from the page source and the script must make
2341 a request to media.soundcloud.com/crossdomain.xml. Then
2342 the media can be grabbed by requesting from an url composed
2343 of the stream token and uid
2346 _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
2347 IE_NAME = u'soundcloud'
2349 def report_resolve(self, video_id):
2350 """Report information extraction."""
2351 self.to_screen(u'%s: Resolving id' % video_id)
2353 def _real_extract(self, url):
2354 mobj = re.match(self._VALID_URL, url)
2356 raise ExtractorError(u'Invalid URL: %s' % url)
2358 # extract uploader (which is in the url)
2359 uploader = mobj.group(1)
2360 # extract simple title (uploader + slug of song title)
2361 slug_title = mobj.group(2)
2362 simple_title = uploader + u'-' + slug_title
2363 full_title = '%s/%s' % (uploader, slug_title)
2365 self.report_resolve(full_title)
2367 url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
2368 resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2369 info_json = self._download_webpage(resolv_url, full_title, u'Downloading info JSON')
2371 info = json.loads(info_json)
2372 video_id = info['id']
2373 self.report_extraction(full_title)
# Second request: per-track stream catalogue; pick the 128kbps MP3 stream.
2375 streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2376 stream_json = self._download_webpage(streams_url, full_title,
2377 u'Downloading stream definitions',
2378 u'unable to download stream definitions')
2380 streams = json.loads(stream_json)
2381 mediaURL = streams['http_mp3_128_url']
2382 upload_date = unified_strdate(info['created_at'])
2387 'uploader': info['user']['username'],
2388 'upload_date': upload_date,
2389 'title': info['title'],
2391 'description': info['description'],
# SoundcloudSetIE: like SoundcloudIE but for /sets/ (playlist) URLs — resolves
# the set, then fetches stream definitions for every track in it.
# NOTE(review): embedded line numbers jump (2411->2413, 2444->2449), so the
# `if mobj is None:` guard, the early return after reporting errors, and the
# per-track info-dict literal are elided from this excerpt.
2394 class SoundcloudSetIE(InfoExtractor):
2395 """Information extractor for soundcloud.com sets
2396 To access the media, the uid of the song and a stream token
2397 must be extracted from the page source and the script must make
2398 a request to media.soundcloud.com/crossdomain.xml. Then
2399 the media can be grabbed by requesting from an url composed
2400 of the stream token and uid
2403 _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/sets/([\w\d-]+)'
2404 IE_NAME = u'soundcloud:set'
2406 def report_resolve(self, video_id):
2407 """Report information extraction."""
2408 self.to_screen(u'%s: Resolving id' % video_id)
2410 def _real_extract(self, url):
2411 mobj = re.match(self._VALID_URL, url)
2413 raise ExtractorError(u'Invalid URL: %s' % url)
2415 # extract uploader (which is in the url)
2416 uploader = mobj.group(1)
2417 # extract simple title (uploader + slug of song title)
2418 slug_title = mobj.group(2)
2419 simple_title = uploader + u'-' + slug_title
2420 full_title = '%s/sets/%s' % (uploader, slug_title)
2422 self.report_resolve(full_title)
2424 url = 'http://soundcloud.com/%s/sets/%s' % (uploader, slug_title)
2425 resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2426 info_json = self._download_webpage(resolv_url, full_title)
2429 info = json.loads(info_json)
# The resolve endpoint reports failures inside the JSON body, not via HTTP.
2430 if 'errors' in info:
2431 for err in info['errors']:
2432 self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err['error_message']))
2435 self.report_extraction(full_title)
2436 for track in info['tracks']:
2437 video_id = track['id']
2439 streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2440 stream_json = self._download_webpage(streams_url, video_id, u'Downloading track info JSON')
2442 self.report_extraction(video_id)
2443 streams = json.loads(stream_json)
2444 mediaURL = streams['http_mp3_128_url']
2449 'uploader': track['user']['username'],
2450 'upload_date': unified_strdate(track['created_at']),
2451 'title': track['title'],
2453 'description': track['description'],
# InfoQIE: decodes the base64 `jsclassref` embedded in the page into an RTMPE
# path and derives the video id/extension from that path's filename.
# NOTE(review): embedded line numbers jump (2463->2465, 2475->2478), so the
# `if mobj is None:` guard and part of the info dict are elided here.
2458 class InfoQIE(InfoExtractor):
2459 """Information extractor for infoq.com"""
2460 _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
2462 def _real_extract(self, url):
2463 mobj = re.match(self._VALID_URL, url)
2465 raise ExtractorError(u'Invalid URL: %s' % url)
# No numeric id in the URL, so the full URL doubles as the download id.
2467 webpage = self._download_webpage(url, video_id=url)
2468 self.report_extraction(url)
2471 mobj = re.search(r"jsclassref ?= ?'([^']*)'", webpage)
2473 raise ExtractorError(u'Unable to extract video url')
# jsclassref is base64-encoded and percent-escaped; decode both layers.
2474 real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
2475 video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id
2478 video_title = self._search_regex(r'contentTitle = "(.*?)";',
2481 # Extract description
2482 video_description = self._html_search_regex(r'<meta name="description" content="(.*)"(?:\s*/)?>',
2483 webpage, u'description', fatal=False)
2485 video_filename = video_url.split('/')[-1]
2486 video_id, extension = video_filename.split('.')
2492 'upload_date': None,
2493 'title': video_title,
2494 'ext': extension, # Extension is always(?) mp4, but seems to be flv
2496 'description': video_description,
# MixcloudIE: queries mixcloud's JSON cloudcast API, picks a format/bitrate,
# and probes candidate URLs until one responds. Marked _WORKING = False, i.e.
# disabled pending a port to the newer API.
# NOTE(review): embedded line numbers are non-contiguous throughout (e.g.
# 2513->2516, 2527->2529, 2583->2586), so `try:` lines, `return`s and loop
# bodies are elided from this excerpt. Verify against the full source.
2501 class MixcloudIE(InfoExtractor):
2502 """Information extractor for www.mixcloud.com"""
2504 _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
2505 _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
2506 IE_NAME = u'mixcloud'
2508 def report_download_json(self, file_id):
2509 """Report JSON download."""
2510 self.to_screen(u'Downloading json')
2512 def get_urls(self, jsonData, fmt, bitrate='best'):
2513 """Get urls from 'audio_formats' section in json"""
# Some entries map format -> {bitrate: [urls]}, others format -> [urls];
# the TypeError fallback below handles the flat case.
2516 bitrate_list = jsonData[fmt]
2517 if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
2518 bitrate = max(bitrate_list) # select highest
2520 url_list = jsonData[fmt][bitrate]
2521 except TypeError: # we have no bitrate info.
2522 url_list = jsonData[fmt]
2525 def check_urls(self, url_list):
2526 """Returns 1st active url from list"""
2527 for url in url_list:
2529 compat_urllib_request.urlopen(url)
2531 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2536 def _print_formats(self, formats):
2537 print('Available formats:')
2538 for fmt in formats.keys():
2539 for b in formats[fmt]:
2541 ext = formats[fmt][b][0]
2542 print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
2543 except TypeError: # we have no bitrate info
2544 ext = formats[fmt][0]
2545 print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
2548 def _real_extract(self, url):
2549 mobj = re.match(self._VALID_URL, url)
2551 raise ExtractorError(u'Invalid URL: %s' % url)
2552 # extract uploader & filename from url
# NOTE(review): .decode('utf-8') on regex groups implies this path targets
# Python 2 byte strings; under Python 3 these calls would raise AttributeError.
2553 uploader = mobj.group(1).decode('utf-8')
2554 file_id = uploader + "-" + mobj.group(2).decode('utf-8')
2556 # construct API request
2557 file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
2558 # retrieve .json file with links to files
2559 request = compat_urllib_request.Request(file_url)
2561 self.report_download_json(file_url)
2562 jsonData = compat_urllib_request.urlopen(request).read()
2563 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2564 raise ExtractorError(u'Unable to retrieve file: %s' % compat_str(err))
2567 json_data = json.loads(jsonData)
2568 player_url = json_data['player_swf_url']
2569 formats = dict(json_data['audio_formats'])
2571 req_format = self._downloader.params.get('format', None)
2574 if self._downloader.params.get('listformats', None):
2575 self._print_formats(formats)
# 'best' (or no request): probe formats in dict order and take the first live URL.
2578 if req_format is None or req_format == 'best':
2579 for format_param in formats.keys():
2580 url_list = self.get_urls(formats, format_param)
2582 file_url = self.check_urls(url_list)
2583 if file_url is not None:
2586 if req_format not in formats:
2587 raise ExtractorError(u'Format is not available')
2589 url_list = self.get_urls(formats, req_format)
2590 file_url = self.check_urls(url_list)
2591 format_param = req_format
2594 'id': file_id.decode('utf-8'),
2595 'url': file_url.decode('utf-8'),
2596 'uploader': uploader.decode('utf-8'),
2597 'upload_date': None,
2598 'title': json_data['name'],
2599 'ext': file_url.split('.')[-1].decode('utf-8'),
2600 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
2601 'thumbnail': json_data['thumbnail_url'],
2602 'description': json_data['description'],
2603 'player_url': player_url.decode('utf-8'),
# StanfordOpenClassroomIE: three-level extractor — a specific video (course +
# video params), a course page (list of VideoPage references), or the site root
# (list of CoursePage references). List pages recurse via self.extract().
# NOTE(review): embedded line numbers jump (2613->2615, 2633->2635, 2659->2662),
# so guards, `try:` lines and dict/list literals are elided from this excerpt.
2606 class StanfordOpenClassroomIE(InfoExtractor):
2607 """Information extractor for Stanford's Open ClassRoom"""
2609 _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
2610 IE_NAME = u'stanfordoc'
2612 def _real_extract(self, url):
2613 mobj = re.match(self._VALID_URL, url)
2615 raise ExtractorError(u'Invalid URL: %s' % url)
2617 if mobj.group('course') and mobj.group('video'): # A specific video
2618 course = mobj.group('course')
2619 video = mobj.group('video')
2621 'id': course + '_' + video,
2623 'upload_date': None,
2626 self.report_extraction(info['id'])
2627 baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
2628 xmlUrl = baseUrl + video + '.xml'
2630 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
2631 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2632 raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
2633 mdoc = xml.etree.ElementTree.fromstring(metaXml)
2635 info['title'] = mdoc.findall('./title')[0].text
2636 info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
2638 raise ExtractorError(u'Invalid metadata XML file')
2639 info['ext'] = info['url'].rpartition('.')[2]
2641 elif mobj.group('course'): # A course page
2642 course = mobj.group('course')
2647 'upload_date': None,
2650 coursepage = self._download_webpage(url, info['id'],
2651 note='Downloading course info page',
2652 errnote='Unable to download course info page')
2654 info['title'] = self._html_search_regex('<h1>([^<]+)</h1>', coursepage, 'title', default=info['id'])
2656 info['description'] = self._html_search_regex('<description>([^<]+)</description>',
2657 coursepage, u'description', fatal=False)
# Collect per-video links, dedup while preserving order.
2659 links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
2662 'type': 'reference',
2663 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
2667 for entry in info['list']:
2668 assert entry['type'] == 'reference'
2669 results += self.extract(entry['url'])
2673 'id': 'Stanford OpenClassroom',
2676 'upload_date': None,
2679 self.report_download_webpage(info['id'])
2680 rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
2682 rootpage = compat_urllib_request.urlopen(rootURL).read()
2683 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2684 raise ExtractorError(u'Unable to download course info page: ' + compat_str(err))
2686 info['title'] = info['id']
2688 links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
2691 'type': 'reference',
2692 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
2697 for entry in info['list']:
2698 assert entry['type'] == 'reference'
2699 results += self.extract(entry['url'])
# MTVIE: reads mtv_vt/mtv_an/mtvn_uri meta tags plus the player's playlist id,
# then fetches the mediaGen XML and takes the last (highest-quality) rendition.
# NOTE(review): embedded line numbers jump (2709->2711, 2719->2721, 2747->2749),
# so guards and `try:` lines are elided from this excerpt.
2702 class MTVIE(InfoExtractor):
2703 """Information extractor for MTV.com"""
2705 _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
2708 def _real_extract(self, url):
2709 mobj = re.match(self._VALID_URL, url)
2711 raise ExtractorError(u'Invalid URL: %s' % url)
2712 if not mobj.group('proto'):
2713 url = 'http://' + url
2714 video_id = mobj.group('videoid')
2716 webpage = self._download_webpage(url, video_id)
2718 song_name = self._html_search_regex(r'<meta name="mtv_vt" content="([^"]+)"/>',
2719 webpage, u'song name', fatal=False)
2721 video_title = self._html_search_regex(r'<meta name="mtv_an" content="([^"]+)"/>',
2724 mtvn_uri = self._html_search_regex(r'<meta name="mtvn_uri" content="([^"]+)"/>',
2725 webpage, u'mtvn_uri', fatal=False)
2727 content_id = self._search_regex(r'MTVN.Player.defaultPlaylistId = ([0-9]+);',
2728 webpage, u'content id', fatal=False)
# NOTE(review): mtvn_uri and content_id are searched with fatal=False, yet are
# concatenated here — presumably they'd be None on a miss and this line would
# raise TypeError. Confirm against the full source.
2730 videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
2731 self.report_extraction(video_id)
2732 request = compat_urllib_request.Request(videogen_url)
2734 metadataXml = compat_urllib_request.urlopen(request).read()
2735 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2736 raise ExtractorError(u'Unable to download video metadata: %s' % compat_str(err))
2738 mdoc = xml.etree.ElementTree.fromstring(metadataXml)
2739 renditions = mdoc.findall('.//rendition')
2741 # For now, always pick the highest quality.
2742 rendition = renditions[-1]
# Format label: "<ext>-<width>x<height>_<bitrate>" from rendition attributes.
2745 _,_,ext = rendition.attrib['type'].partition('/')
2746 format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
2747 video_url = rendition.find('./src').text
2749 raise ExtractorError('Invalid rendition field.')
2754 'uploader': performer,
2755 'upload_date': None,
2756 'title': video_title,
# YoukuIE: fetches Youku's getPlayList JSON, reconstructs the obfuscated file
# id from a seed-driven keystream (_get_file_ID_mix_string/_get_file_id), and
# emits one download URL per segment, keyed by the per-segment 'k' token.
# NOTE(review): embedded line numbers jump throughout (2767 `def _gen_sid` line
# itself, 2775 `mixed = []`, 2789-2791 of _get_file_id, the format-selection
# branches 2817-2828, etc. are elided). Verify against the full source before
# changing logic.
2764 class YoukuIE(InfoExtractor):
2765 _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'
# Session id: millisecond timestamp + two bounded random ints (not secure,
# mirrors the site player's own scheme).
2768 nowTime = int(time.time() * 1000)
2769 random1 = random.randint(1000,1998)
2770 random2 = random.randint(1000,9999)
2772 return "%d%d%d" %(nowTime,random1,random2)
2774 def _get_file_ID_mix_string(self, seed):
2776 source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
# Deterministic LCG-style shuffle of the alphabet, seeded by the server's seed.
2778 for i in range(len(source)):
2779 seed = (seed * 211 + 30031 ) % 65536
2780 index = math.floor(seed / 65536 * len(source) )
2781 mixed.append(source[int(index)])
2782 source.remove(source[int(index)])
2783 #return ''.join(mixed)
2786 def _get_file_id(self, fileId, seed):
2787 mixed = self._get_file_ID_mix_string(seed)
2788 ids = fileId.split('*')
2792 realId.append(mixed[int(ch)])
2793 return ''.join(realId)
2795 def _real_extract(self, url):
2796 mobj = re.match(self._VALID_URL, url)
2798 raise ExtractorError(u'Invalid URL: %s' % url)
2799 video_id = mobj.group('ID')
2801 info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id
2803 jsondata = self._download_webpage(info_url, video_id)
2805 self.report_extraction(video_id)
2807 config = json.loads(jsondata)
2809 video_title = config['data'][0]['title']
2810 seed = config['data'][0]['seed']
2812 format = self._downloader.params.get('format', None)
2813 supported_format = list(config['data'][0]['streamfileids'].keys())
2815 if format is None or format == 'best':
2816 if 'hd2' in supported_format:
2821 elif format == 'worst':
2829 fileid = config['data'][0]['streamfileids'][format]
2830 keys = [s['k'] for s in config['data'][0]['segs'][format]]
2831 except (UnicodeDecodeError, ValueError, KeyError):
2832 raise ExtractorError(u'Unable to extract info section')
2835 sid = self._gen_sid()
2836 fileid = self._get_file_id(fileid, seed)
2838 #column 8,9 of fileid represent the segment number
2839 #fileid[7:9] should be changed
2840 for index, key in enumerate(keys):
2842 temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
2843 download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)
2846 'id': '%s_part%02d' % (video_id, index),
2847 'url': download_url,
2849 'upload_date': None,
2850 'title': video_title,
2853 files_info.append(info)
# XNXXIE: scrapes video.xnxx.com pages; the three class-level regexes pull the
# flv URL, the <title>, and the big-thumbnail URL out of the page source.
# NOTE(review): embedded line numbers jump (2868->2870, 2884->2890), so the
# `if mobj is None:` guard and the start of the info dict are elided here.
2858 class XNXXIE(InfoExtractor):
2859 """Information extractor for xnxx.com"""
2861 _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
2863 VIDEO_URL_RE = r'flv_url=(.*?)&'
2864 VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
2865 VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&'
2867 def _real_extract(self, url):
2868 mobj = re.match(self._VALID_URL, url)
2870 raise ExtractorError(u'Invalid URL: %s' % url)
2871 video_id = mobj.group(1)
2873 # Get webpage content
2874 webpage = self._download_webpage(url, video_id)
# The flv URL is percent-encoded inside the player parameters.
2876 video_url = self._search_regex(self.VIDEO_URL_RE,
2877 webpage, u'video URL')
2878 video_url = compat_urllib_parse.unquote(video_url)
2880 video_title = self._html_search_regex(self.VIDEO_TITLE_RE,
2883 video_thumbnail = self._search_regex(self.VIDEO_THUMB_RE,
2884 webpage, u'thumbnail', fatal=False)
2890 'upload_date': None,
2891 'title': video_title,
2893 'thumbnail': video_thumbnail,
2894 'description': None,
# GooglePlusIE: two-step scrape of a plus.google.com post — first the post page
# for date/uploader/title and the photo-viewer URL, then the viewer page for
# the per-resolution googlevideo redirector links; highest resolution wins.
# NOTE(review): embedded line numbers jump (2906->2908, 2945->2947), so the
# `if mobj is None:`-style guards and the info-dict opener are elided here.
2898 class GooglePlusIE(InfoExtractor):
2899 """Information extractor for plus.google.com."""
2901 _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
2902 IE_NAME = u'plus.google'
2904 def _real_extract(self, url):
2905 # Extract id from URL
2906 mobj = re.match(self._VALID_URL, url)
2908 raise ExtractorError(u'Invalid URL: %s' % url)
2910 post_url = mobj.group(0)
2911 video_id = mobj.group(1)
2913 video_extension = 'flv'
2915 # Step 1, Retrieve post webpage to extract further information
2916 webpage = self._download_webpage(post_url, video_id, u'Downloading entry webpage')
2918 self.report_extraction(video_id)
2920 # Extract update date
2921 upload_date = self._html_search_regex('title="Timestamp">(.*?)</a>',
2922 webpage, u'upload date', fatal=False)
2924 # Convert timestring to a format suitable for filename
# NOTE(review): strptime here assumes the scraped timestamp is exactly
# "%Y-%m-%d", and would raise if the fatal=False search returned None —
# confirm against the full source.
2925 upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
2926 upload_date = upload_date.strftime('%Y%m%d')
2929 uploader = self._html_search_regex(r'rel\="author".*?>(.*?)</a>',
2930 webpage, u'uploader', fatal=False)
2933 # Get the first line for title
2934 video_title = self._html_search_regex(r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]',
2935 webpage, 'title', default=u'NA')
2937 # Step 2, Stimulate clicking the image box to launch video
2938 video_page = self._search_regex('"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]',
2939 webpage, u'video page URL')
2940 webpage = self._download_webpage(video_page, video_id, u'Downloading video page')
2942 # Extract video links on video page
2943 """Extract video links of all sizes"""
2944 pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
2945 mobj = re.findall(pattern, webpage)
2947 raise ExtractorError(u'Unable to extract video links')
2949 # Sort in resolution
2950 links = sorted(mobj)
2952 # Choose the lowest of the sort, i.e. highest resolution
2953 video_url = links[-1]
2954 # Only get the url. The resolution part in the tuple has no use anymore
2955 video_url = video_url[-1]
2956 # Treat escaped \u0026 style hex
# Py2 str has .decode; Py3 str does not, hence the AttributeError fallback.
2958 video_url = video_url.decode("unicode_escape")
2959 except AttributeError: # Python 3
2960 video_url = bytes(video_url, 'ascii').decode('unicode-escape')
2966 'uploader': uploader,
2967 'upload_date': upload_date,
2968 'title': video_title,
2969 'ext': video_extension,
# NBAIE: builds the 720p MP4 URL directly from the CDN path pattern embedded in
# the page URL; title/description come from meta tags.
# NOTE(review): embedded line numbers jump (2977->2979, 2994->2997), so the
# `if mobj is None:` guard and parts of the returned dict are elided here.
2972 class NBAIE(InfoExtractor):
2973 _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*?)(?:/index\.html)?(?:\?.*)?$'
2976 def _real_extract(self, url):
2977 mobj = re.match(self._VALID_URL, url)
2979 raise ExtractorError(u'Invalid URL: %s' % url)
2981 video_id = mobj.group(1)
2983 webpage = self._download_webpage(url, video_id)
# Direct CDN URL synthesized from the path — no player config needed.
2985 video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'
2987 shortened_video_id = video_id.rpartition('/')[2]
2988 title = self._html_search_regex(r'<meta property="og:title" content="(.*?)"',
2989 webpage, 'title', default=shortened_video_id).replace('NBA.com: ', '')
2991 # It isn't there in the HTML it returns to us
2992 # uploader_date = self._html_search_regex(r'<b>Date:</b> (.*?)</div>', webpage, 'upload_date', fatal=False)
2994 description = self._html_search_regex(r'<meta name="description" (?:content|value)="(.*?)" />', webpage, 'description', fatal=False)
2997 'id': shortened_video_id,
3001 # 'uploader_date': uploader_date,
3002 'description': description,
# JustinTVIE: extractor for justin.tv / twitch.tv channels, broadcasts (/b/)
# and chapters (/c/). Channel and broadcast info come from the paged JSON
# API; chapters go through an XML broadcast lookup plus the Kraken JSON API.
# NOTE(review): excerpt is elided (embedded stale numbers, gaps) — e.g. the
# "if mobj is None:" / "if m is None:" guards and the per-clip "info.append({"
# openers are missing from view. Code left byte-identical.
3006 class JustinTVIE(InfoExtractor):
3007     """Information extractor for justin.tv and twitch.tv"""
3008     # TODO: One broadcast may be split into multiple videos. The key
3009     # 'broadcast_id' is the same for all parts, and 'broadcast_part'
3010     # starts at 1 and increases. Can we treat all parts as one video?
3012     _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
3014         (?P<channelid>[^/]+)|
3015         (?:(?:[^/]+)/b/(?P<videoid>[^/]+))|
3016         (?:(?:[^/]+)/c/(?P<chapterid>[^/]+))
3020     _JUSTIN_PAGE_LIMIT = 100
3021     IE_NAME = u'justin.tv'
3023     def report_download_page(self, channel, offset):
3024         """Report attempt to download a single page of videos."""
3025         self.to_screen(u'%s: Downloading video information from %d to %d' %
3026                 (channel, offset, offset + self._JUSTIN_PAGE_LIMIT))
3028     # Return count of items, list of *valid* items
3029     def _parse_page(self, url, video_id):
3030         webpage = self._download_webpage(url, video_id,
3031                                          u'Downloading video info JSON',
3032                                          u'unable to download video info JSON')
3034         response = json.loads(webpage)
# A non-list response is an API error object with an 'error' field.
3035         if type(response) != list:
3036             error_text = response.get('error', 'unknown error')
3037             raise ExtractorError(u'Justin.tv API: %s' % error_text)
3039         for clip in response:
3040             video_url = clip['video_file_url']
3042                 video_extension = os.path.splitext(video_url)[1][1:]
# start_time is ISO-ish; strip dashes from the date part to get YYYYMMDD.
3043                 video_date = re.sub('-', '', clip['start_time'][:10])
3044                 video_uploader_id = clip.get('user_id', clip.get('channel_id'))
3045                 video_id = clip['id']
3046                 video_title = clip.get('title', video_id)
3050                     'title': video_title,
3051                     'uploader': clip.get('channel_name', video_uploader_id),
3052                     'uploader_id': video_uploader_id,
3053                     'upload_date': video_date,
3054                     'ext': video_extension,
3056         return (len(response), info)
3058     def _real_extract(self, url):
3059         mobj = re.match(self._VALID_URL, url)
3061             raise ExtractorError(u'invalid URL: %s' % url)
3063         api_base = 'http://api.justin.tv'
3065         if mobj.group('channelid'):
3067             video_id = mobj.group('channelid')
3068             api = api_base + '/channel/archives/%s.json' % video_id
3069         elif mobj.group('chapterid'):
3070             chapter_id = mobj.group('chapterid')
# The archive id is embedded in the chapter page's inline JavaScript.
3072             webpage = self._download_webpage(url, chapter_id)
3073             m = re.search(r'PP\.archive_id = "([0-9]+)";', webpage)
3075                 raise ExtractorError(u'Cannot find archive of a chapter')
3076             archive_id = m.group(1)
3078             api = api_base + '/broadcast/by_chapter/%s.xml' % chapter_id
3079             chapter_info_xml = self._download_webpage(api, chapter_id,
3080                                              note=u'Downloading chapter information',
3081                                              errnote=u'Chapter information download failed')
3082             doc = xml.etree.ElementTree.fromstring(chapter_info_xml)
3083             for a in doc.findall('.//archive'):
3084                 if archive_id == a.find('./id').text:
3087                 raise ExtractorError(u'Could not find chapter in chapter information')
3089             video_url = a.find('./video_file_url').text
3090             video_ext = video_url.rpartition('.')[2] or u'flv'
3092             chapter_api_url = u'https://api.twitch.tv/kraken/videos/c' + chapter_id
3093             chapter_info_json = self._download_webpage(chapter_api_url, u'c' + chapter_id,
3094                                                    note='Downloading chapter metadata',
3095                                                    errnote='Download of chapter metadata failed')
3096             chapter_info = json.loads(chapter_info_json)
3098             bracket_start = int(doc.find('.//bracket_start').text)
3099             bracket_end = int(doc.find('.//bracket_end').text)
3101             # TODO determine start (and probably fix up file)
3102             #  youtube-dl -v http://www.twitch.tv/firmbelief/c/1757457
3103             #video_url += u'?start=' + TODO:start_timestamp
3104             # bracket_start is 13290, but we want 51670615
3105             self._downloader.report_warning(u'Chapter detected, but we can just download the whole file. '
3106                                             u'Chapter starts at %s and ends at %s' % (formatSeconds(bracket_start), formatSeconds(bracket_end)))
3109                 'id': u'c' + chapter_id,
3112                 'title': chapter_info['title'],
3113                 'thumbnail': chapter_info['preview'],
3114                 'description': chapter_info['description'],
3115                 'uploader': chapter_info['channel']['display_name'],
3116                 'uploader_id': chapter_info['channel']['name'],
3120             video_id = mobj.group('videoid')
3121             api = api_base + '/broadcast/by_archive/%s.json' % video_id
3123         self.report_extraction(video_id)
# Page through the API until a short page (or paging disabled) ends the loop.
3127         limit = self._JUSTIN_PAGE_LIMIT
3130             self.report_download_page(video_id, offset)
3131             page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
3132             page_count, page_info = self._parse_page(page_url, video_id)
3133             info.extend(page_info)
3134             if not paged or page_count != limit:
# FunnyOrDieIE: extractor for funnyordie.com video pages; scrapes the video
# source URL, title and og:description out of the page HTML.
# NOTE(review): excerpt is elided (guard after re.match and the return dict
# opener are missing from view). Code left byte-identical.
3139 class FunnyOrDieIE(InfoExtractor):
3140     _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'
3142     def _real_extract(self, url):
3143         mobj = re.match(self._VALID_URL, url)
3145             raise ExtractorError(u'invalid URL: %s' % url)
3147         video_id = mobj.group('id')
3148         webpage = self._download_webpage(url, video_id)
# Second <source> element inside the <video> tag carries the file URL.
3150         video_url = self._html_search_regex(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"',
3151             webpage, u'video URL', flags=re.DOTALL)
3153         title = self._html_search_regex((r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>",
3154             r'<title>(?P<title>[^<]+?)</title>'), webpage, 'title', flags=re.DOTALL)
3156         video_description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
3157             webpage, u'description', fatal=False, flags=re.DOTALL)
3164             'description': video_description,
# SteamIE: extractor for store.steampowered.com video/app pages. Handles the
# age gate, then zips together the movie URLs, titles and thumbnails found in
# the page and returns them as a playlist.
# NOTE(review): excerpt is elided (the 'gameID' group referenced below is not
# visible in the shown _VALID_URL fragment; the videos list/append lines are
# missing). Code left byte-identical.
3168 class SteamIE(InfoExtractor):
3169     _VALID_URL = r"""http://store\.steampowered\.com/
3171                 (?P<urltype>video|app)/ #If the page is only for videos or for a game
3173                 (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
3175     _VIDEO_PAGE_TEMPLATE = 'http://store.steampowered.com/video/%s/'
3176     _AGECHECK_TEMPLATE = 'http://store.steampowered.com/agecheck/video/%s/?snr=1_agecheck_agecheck__age-gate&ageDay=1&ageMonth=January&ageYear=1970'
3179     def suitable(cls, url):
3180         """Receives a URL and returns True if suitable for this IE."""
# re.VERBOSE is required because _VALID_URL is written with inline comments.
3181         return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
3183     def _real_extract(self, url):
3184         m = re.match(self._VALID_URL, url, re.VERBOSE)
3185         gameID = m.group('gameID')
3187         videourl = self._VIDEO_PAGE_TEMPLATE % gameID
3188         webpage = self._download_webpage(videourl, gameID)
# Retry through the age-check URL (fixed fake birth date) when gated.
3190         if re.search('<h2>Please enter your birth date to continue:</h2>', webpage) is not None:
3191             videourl = self._AGECHECK_TEMPLATE % gameID
3192             self.report_age_confirmation()
3193             webpage = self._download_webpage(videourl, gameID)
3195         self.report_extraction(gameID)
3196         game_title = self._html_search_regex(r'<h2 class="pageheader">(.*?)</h2>',
3197                                              webpage, 'game title')
3199         urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
3200         mweb = re.finditer(urlRE, webpage)
3201         namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
3202         titles = re.finditer(namesRE, webpage)
3203         thumbsRE = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'
3204         thumbs = re.finditer(thumbsRE, webpage)
# Relies on the three finditer streams staying in lockstep — fragile scraping.
3206         for vid,vtitle,thumb in zip(mweb,titles,thumbs):
3207             video_id = vid.group('videoID')
3208             title = vtitle.group('videoName')
3209             video_url = vid.group('videoURL')
3210             video_thumb = thumb.group('thumbnail')
3212                 raise ExtractorError(u'Cannot find video url for %s' % video_id)
3217                 'title': unescapeHTML(title),
3218                 'thumbnail': video_thumb
3221         return [self.playlist_result(videos, gameID, game_title)]
# UstreamIE: extractor for ustream.tv recorded videos; the media URL is built
# directly from the video id, metadata is scraped from the page.
# NOTE(review): excerpt is elided (title regex's trailing arguments and the
# return-dict opener are missing from view). Code left byte-identical.
3223 class UstreamIE(InfoExtractor):
3224     _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
3225     IE_NAME = u'ustream'
3227     def _real_extract(self, url):
3228         m = re.match(self._VALID_URL, url)
3229         video_id = m.group('videoID')
# Direct CDN URL pattern — no API round-trip needed for the media file.
3231         video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
3232         webpage = self._download_webpage(url, video_id)
3234         self.report_extraction(video_id)
3236         video_title = self._html_search_regex(r'data-title="(?P<title>.+)"',
3239         uploader = self._html_search_regex(r'data-content-type="channel".*?>(?P<uploader>.*?)</a>',
3240             webpage, u'uploader', fatal=False, flags=re.DOTALL)
3242         thumbnail = self._html_search_regex(r'<link rel="image_src" href="(?P<thumb>.*?)"',
3243             webpage, u'thumbnail', fatal=False)
3249                 'title': video_title,
3250                 'uploader': uploader,
3251                 'thumbnail': thumbnail,
# WorldStarHipHopIE: extractor for worldstarhiphop.com (and the "candy"
# variant); pulls the flash player's file variable and fixes up the title for
# candy videos.
# NOTE(review): excerpt is elided (the mp4/flv ext branches and return-dict
# opener are missing from view). Code left byte-identical.
3255 class WorldStarHipHopIE(InfoExtractor):
3256     _VALID_URL = r'https?://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
3257     IE_NAME = u'WorldStarHipHop'
3259     def _real_extract(self, url):
3260         m = re.match(self._VALID_URL, url)
3261         video_id = m.group('id')
3263         webpage_src = self._download_webpage(url, video_id)
3265         video_url = self._search_regex(r'so\.addVariable\("file","(.*?)"\)',
3266             webpage_src, u'video URL')
# Extension branch — the 'mp4' body itself is elided from this excerpt.
3268         if 'mp4' in video_url:
3273         video_title = self._html_search_regex(r"<title>(.*)</title>",
3274             webpage_src, u'title')
3276         # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video.
3277         thumbnail = self._html_search_regex(r'rel="image_src" href="(.*)" />',
3278             webpage_src, u'thumbnail', fatal=False)
3281             _title = r"""candytitles.*>(.*)</span>"""
3282             mobj = re.search(_title, webpage_src)
3283             if mobj is not None:
3284                 video_title = mobj.group(1)
3289             'title' : video_title,
3290             'thumbnail' : thumbnail,
# RBMARadioIE: extractor for rbmaradio.com shows; reads the JSON blob the page
# assigns to window.gon.show and derives the stream URL from 'akamai_url'.
# NOTE(review): excerpt is elided (the return-dict opener and 'url'/'ext'
# entries are missing from view). Code left byte-identical.
3295 class RBMARadioIE(InfoExtractor):
3296     _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'
3298     def _real_extract(self, url):
3299         m = re.match(self._VALID_URL, url)
3300         video_id = m.group('videoID')
3302         webpage = self._download_webpage(url, video_id)
3304         json_data = self._search_regex(r'window\.gon.*?gon\.show=(.+?);$',
3305             webpage, u'json data', flags=re.MULTILINE)
3308             data = json.loads(json_data)
3309         except ValueError as e:
3310             raise ExtractorError(u'Invalid JSON: ' + str(e))
# Force a 256 kbps constant bitrate via the query string.
3312         video_url = data['akamai_url'] + '&cbr=256'
3313         url_parts = compat_urllib_parse_urlparse(video_url)
3314         video_ext = url_parts.path.rpartition('.')[2]
3319             'title': data['title'],
3320             'description': data.get('teaser_text'),
3321             'location': data.get('country_of_origin'),
3322             'uploader': data.get('host', {}).get('name'),
3323             'uploader_id': data.get('host', {}).get('slug'),
3324             'thumbnail': data.get('image', {}).get('large_url_2x'),
3325             'duration': data.get('duration'),
# YouPornIE: extractor for youporn.com. Sets an age-verification cookie,
# parses the page's embedded "new Video(...)" JSON, then builds one format
# entry per download link and applies the user's format selection.
# NOTE(review): excerpt is elided (the try: around json.loads, the per-link
# loop header, the formats list, and several branch bodies are missing from
# view). Code left byte-identical.
3330 class YouPornIE(InfoExtractor):
3331     """Information extractor for youporn.com."""
3332     _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'
3334     def _print_formats(self, formats):
3335         """Print all available formats"""
3336         print(u'Available formats:')
3337         print(u'ext\t\tformat')
3338         print(u'---------------------------------')
3339         for format in formats:
3340             print(u'%s\t\t%s' % (format['ext'], format['format']))
3342     def _specific(self, req_format, formats):
# Linear search for the requested format; enclosing loop is elided here.
3344             if(x["format"]==req_format):
3348     def _real_extract(self, url):
3349         mobj = re.match(self._VALID_URL, url)
3351             raise ExtractorError(u'Invalid URL: %s' % url)
3352         video_id = mobj.group('videoid')
# Bypass the age gate with a pre-set cookie before fetching the page.
3354         req = compat_urllib_request.Request(url)
3355         req.add_header('Cookie', 'age_verified=1')
3356         webpage = self._download_webpage(req, video_id)
3358         # Get JSON parameters
3359         json_params = self._search_regex(r'var currentVideo = new Video\((.*)\);', webpage, u'JSON parameters')
3361             params = json.loads(json_params)
3363             raise ExtractorError(u'Invalid JSON')
3365         self.report_extraction(video_id)
3367             video_title = params['title']
3368             upload_date = unified_strdate(params['release_date_f'])
3369             video_description = params['description']
3370             video_uploader = params['submitted_by']
3371             thumbnail = params['thumbnails'][0]['image']
3373             raise ExtractorError('Missing JSON parameter: ' + sys.exc_info()[1])
3375         # Get all of the formats available
3376         DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
3377         download_list_html = self._search_regex(DOWNLOAD_LIST_RE,
3378             webpage, u'download list').strip()
3380         # Get all of the links from the page
3381         LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
3382         links = re.findall(LINK_RE, download_list_html)
3383         if(len(links) == 0):
3384             raise ExtractorError(u'ERROR: no known formats available for video')
3386         self.to_screen(u'Links found: %d' % len(links))
3391             # A link looks like this:
3392             # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
3393             # A path looks like this:
3394             # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
3395             video_url = unescapeHTML( link )
3396             path = compat_urllib_parse_urlparse( video_url ).path
3397             extension = os.path.splitext( path )[1][1:]
# e.g. ['480p', '370k'] — resolution and bitrate from the path segment.
3398             format = path.split('/')[4].split('_')[:2]
3401             format = "-".join( format )
3402             # title = u'%s-%s-%s' % (video_title, size, bitrate)
3407                 'uploader': video_uploader,
3408                 'upload_date': upload_date,
3409                 'title': video_title,
3412                 'thumbnail': thumbnail,
3413                 'description': video_description
3416         if self._downloader.params.get('listformats', None):
3417             self._print_formats(formats)
3420         req_format = self._downloader.params.get('format', None)
3421         self.to_screen(u'Format: %s' % req_format)
3423         if req_format is None or req_format == 'best':
3425         elif req_format == 'worst':
3426             return [formats[-1]]
3427         elif req_format in ('-1', 'all'):
3430             format = self._specific( req_format, formats )
3432                 raise ExtractorError(u'Requested format not available')
# PornotubeIE: extractor for pornotube.com; pulls the flv URL out of the
# player config and the upload date from the page markup.
# NOTE(review): excerpt is elided (guard after re.match and parts of the info
# dict are missing from view). Code left byte-identical.
3438 class PornotubeIE(InfoExtractor):
3439     """Information extractor for pornotube.com."""
3440     _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'
3441     def _real_extract(self, url):
3442         mobj = re.match(self._VALID_URL, url)
3444             raise ExtractorError(u'Invalid URL: %s' % url)
3446         video_id = mobj.group('videoid')
3447         video_title = mobj.group('title')
3449         # Get webpage content
3450         webpage = self._download_webpage(url, video_id)
3453         VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
3454         video_url = self._search_regex(VIDEO_URL_RE, webpage, u'video url')
# The URL in the page is percent-encoded; decode before use.
3455         video_url = compat_urllib_parse.unquote(video_url)
3457         #Get the uploaded date
3458         VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
3459         upload_date = self._html_search_regex(VIDEO_UPLOADED_RE, webpage, u'upload date', fatal=False)
3460         if upload_date: upload_date = unified_strdate(upload_date)
3462         info = {'id': video_id,
3465                 'upload_date': upload_date,
3466                 'title': video_title,
# YouJizzIE: extractor for youjizz.com; resolves the embed page first, then
# reads the player's encoded file variable from it.
# NOTE(review): excerpt is elided (guards after re.match/re.search and parts
# of the info dict are missing from view). Code left byte-identical.
3473 class YouJizzIE(InfoExtractor):
3474     """Information extractor for youjizz.com."""
3475     _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'
3476     def _real_extract(self, url):
3477         mobj = re.match(self._VALID_URL, url)
3479             raise ExtractorError(u'Invalid URL: %s' % url)
3481         video_id = mobj.group('videoid')
3483         # Get webpage content
3484         webpage = self._download_webpage(url, video_id)
3486         # Get the video title
3487         video_title = self._html_search_regex(r'<title>(?P<title>.*)</title>',
3488             webpage, u'title').strip()
3490         # Get the embed page
3491         result = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
3493             raise ExtractorError(u'ERROR: unable to extract embed page')
# video_id is re-bound here to the numeric embed-page id.
3495         embed_page_url = result.group(0).strip()
3496         video_id = result.group('videoid')
3498         webpage = self._download_webpage(embed_page_url, video_id)
3501         video_url = self._search_regex(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);',
3502             webpage, u'video URL')
3504         info = {'id': video_id,
3506                 'title': video_title,
3509                 'player_url': embed_page_url}
# EightTracksIE: extractor for 8tracks.com mixes; walks the play/next JSON
# API track by track until 'at_last_track' and returns every song.
# NOTE(review): excerpt is elided — 'mix_id' used in the API URLs is assigned
# in a missing line (presumably from data; TODO confirm), as are the res list
# and the final return. Code left byte-identical.
3513 class EightTracksIE(InfoExtractor):
3515     _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'
3517     def _real_extract(self, url):
3518         mobj = re.match(self._VALID_URL, url)
3520             raise ExtractorError(u'Invalid URL: %s' % url)
3521         playlist_id = mobj.group('id')
3523         webpage = self._download_webpage(url, playlist_id)
3525         json_like = self._search_regex(r"PAGE.mix = (.*?);\n", webpage, u'trax information', flags=re.DOTALL)
3526         data = json.loads(json_like)
# Random session token for the play API — required by 8tracks.
3528         session = str(random.randint(0, 1000000000))
3530         track_count = data['tracks_count']
3531         first_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
3532         next_url = first_url
3534         for i in itertools.count():
3535             api_json = self._download_webpage(next_url, playlist_id,
3536                 note=u'Downloading song information %s/%s' % (str(i+1), track_count),
3537                 errnote=u'Failed to download song information')
3538             api_data = json.loads(api_json)
3539             track_data = api_data[u'set']['track']
3541                 'id': track_data['id'],
3542                 'url': track_data['track_file_stream_url'],
3543                 'title': track_data['performer'] + u' - ' + track_data['name'],
3544                 'raw_title': track_data['name'],
3545                 'uploader_id': data['user']['login'],
3549             if api_data['set']['at_last_track']:
3551             next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
# KeekIE: extractor for keek.com; media and thumbnail URLs are built directly
# from the video id on keek's CDN, metadata scraped from the page.
# NOTE(review): excerpt is elided (title regex's trailing args and the
# return-dict opener are missing from view). Code left byte-identical.
3554 class KeekIE(InfoExtractor):
3555     _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
3558     def _real_extract(self, url):
3559         m = re.match(self._VALID_URL, url)
3560         video_id = m.group('videoID')
3562         video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
3563         thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id
3564         webpage = self._download_webpage(url, video_id)
3566         video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
3569         uploader = self._html_search_regex(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>',
3570             webpage, u'uploader', fatal=False)
3576             'title': video_title,
3577             'thumbnail': thumbnail,
3578             'uploader': uploader
# TEDIE: extractor for ted.com talks and playlists. Playlists are scraped
# into url_result entries; a single talk's stream comes from the embedded
# talkDetails JSON (last/highest-quality htmlStream).
# NOTE(review): excerpt is elided (regex alternation glue in _VALID_URL,
# title regex trailing args, and the talk info-dict opener are missing from
# view). Code left byte-identical.
3582 class TEDIE(InfoExtractor):
3583     _VALID_URL=r'''http://www\.ted\.com/
3585                    ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
3587                    ((?P<type_talk>talks)) # We have a simple talk
3589                    (/lang/(.*?))? # The url may contain the language
3590                    /(?P<name>\w+) # Here goes the name and then ".html"
3594     def suitable(cls, url):
3595         """Receives a URL and returns True if suitable for this IE."""
# re.VERBOSE needed: _VALID_URL is a commented, multi-line pattern.
3596         return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
3598     def _real_extract(self, url):
3599         m=re.match(self._VALID_URL, url, re.VERBOSE)
3600         if m.group('type_talk'):
3601             return [self._talk_info(url)]
3603             playlist_id=m.group('playlist_id')
3604             name=m.group('name')
3605             self.to_screen(u'Getting info of playlist %s: "%s"' % (playlist_id,name))
3606             return [self._playlist_videos_info(url,name,playlist_id)]
3608     def _playlist_videos_info(self,url,name,playlist_id=0):
3609         '''Returns the videos of the playlist'''
3611                      <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
3612                      ([.\s]*?)data-playlist_item_id="(\d+)"
3613                      ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
3615         video_name_RE=r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
3616         webpage=self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
3617         m_videos=re.finditer(video_RE,webpage,re.VERBOSE)
3618         m_names=re.finditer(video_name_RE,webpage)
3620         playlist_title = self._html_search_regex(r'div class="headline">\s*?<h1>\s*?<span>(.*?)</span>',
3621                                                  webpage, 'playlist title')
3623         playlist_entries = []
3624         for m_video, m_name in zip(m_videos,m_names):
3625             video_id=m_video.group('video_id')
# Delegate each talk back through the TED extractor via url_result.
3626             talk_url='http://www.ted.com%s' % m_name.group('talk_url')
3627             playlist_entries.append(self.url_result(talk_url, 'TED'))
3628         return self.playlist_result(playlist_entries, playlist_id = playlist_id, playlist_title = playlist_title)
3630     def _talk_info(self, url, video_id=0):
3631         """Return the video for the talk in the url"""
3632         m = re.match(self._VALID_URL, url,re.VERBOSE)
3633         video_name = m.group('name')
3634         webpage = self._download_webpage(url, video_id, 'Downloading \"%s\" page' % video_name)
3635         self.report_extraction(video_name)
3636         # If the url includes the language we get the title translated
3637         title = self._html_search_regex(r'<span id="altHeadline" >(?P<title>.*)</span>',
3639         json_data = self._search_regex(r'<script.*?>var talkDetails = ({.*?})</script>',
3640                                        webpage, 'json data')
3641         info = json.loads(json_data)
3642         desc = self._html_search_regex(r'<div class="talk-intro">.*?<p.*?>(.*?)</p>',
3643                                        webpage, 'description', flags = re.DOTALL)
3645         thumbnail = self._search_regex(r'</span>[\s.]*</div>[\s.]*<img src="(.*?)"',
3646                                        webpage, 'thumbnail')
# Last htmlStream entry is taken — presumably the highest quality; confirm.
3649             'url': info['htmlStreams'][-1]['file'],
3652             'thumbnail': thumbnail,
3653             'description': desc,
# MySpassIE: extractor for myspass.de; derives the video id from the URL path
# and reads all metadata (flv url, title, format, description, thumbnail)
# from the site's XML metadata endpoint.
# NOTE(review): excerpt is elided (the "if not video_id" retry branch guard,
# default-value branches, and the info-dict opener are missing from view).
# Code left byte-identical.
3657 class MySpassIE(InfoExtractor):
3658     _VALID_URL = r'http://www.myspass.de/.*'
3660     def _real_extract(self, url):
3661         META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'
3663         # video id is the last path element of the URL
3664         # usually there is a trailing slash, so also try the second but last
3665         url_path = compat_urllib_parse_urlparse(url).path
3666         url_parent_path, video_id = os.path.split(url_path)
# Fallback: with a trailing slash the last element is empty, use parent's.
3668             _, video_id = os.path.split(url_parent_path)
3671         metadata_url = META_DATA_URL_TEMPLATE % video_id
3672         metadata_text = self._download_webpage(metadata_url, video_id)
3673         metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))
3675         # extract values from metadata
3676         url_flv_el = metadata.find('url_flv')
3677         if url_flv_el is None:
3678             raise ExtractorError(u'Unable to extract download url')
3679         video_url = url_flv_el.text
3680         extension = os.path.splitext(video_url)[1][1:]
3681         title_el = metadata.find('title')
3682         if title_el is None:
3683             raise ExtractorError(u'Unable to extract title')
3684         title = title_el.text
3685         format_id_el = metadata.find('format_id')
3686         if format_id_el is None:
3689             format = format_id_el.text
3690         description_el = metadata.find('description')
3691         if description_el is not None:
3692             description = description_el.text
3695         imagePreview_el = metadata.find('imagePreview')
3696         if imagePreview_el is not None:
3697             thumbnail = imagePreview_el.text
3706                 'thumbnail': thumbnail,
3707                 'description': description
# SpiegelIE: extractor for spiegel.de videos; reads the per-video XML from
# video2.spiegel.de and takes the last (presumably best) format node.
# NOTE(review): excerpt is elided (title regex trailing args and the return
# dict opener are missing from view). Code left byte-identical.
3711 class SpiegelIE(InfoExtractor):
3712     _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'
3714     def _real_extract(self, url):
3715         m = re.match(self._VALID_URL, url)
3716         video_id = m.group('videoID')
3718         webpage = self._download_webpage(url, video_id)
3720         video_title = self._html_search_regex(r'<div class="module-title">(.*?)</div>',
3723         xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
3724         xml_code = self._download_webpage(xml_url, video_id,
3725                     note=u'Downloading XML', errnote=u'Failed to download XML')
3727         idoc = xml.etree.ElementTree.fromstring(xml_code)
# idoc[-1]: last <type> node in the XML — assumed best quality; confirm.
3728         last_type = idoc[-1]
3729         filename = last_type.findall('./filename')[0].text
3730         duration = float(last_type.findall('./duration')[0].text)
3732         video_url = 'http://video2.spiegel.de/flash/' + filename
3733         video_ext = filename.rpartition('.')[2]
3738             'title': video_title,
3739             'duration': duration,
# LiveLeakIE: extractor for liveleak.com view pages; scrapes the player file
# URL and og: metadata, stripping the site prefix from the title.
# NOTE(review): excerpt is elided (guard after re.match and the info-dict
# opener are missing from view). Code left byte-identical.
3743 class LiveLeakIE(InfoExtractor):
3745     _VALID_URL = r'^(?:http?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
3746     IE_NAME = u'liveleak'
3748     def _real_extract(self, url):
3749         mobj = re.match(self._VALID_URL, url)
3751             raise ExtractorError(u'Invalid URL: %s' % url)
3753         video_id = mobj.group('video_id')
3755         webpage = self._download_webpage(url, video_id)
3757         video_url = self._search_regex(r'file: "(.*?)",',
3758             webpage, u'video URL')
3760         video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
3761             webpage, u'title').replace('LiveLeak.com -', '').strip()
3763         video_description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
3764             webpage, u'description', fatal=False)
3766         video_uploader = self._html_search_regex(r'By:.*?(\w+)</a>',
3767             webpage, u'uploader', fatal=False)
3773             'title': video_title,
3774             'description': video_description,
3775             'uploader': video_uploader
# ARDIE: extractor for ARD Mediathek / daserste.de; collects all
# mediaCollection.addMediaStream(...) entries, picks media_type 0 at the
# highest quality, and distinguishes RTMP streams from plain HTTP downloads.
# NOTE(review): excerpt is elided (the "if numid:" / "else:" glue, the empty-
# streams check before the fsk assert, the else before the HTTP branch, and
# the final return are missing from view). Code left byte-identical.
3780 class ARDIE(InfoExtractor):
3781     _VALID_URL = r'^(?:https?://)?(?:(?:www\.)?ardmediathek\.de|mediathek\.daserste\.de)/(?:.*/)(?P<video_id>[^/\?]+)(?:\?.*)?'
3782     _TITLE = r'<h1(?: class="boxTopHeadline")?>(?P<title>.*)</h1>'
3783     _MEDIA_STREAM = r'mediaCollection\.addMediaStream\((?P<media_type>\d+), (?P<quality>\d+), "(?P<rtmp_url>[^"]*)", "(?P<video_url>[^"]*)", "[^"]*"\)'
3785     def _real_extract(self, url):
3786         # determine video id from url
3787         m = re.match(self._VALID_URL, url)
# Prefer the numeric documentId query parameter when present.
3789         numid = re.search(r'documentId=([0-9]+)', url)
3791             video_id = numid.group(1)
3793             video_id = m.group('video_id')
3795         # determine title and media streams from webpage
3796         html = self._download_webpage(url, video_id)
3797         title = re.search(self._TITLE, html).group('title')
3798         streams = [mo.groupdict() for mo in re.finditer(self._MEDIA_STREAM, html)]
# No streams + "fsk" marker => age-restricted content, only evenings.
3800             assert '"fsk"' in html
3801             raise ExtractorError(u'This video is only available after 8:00 pm')
3803         # choose default media type and highest quality for now
3804         stream = max([s for s in streams if int(s["media_type"]) == 0],
3805                      key=lambda s: int(s["quality"]))
3807         # there's two possibilities: RTMP stream or HTTP download
3808         info = {'id': video_id, 'title': title, 'ext': 'mp4'}
3809         if stream['rtmp_url']:
3810             self.to_screen(u'RTMP download detected')
3811             assert stream['video_url'].startswith('mp4:')
3812             info["url"] = stream["rtmp_url"]
3813             info["play_path"] = stream['video_url']
3815             assert stream["video_url"].endswith('.mp4')
3816             info["url"] = stream["video_url"]
# ZDFIE: extractor for ZDFmediathek; scrapes stream descriptors from the
# page, prefers the 300/dsl1000 wstreaming variant, then resolves the asx/
# metafile to an mms:// (or rtsp://) media URL.
# NOTE(review): excerpt is elided (several "if mobj is None:" guards, loop
# break/else glue, and the tail of the returned dict are missing from view).
# Code left byte-identical.
3819 class ZDFIE(InfoExtractor):
3820     _VALID_URL = r'^http://www\.zdf\.de\/ZDFmediathek\/(.*beitrag\/video\/)(?P<video_id>[^/\?]+)(?:\?.*)?'
3821     _TITLE = r'<h1(?: class="beitragHeadline")?>(?P<title>.*)</h1>'
3822     _MEDIA_STREAM = r'<a href="(?P<video_url>.+(?P<media_type>.streaming).+/zdf/(?P<quality>[^\/]+)/[^"]*)".+class="play".+>'
3823     _MMS_STREAM = r'href="(?P<video_url>mms://[^"]*)"'
3824     _RTSP_STREAM = r'(?P<video_url>rtsp://[^"]*.mp4)'
3826     def _real_extract(self, url):
3827         mobj = re.match(self._VALID_URL, url)
3829             raise ExtractorError(u'Invalid URL: %s' % url)
3830         video_id = mobj.group('video_id')
3832         html = self._download_webpage(url, video_id)
3833         streams = [m.groupdict() for m in re.finditer(self._MEDIA_STREAM, html)]
3835             raise ExtractorError(u'No media url found.')
3837         # s['media_type'] == 'wstreaming' -> use 'Windows Media Player' and mms url
3838         # s['media_type'] == 'hstreaming' -> use 'Quicktime' and rtsp url
3839         # choose first/default media type and highest quality for now
3840         for s in streams:        #find 300 - dsl1000mbit
3841             if s['quality'] == '300' and s['media_type'] == 'wstreaming':
3844         for s in streams:        #find veryhigh - dsl2000mbit
3845             if s['quality'] == 'veryhigh' and s['media_type'] == 'wstreaming': # 'hstreaming' - rtsp is not working
3849             raise ExtractorError(u'No stream found.')
# The chosen stream URL points at a metafile that embeds the real media URL.
3851         media_link = self._download_webpage(stream_['video_url'], video_id,'Get stream URL')
3853         self.report_extraction(video_id)
3854         mobj = re.search(self._TITLE, html)
3856             raise ExtractorError(u'Cannot extract title')
3857         title = unescapeHTML(mobj.group('title'))
3859         mobj = re.search(self._MMS_STREAM, media_link)
3861             mobj = re.search(self._RTSP_STREAM, media_link)
3863                 raise ExtractorError(u'Cannot extract mms:// or rtsp:// URL')
3864         mms_url = mobj.group('video_url')
3866         mobj = re.search('(.*)[.](?P<ext>[^.]+)', mms_url)
3868             raise ExtractorError(u'Cannot extract extention')
3869         ext = mobj.group('ext')
3871         return [{'id': video_id,
# TumblrIE: extractor for tumblr video posts; the media URL is embedded in
# escaped (\x22-quoted) JavaScript inside the post page.
# NOTE(review): excerpt is elided ("if video is None:" guard and tail of the
# returned dict are missing from view). Code left byte-identical.
3877 class TumblrIE(InfoExtractor):
3878     _VALID_URL = r'http://(?P<blog_name>.*?)\.tumblr\.com/((post)|(video))/(?P<id>\d*)/(.*?)'
3880     def _real_extract(self, url):
3881         m_url = re.match(self._VALID_URL, url)
3882         video_id = m_url.group('id')
3883         blog = m_url.group('blog_name')
# Normalize to the canonical /post/ URL before downloading.
3885         url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id)
3886         webpage = self._download_webpage(url, video_id)
3888         re_video = r'src=\\x22(?P<video_url>http://%s\.tumblr\.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P<ext>.*?)\\x22' % (blog, video_id)
3889         video = re.search(re_video, webpage)
3891             raise ExtractorError(u'Unable to extract video')
3892         video_url = video.group('video_url')
3893         ext = video.group('ext')
3895         video_thumbnail = self._search_regex(r'posters(.*?)\[\\x22(?P<thumb>.*?)\\x22',
3896             webpage, u'thumbnail', fatal=False)  # We pick the first poster
3897         if video_thumbnail: video_thumbnail = video_thumbnail.replace('\\', '')
3899         # The only place where you can get a title, it's not complete,
3900         # but searching in other places doesn't work for all videos
3901         video_title = self._html_search_regex(r'<title>(?P<title>.*?)</title>',
3902             webpage, u'title', flags=re.DOTALL)
3904         return [{'id': video_id,
3906                  'title': video_title,
3907                  'thumbnail': video_thumbnail,
# BandcampIE: extractor for free bandcamp.com tracks. Follows the free-
# download page, parses the embedded items JSON, and rebuilds the signed
# statdownload URL (mp3-320) to obtain the final 'retry_url'.
# NOTE(review): excerpt is elided (parts of the track_info dict are missing
# from view); note 'id' shadows the builtin in the original code. Code left
# byte-identical.
3911 class BandcampIE(InfoExtractor):
3912     _VALID_URL = r'http://.*?\.bandcamp\.com/track/(?P<title>.*)'
3914     def _real_extract(self, url):
3915         mobj = re.match(self._VALID_URL, url)
3916         title = mobj.group('title')
3917         webpage = self._download_webpage(url, title)
3918         # We get the link to the free download page
3919         m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage)
3920         if m_download is None:
3921             raise ExtractorError(u'No free songs found')
3923         download_link = m_download.group(1)
3924         id = re.search(r'var TralbumData = {(.*?)id: (?P<id>\d*?)$',
3925                        webpage, re.MULTILINE|re.DOTALL).group('id')
3927         download_webpage = self._download_webpage(download_link, id,
3928                                                   'Downloading free downloads page')
3929         # We get the dictionary of the track from some javascrip code
3930         info = re.search(r'items: (.*?),$',
3931                          download_webpage, re.MULTILINE).group(1)
3932         info = json.loads(info)[0]
3933         # We pick mp3-320 for now, until format selection can be easily implemented.
3934         mp3_info = info[u'downloads'][u'mp3-320']
3935         # If we try to use this url it says the link has expired
3936         initial_url = mp3_info[u'url']
3937         re_url = r'(?P<server>http://(.*?)\.bandcamp\.com)/download/track\?enc=mp3-320&fsig=(?P<fsig>.*?)&id=(?P<id>.*?)&ts=(?P<ts>.*)$'
3938         m_url = re.match(re_url, initial_url)
3939         #We build the url we will use to get the final track url
3940         # This url is build in Bandcamp in the script download_bunde_*.js
3941         request_url = '%s/statdownload/track?enc=mp3-320&fsig=%s&id=%s&ts=%s&.rand=665028774616&.vrs=1' % (m_url.group('server'), m_url.group('fsig'), id, m_url.group('ts'))
3942         final_url_webpage = self._download_webpage(request_url, id, 'Requesting download url')
3943         # If we could correctly generate the .rand field the url would be
3944         #in the "download_url" key
3945         final_url = re.search(r'"retry_url":"(.*?)"', final_url_webpage).group(1)
3947         track_info = {'id':id,
3948                       'title' : info[u'title'],
3951                       'thumbnail' : info[u'thumb_url'],
3952                       'uploader' : info[u'artist']
# RedTubeIE: extractor for redtube.com; scrapes the <source> mp4 URL and the
# page title.
# NOTE(review): excerpt is elided (guard after re.match, the title regex's
# trailing args, and the return-dict opener are missing from view). Code
# left byte-identical.
3957 class RedTubeIE(InfoExtractor):
3958     """Information Extractor for redtube"""
3959     _VALID_URL = r'(?:http://)?(?:www\.)?redtube\.com/(?P<id>[0-9]+)'
3961     def _real_extract(self,url):
3962         mobj = re.match(self._VALID_URL, url)
3964             raise ExtractorError(u'Invalid URL: %s' % url)
3966         video_id = mobj.group('id')
3967         video_extension = 'mp4'
3968         webpage = self._download_webpage(url, video_id)
3970         self.report_extraction(video_id)
3972         video_url = self._html_search_regex(r'<source src="(.+?)" type="video/mp4">',
3973             webpage, u'video URL')
3975         video_title = self._html_search_regex('<h1 class="videoTitle slidePanelMovable">(.+?)</h1>',
3981             'ext':        video_extension,
3982             'title':      video_title,
# InaIE: extractor for ina.fr; fetches the per-video MRSS document and reads
# the mp4 player URL and CDATA title from it.
# NOTE(review): excerpt is elided (guard after re.match, title regex trailing
# args, and the return-dict opener are missing from view). Code left
# byte-identical.
3985 class InaIE(InfoExtractor):
3986     """Information Extractor for Ina.fr"""
3987     _VALID_URL = r'(?:http://)?(?:www\.)?ina\.fr/video/(?P<id>I[0-9]+)/.*'
3989     def _real_extract(self,url):
3990         mobj = re.match(self._VALID_URL, url)
3992         video_id = mobj.group('id')
# Metadata comes from the player's MRSS feed, not the HTML page.
3993         mrss_url='http://player.ina.fr/notices/%s.mrss' % video_id
3994         video_extension = 'mp4'
3995         webpage = self._download_webpage(mrss_url, video_id)
3997         self.report_extraction(video_id)
3999         video_url = self._html_search_regex(r'<media:player url="(?P<mp4url>http://mp4.ina.fr/[^"]+\.mp4)',
4000             webpage, u'video URL')
4002         video_title = self._search_regex(r'<title><!\[CDATA\[(?P<titre>.*?)]]></title>',
4008             'ext':        video_extension,
4009             'title':      video_title,
# HowcastIE: extractor for howcast.com; pulls the mobile mp4 URL from the
# player config and title/description/thumbnail from meta tags.
# NOTE(review): excerpt is elided (guard after re.match, title regex trailing
# args, and the return-dict opener are missing from view). Code left
# byte-identical.
4012 class HowcastIE(InfoExtractor):
4013     """Information Extractor for Howcast.com"""
4014     _VALID_URL = r'(?:https?://)?(?:www\.)?howcast\.com/videos/(?P<id>\d+)'
4016     def _real_extract(self, url):
4017         mobj = re.match(self._VALID_URL, url)
4019         video_id = mobj.group('id')
# Canonicalize the page URL from the id before downloading.
4020         webpage_url = 'http://www.howcast.com/videos/' + video_id
4021         webpage = self._download_webpage(webpage_url, video_id)
4023         self.report_extraction(video_id)
4025         video_url = self._search_regex(r'\'?file\'?: "(http://mobile-media\.howcast\.com/[0-9]+\.mp4)',
4026             webpage, u'video URL')
4028         video_title = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') property=\'og:title\'',
4031         video_description = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') name=\'description\'',
4032             webpage, u'description', fatal=False)
4034         thumbnail = self._html_search_regex(r'<meta content=\'(.+?)\' property=\'og:image\'',
4035             webpage, u'thumbnail', fatal=False)
4041             'title':       video_title,
4042             'description': video_description,
4043             'thumbnail':   thumbnail,
class VineIE(InfoExtractor):
    """Information Extractor for Vine.co.

    Reads the twitter:player:stream URL and og:* metadata from the page.
    """
    _VALID_URL = r'(?:https?://)?(?:www\.)?vine\.co/v/(?P<id>\w+)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')
        webpage_url = 'https://vine.co/v/' + video_id
        webpage = self._download_webpage(webpage_url, video_id)

        self.report_extraction(video_id)

        video_url = self._html_search_regex(r'<meta property="twitter:player:stream" content="(.+?)"',
            webpage, u'video URL')

        video_title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',
            webpage, u'title')

        # Strip any query string appended to the og:image URL.
        thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)(\?.*?)?"',
            webpage, u'thumbnail', fatal=False)

        # The uploader name spans lines, hence re.DOTALL.
        uploader = self._html_search_regex(r'<div class="user">.*?<h2>(.+?)</h2>',
            webpage, u'uploader', fatal=False, flags=re.DOTALL)

        return [{
            'id':        video_id,
            'url':       video_url,
            'ext':       'mp4',
            'title':     video_title,
            'thumbnail': thumbnail,
            'uploader':  uploader,
        }]
class FlickrIE(InfoExtractor):
    """Information Extractor for Flickr videos.

    Three-step extraction: scrape the photo page for its secret, fetch a
    first XML document to learn the playlist node id, then fetch the
    playlist XML that carries the actual stream location.
    """
    _VALID_URL = r'(?:https?://)?(?:www\.)?flickr\.com/photos/(?P<uploader_id>[\w\-_@]+)/(?P<id>\d+).*'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')
        video_uploader_id = mobj.group('uploader_id')
        webpage_url = 'http://www.flickr.com/photos/' + video_uploader_id + '/' + video_id
        webpage = self._download_webpage(webpage_url, video_id)

        # Per-photo secret required by the video API endpoints below.
        secret = self._search_regex(r"photo_secret: '(\w+)'", webpage, u'secret')

        first_url = 'https://secure.flickr.com/apps/video/video_mtl_xml.gne?v=x&photo_id=' + video_id + '&secret=' + secret + '&bitrate=700&target=_self'
        first_xml = self._download_webpage(first_url, video_id, 'Downloading first data webpage')

        node_id = self._html_search_regex(r'<Item id="id">(\d+-\d+)</Item>',
            first_xml, u'node_id')

        second_url = 'https://secure.flickr.com/video_playlist.gne?node_id=' + node_id + '&tech=flash&mode=playlist&bitrate=700&secret=' + secret + '&rd=video.yahoo.com&noad=1'
        second_xml = self._download_webpage(second_url, video_id, 'Downloading second data webpage')

        self.report_extraction(video_id)

        mobj = re.search(r'<STREAM APP="(.+?)" FULLPATH="(.+?)"', second_xml)
        if mobj is None:
            raise ExtractorError(u'Unable to extract video url')
        # Stream URL is APP + FULLPATH; FULLPATH is HTML-escaped in the XML.
        video_url = mobj.group(1) + unescapeHTML(mobj.group(2))

        video_title = self._html_search_regex(r'<meta property="og:title" content=(?:"([^"]+)"|\'([^\']+)\')',
            webpage, u'video title')

        video_description = self._html_search_regex(r'<meta property="og:description" content=(?:"([^"]+)"|\'([^\']+)\')',
            webpage, u'description', fatal=False)

        thumbnail = self._html_search_regex(r'<meta property="og:image" content=(?:"([^"]+)"|\'([^\']+)\')',
            webpage, u'thumbnail', fatal=False)

        return [{
            'id':          video_id,
            'url':         video_url,
            'ext':         'mp4',
            'title':       video_title,
            'description': video_description,
            'thumbnail':   thumbnail,
            'uploader_id': video_uploader_id,
        }]
class TeamcocoIE(InfoExtractor):
    """Information Extractor for teamcoco.com.

    Resolves the numeric video id from the article markup, then reads the
    high-quality file URL from the site's CVP XML feed.
    """
    _VALID_URL = r'http://teamcoco\.com/video/(?P<url_title>.*)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        url_title = mobj.group('url_title')
        webpage = self._download_webpage(url, url_title)

        # The URL only carries a slug; the numeric id is in the page markup.
        video_id = self._html_search_regex(r'<article class="video" data-id="(\d+?)"',
            webpage, u'video id')

        self.report_extraction(video_id)

        video_title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',
            webpage, u'title')

        thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)"',
            webpage, u'thumbnail', fatal=False)

        video_description = self._html_search_regex(r'<meta property="og:description" content="(.*?)"',
            webpage, u'description', fatal=False)

        data_url = 'http://teamcoco.com/cvp/2.0/%s.xml' % video_id
        data = self._download_webpage(data_url, video_id, 'Downloading data webpage')

        # Pick the <file type="high"> entry from the CVP feed.
        video_url = self._html_search_regex(r'<file type="high".*?>(.*?)</file>',
            data, u'video URL')

        return [{
            'id':          video_id,
            'url':         video_url,
            'ext':         'mp4',
            'title':       video_title,
            'thumbnail':   thumbnail,
            'description': video_description,
        }]
class XHamsterIE(InfoExtractor):
    """Information Extractor for xHamster."""
    _VALID_URL = r'(?:http://)?(?:www.)?xhamster\.com/movies/(?P<id>[0-9]+)/.*\.html'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')
        # Canonical page URL works with an empty slug.
        mrss_url = 'http://xhamster.com/movies/%s/.html' % video_id
        webpage = self._download_webpage(mrss_url, video_id)

        mobj = re.search(r'\'srv\': \'(?P<server>[^\']*)\',\s*\'file\': \'(?P<file>[^\']+)\',', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract media URL')
        if len(mobj.group('server')) == 0:
            # No server prefix: 'file' is a complete, URL-encoded URL.
            video_url = compat_urllib_parse.unquote(mobj.group('file'))
        else:
            video_url = mobj.group('server') + '/key=' + mobj.group('file')
        video_extension = video_url.split('.')[-1]

        video_title = self._html_search_regex(r'<title>(?P<title>.+?) - xHamster\.com</title>',
            webpage, u'title')

        # Can't see the description anywhere in the UI
        # video_description = self._html_search_regex(r'<span>Description: </span>(?P<description>[^<]+)',
        #     webpage, u'description', fatal=False)
        # if video_description: video_description = unescapeHTML(video_description)

        mobj = re.search(r'hint=\'(?P<upload_date_Y>[0-9]{4})-(?P<upload_date_m>[0-9]{2})-(?P<upload_date_d>[0-9]{2}) [0-9]{2}:[0-9]{2}:[0-9]{2} [A-Z]{3,4}\'', webpage)
        if mobj:
            # YYYYMMDD, as expected by the downloader.
            video_upload_date = mobj.group('upload_date_Y') + mobj.group('upload_date_m') + mobj.group('upload_date_d')
        else:
            video_upload_date = None
            self._downloader.report_warning(u'Unable to extract upload date')

        video_uploader_id = self._html_search_regex(r'<a href=\'/user/[^>]+>(?P<uploader_id>[^<]+)',
            webpage, u'uploader id', default=u'anonymous')

        video_thumbnail = self._search_regex(r'\'image\':\'(?P<thumbnail>[^\']+)\'',
            webpage, u'thumbnail', fatal=False)

        return [{
            'id':          video_id,
            'url':         video_url,
            'ext':         video_extension,
            'title':       video_title,
            # 'description': video_description,
            'upload_date': video_upload_date,
            'uploader_id': video_uploader_id,
            'thumbnail':   video_thumbnail
        }]
class HypemIE(InfoExtractor):
    """Information Extractor for hypem.

    Loads the track page (carrying over its session cookie), parses the
    embedded displayList JSON for the first track, then asks the serve
    endpoint for the final stream URL.
    """
    _VALID_URL = r'(?:http://)?(?:www\.)?hypem\.com/track/([^/]+)/([^/]+)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        track_id = mobj.group(1)

        data = {'ax': 1, 'ts': time.time()}
        data_encoded = compat_urllib_parse.urlencode(data)
        complete_url = url + "?" + data_encoded
        request = compat_urllib_request.Request(complete_url)
        response, urlh = self._download_webpage_handle(request, track_id, u'Downloading webpage with the url')
        # The serve endpoint below requires the session cookie from this response.
        cookie = urlh.headers.get('Set-Cookie', '')

        self.report_extraction(track_id)

        html_tracks = self._html_search_regex(r'<script type="application/json" id="displayList-data">(.*?)</script>',
            response, u'tracks', flags=re.MULTILINE|re.DOTALL).strip()
        try:
            track_list = json.loads(html_tracks)
            track = track_list[u'tracks'][0]
        except ValueError:
            raise ExtractorError(u'Hypemachine contained invalid JSON.')

        key = track[u"key"]
        track_id = track[u"id"]
        artist = track[u"artist"]
        title = track[u"song"]

        serve_url = "http://hypem.com/serve/source/%s/%s" % (compat_str(track_id), compat_str(key))
        request = compat_urllib_request.Request(serve_url, "", {'Content-Type': 'application/json'})
        request.add_header('cookie', cookie)
        song_data_json = self._download_webpage(request, track_id, u'Downloading metadata')
        try:
            song_data = json.loads(song_data_json)
        except ValueError:
            raise ExtractorError(u'Hypemachine contained invalid JSON.')
        final_url = song_data[u"url"]

        return [{
            'id':       track_id,
            'url':      final_url,
            'ext':      "mp3",
            'title':    title,
            'artist':   artist,
        }]
class Vbox7IE(InfoExtractor):
    """Information Extractor for Vbox7.

    Follows the JavaScript redirect on the play page, then POSTs to the
    magare.do endpoint to obtain the media and thumbnail URLs.
    """
    _VALID_URL = r'(?:http://)?(?:www\.)?vbox7\.com/play:([^/]+)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group(1)

        # The play page only issues a JS redirect; follow it manually.
        redirect_page, urlh = self._download_webpage_handle(url, video_id)
        new_location = self._search_regex(r'window\.location = \'(.*)\';', redirect_page, u'redirect location')
        redirect_url = urlh.geturl() + new_location
        webpage = self._download_webpage(redirect_url, video_id, u'Downloading redirect page')

        title = self._html_search_regex(r'<title>(.*)</title>',
            webpage, u'title').split('/')[0].strip()

        ext = "flv"
        info_url = "http://vbox7.com/play/magare.do"
        data = compat_urllib_parse.urlencode({'as3': '1', 'vid': video_id})
        info_request = compat_urllib_request.Request(info_url, data)
        info_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
        info_response = self._download_webpage(info_request, video_id, u'Downloading info webpage')
        if info_response is None:
            raise ExtractorError(u'Unable to extract the media url')
        # Response is "url=...&thumb=..." — keep only the values.
        (final_url, thumbnail_url) = map(lambda x: x.split('=')[1], info_response.split('&'))

        return [{
            'id':        video_id,
            'url':       final_url,
            'ext':       ext,
            'title':     title,
            'thumbnail': thumbnail_url,
        }]
class GametrailersIE(InfoExtractor):
    """Information Extractor for gametrailers.com.

    Looks up the MTV-style mgid on the page, then queries the mrss and
    mediagen feeds for metadata and the stream URL list.
    """
    _VALID_URL = r'http://www.gametrailers.com/(?P<type>videos|reviews|full-episodes)/(?P<id>.*?)/(?P<title>.*)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('id')
        video_type = mobj.group('type')
        webpage = self._download_webpage(url, video_id)
        # Full episodes embed the mgid differently from regular videos/reviews.
        if video_type == 'full-episodes':
            mgid_re = r'data-video="(?P<mgid>mgid:.*?)"'
        else:
            mgid_re = r'data-contentId=\'(?P<mgid>mgid:.*?)\''
        mgid = self._search_regex(mgid_re, webpage, u'mgid')
        data = compat_urllib_parse.urlencode({'uri': mgid, 'acceptMethods': 'fms'})

        info_page = self._download_webpage('http://www.gametrailers.com/feeds/mrss?' + data,
                                           video_id, u'Downloading video info')
        links_webpage = self._download_webpage('http://www.gametrailers.com/feeds/mediagen/?' + data,
                                               video_id, u'Downloading video urls info')

        self.report_extraction(video_id)
        info_re = r'''<title><!\[CDATA\[(?P<title>.*?)\]\]></title>.*
                      <description><!\[CDATA\[(?P<description>.*?)\]\]></description>.*
                      <image>.*
                        <url>(?P<thumb>.*?)</url>.*
                      </image>'''
        m_info = re.search(info_re, info_page, re.VERBOSE|re.DOTALL)
        if m_info is None:
            raise ExtractorError(u'Unable to extract video info')
        video_title = m_info.group('title')
        video_description = m_info.group('description')
        video_thumb = m_info.group('thumb')

        m_urls = list(re.finditer(r'<src>(?P<url>.*)</src>', links_webpage))
        if m_urls is None or len(m_urls) == 0:
            # BUG FIX: was `raise ExtractError(u'Unable to extrat video url')`
            # — undefined name (NameError at runtime) plus a message typo.
            raise ExtractorError(u'Unable to extract video url')
        # They are sorted from worst to best quality
        video_url = m_urls[-1].group('url')

        return {'url':         video_url,
                'id':          video_id,
                'title':       video_title,
                # Videos are actually flv not mp4
                'ext':         'flv',
                'thumbnail':   video_thumb,
                'description': video_description,
                }
class StatigramIE(InfoExtractor):
    """Information Extractor for statigr.am (Instagram viewer).

    Reads the secure og:video URL and og:image thumbnail; the uploader's
    handle is recovered from the '@name' fragment of the page title.
    """
    _VALID_URL = r'(?:http://)?(?:www\.)?statigr\.am/p/([^/]+)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group(1)
        webpage = self._download_webpage(url, video_id)
        video_url = self._html_search_regex(
            r'<meta property="og:video:secure_url" content="(.+?)">',
            webpage, u'video URL')
        thumbnail_url = self._html_search_regex(
            r'<meta property="og:image" content="(.+?)" />',
            webpage, u'thumbnail URL', fatal=False)
        html_title = self._html_search_regex(
            r'<title>(.+?)</title>',
            webpage, u'title')
        # Drop the trailing " | Statigram" site suffix.
        title = html_title.rpartition(u' | Statigram')[0]
        uploader_id = self._html_search_regex(
            r'@([^ ]+)', title, u'uploader name', fatal=False)

        return [{
            'id':          video_id,
            'url':         video_url,
            'ext':         'mp4',
            'title':       title,
            'thumbnail':   thumbnail_url,
            'uploader_id': uploader_id
        }]
# NOTE(review): this function is heavily truncated in this dump — the jumps in
# the embedded line numbers (4393 -> 4418 -> 4428) show that most of the
# extractor-instance list, the closing bracket, and the return statement are
# missing. Left byte-identical; restore from a complete copy of the file.
4388 def gen_extractors():
4389     """ Return a list of an instance of every supported extractor.
4390     The order does matter; the first extractor matched is the one handling the URL.
4393     YoutubePlaylistIE(),
4418     StanfordOpenClassroomIE(),
4428     WorldStarHipHopIE(),
def get_info_extractor(ie_name):
    """Return the info extractor class with the given ie_name.

    Looks up the module-level class named '<ie_name>IE' (e.g. 'Youtube' ->
    YoutubeIE); raises KeyError if no such extractor is defined.
    """
    return globals()[ie_name + 'IE']