2 # -*- coding: utf-8 -*-
4 from __future__ import absolute_import
15 import xml.etree.ElementTree
# NOTE(review): this view of the file is missing many interleaved source
# lines (decorators, `try:`/`if` lines, returns). Bodies below that look
# syntactically incomplete are faithful to the visible source; comments
# only — no code has been changed.
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information about the video (or videos) the URL refers to. This
    information includes the real video URL, the video title, author and
    others. The information is stored in a dictionary which is then
    passed to the FileDownloader. The FileDownloader processes this
    information possibly downloading the video to the file system, among
    other possible outcomes.

    The dictionaries must include the following fields:

    title:       Video title, unescaped.
    ext:         Video filename extension.

    The following fields are optional:

    format:      The video format, defaults to ext (used for --get-format)
    thumbnail:   Full URL to a video thumbnail image.
    description: One-line video description.
    uploader:    Full name of the video uploader.
    upload_date: Video upload date (YYYYMMDD).
    uploader_id: Nickname or id of the video uploader.
    location:    Physical location of the video.
    player_url:  SWF Player URL (used for rtmpdump).
    subtitles:   The subtitle file contents.
    urlhandle:   [internal] The urlHandle to be used to download the file,
                 like returned by urllib.request.urlopen

    The fields should all be Unicode strings.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.

    _real_extract() must return a *list* of information dictionaries as

    Finally, the _WORKING attribute should be set to False for broken IEs
    in order to warn the users and skip the tests.
    """

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        self.set_downloader(downloader)

    # NOTE(review): an @classmethod decorator appears to be missing from
    # this view — `suitable` takes `cls`; confirm against the full file.
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        return re.match(cls._VALID_URL, url) is not None

    # NOTE(review): orphaned docstring of a _WORKING getter whose `def`
    # line is not visible in this view.
        """Getter method for _WORKING."""

    # NOTE(review): orphaned fragment of an initialize() method; its
    # `def` line (and any guard around the call) is not visible here.
        """Initializes an instance (authentication, etc)."""
        self._real_initialize()

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""

    # NOTE(review): orphaned body of what is presumably an IE_NAME
    # property (decorator/`def` not visible); derives the extractor name
    # from the class name by dropping the trailing "IE".
        return type(self).__name__[:-2]

    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the response handle """
        # NOTE(review): `if note is None:` and `try:` lines not visible
        # in this view; structure below is incomplete as shown.
        self.report_download_webpage(video_id)
        elif note is not False:
            self.to_screen(u'%s: %s' % (video_id, note))
        return compat_urllib_request.urlopen(url_or_request)
        # Network failures of any flavor are normalized to ExtractorError.
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            errnote = u'Unable to download webpage'
            raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2])

    def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns a tuple (page content as string, URL handle) """
        urlh = self._request_webpage(url_or_request, video_id, note, errnote)
        content_type = urlh.headers.get('Content-Type', '')
        # Pull the declared charset out of the Content-Type header, if any.
        m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
        # NOTE(review): `if m:` guard and fallback-encoding branch are not
        # visible in this view.
        encoding = m.group(1)
        webpage_bytes = urlh.read()
        if self._downloader.params.get('dump_intermediate_pages', False):
            # Dump the fetched page (base64) for debugging; the `try:`
            # around get_full_url is not visible in this view.
            url = url_or_request.get_full_url()
            except AttributeError:
            self.to_screen(u'Dumping request to ' + url)
            dump = base64.b64encode(webpage_bytes).decode('ascii')
            self._downloader.to_screen(dump)
        # 'replace' keeps going on undecodable bytes instead of raising.
        content = webpage_bytes.decode(encoding, 'replace')
        return (content, urlh)

    def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the data of the page as a string """
        return self._download_webpage_handle(url_or_request, video_id, note, errnote)[0]

    def to_screen(self, msg):
        """Print msg to screen, prefixing it with '[ie_name]'"""
        self._downloader.to_screen(u'[%s] %s' % (self.IE_NAME, msg))

    def report_extraction(self, id_or_name):
        """Report information extraction."""
        self.to_screen(u'%s: Extracting information' % id_or_name)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self.to_screen(u'%s: Downloading webpage' % video_id)

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self.to_screen(u'Confirming age')

    #Methods for following #608
    #They set the correct value of the '_type' key
    def video_result(self, video_info):
        """Returns a video"""
        video_info['_type'] = 'video'
        # NOTE(review): `return video_info` not visible in this view.

    def url_result(self, url, ie=None):
        """Returns a url that points to a page that should be processed"""
        #TODO: ie should be the class used for getting the info
        video_info = {'_type': 'url',
        # NOTE(review): remainder of the dict literal and the return are
        # not visible in this view.

    def playlist_result(self, entries, playlist_id=None, playlist_title=None):
        """Returns a playlist"""
        video_info = {'_type': 'playlist',
        # NOTE(review): dict closing and the `if playlist_id:` /
        # `if playlist_title:` guards are not visible in this view.
            video_info['id'] = playlist_id
            video_info['title'] = playlist_title
# NOTE(review): this view of the file is missing many interleaved source
# lines (the `_VALID_URL = r'''(?x)` opening, `try:` lines, guards, dict
# openings, returns). Comments only — no code has been changed.
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""

    # Verbose regex matching the many YouTube URL shapes; the group at
    # the end captures the video ID. Opening assignment line not visible.
        (?:https?://)?                                       # http(s):// (optional)
        (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
           tube\.majestyc\.net/)                             # the various hostnames, with wildcard subdomains
        (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
        (?:                                                  # the various things that can precede the ID:
            (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
            |(?:                                             # or the v= param in all its forms
                (?:watch(?:_popup)?(?:\.php)?)?              # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                (?:\?|\#!?)                                  # the params delimiter ? or # or #!
                (?:.*?&)?                                    # any other preceding param (like /?s=tuff&v=xxxx)
            )?                                               # optional -> youtube.com/xxxx is OK
        )?                                                   # all until now is optional -> you can pass the naked ID
        ([0-9A-Za-z_-]+)                                     # here is it! the YouTube video ID
        (?(1).+)?                                            # if we found the ID, everything can follow
    _LANG_URL = r'https://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    _NETRC_MACHINE = 'youtube'
    # Listed in order of quality
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
    # itag -> file extension; most entries of this dict are not visible.
    _video_extensions = {
        '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
    # itag -> "WxH" display string; entries not visible in this view.
    _video_dimensions = {

    # NOTE(review): @classmethod decorator not visible in this view.
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Playlist URLs are handled by YoutubePlaylistIE, not here.
        if YoutubePlaylistIE.suitable(url): return False
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def report_lang(self):
        """Report attempt to set language."""
        self.to_screen(u'Setting language')

    def report_login(self):
        """Report attempt to log in."""
        self.to_screen(u'Logging in')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self.to_screen(u'%s: Downloading video webpage' % video_id)

    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download video info webpage."""
        self.to_screen(u'%s: Downloading video info webpage' % video_id)

    def report_video_subtitles_download(self, video_id):
        """Report that available subtitles are being checked."""
        self.to_screen(u'%s: Checking available subtitles' % video_id)

    def report_video_subtitles_request(self, video_id, sub_lang, format):
        """Report a request for one subtitle language/format pair."""
        self.to_screen(u'%s: Downloading video subtitles for %s.%s' % (video_id, sub_lang, format))

    def report_video_subtitles_available(self, video_id, sub_lang_list):
        """Report available subtitles."""
        sub_lang = ",".join(list(sub_lang_list.keys()))
        self.to_screen(u'%s: Available subtitles for video: %s' % (video_id, sub_lang))

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self.to_screen(u'%s: Extracting video information' % video_id)

    def report_unavailable_format(self, video_id, format):
        """Report that a requested format is not available."""
        self.to_screen(u'%s: Format %s not available' % (video_id, format))

    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self.to_screen(u'RTMP download detected')

    def _get_available_subtitles(self, video_id):
        # Query the timedtext list endpoint; on success the visible code
        # builds a dict mapping lang_code -> language name, on failure it
        # returns an (error_message, None) tuple.
        self.report_video_subtitles_download(video_id)
        request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
        # NOTE(review): enclosing `try:` line not visible in this view.
        sub_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'unable to download video subtitles: %s' % compat_str(err), None)
        sub_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
        sub_lang_list = dict((l[1], l[0]) for l in sub_lang_list)
        if not sub_lang_list:
            return (u'video doesn\'t have subtitles', None)
        # NOTE(review): success-path return not visible in this view.

    def _list_available_subtitles(self, video_id):
        # List subtitles to the screen (used by --list-subs).
        sub_lang_list = self._get_available_subtitles(video_id)
        self.report_video_subtitles_available(video_id, sub_lang_list)

    def _request_subtitle(self, sub_lang, sub_name, video_id, format):
        """Download one subtitle track; returns a tuple
        (error_message, sub_lang, sub)."""
        self.report_video_subtitles_request(video_id, sub_lang, format)
        # NOTE(review): body of the urlencode dict not visible here.
        params = compat_urllib_parse.urlencode({
        url = 'http://www.youtube.com/api/timedtext?' + params
        # NOTE(review): enclosing `try:` line not visible in this view.
        sub = compat_urllib_request.urlopen(url).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'unable to download video subtitles: %s' % compat_str(err), None, None)
        # NOTE(review): `if not sub:` guard not visible in this view.
            return (u'Did not fetch video subtitles', None, None)
        return (None, sub_lang, sub)

    def _extract_subtitle(self, video_id):
        """
        Return a list with a tuple:
        [(error_message, sub_lang, sub)]
        """
        sub_lang_list = self._get_available_subtitles(video_id)
        sub_format = self._downloader.params.get('subtitlesformat')
        if isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles
            return [(sub_lang_list[0], None, None)]
        # Language preference: explicit --sub-lang, then 'en', then the
        # first available language.
        if self._downloader.params.get('subtitleslang', False):
            sub_lang = self._downloader.params.get('subtitleslang')
        elif 'en' in sub_lang_list:
        # NOTE(review): branch bodies partially missing in this view.
            sub_lang = list(sub_lang_list.keys())[0]
        if not sub_lang in sub_lang_list:
            return [(u'no closed captions found in the specified language "%s"' % sub_lang, None, None)]
        subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
        # NOTE(review): final `return [subtitle]` not visible in this view.

    def _extract_all_subtitles(self, video_id):
        # Download every available subtitle language (used by --all-subs).
        sub_lang_list = self._get_available_subtitles(video_id)
        sub_format = self._downloader.params.get('subtitlesformat')
        if isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles
            return [(sub_lang_list[0], None, None)]
        # NOTE(review): `subtitles = []` initialization not visible here.
        for sub_lang in sub_lang_list:
            subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
            subtitles.append(subtitle)
        # NOTE(review): return statement not visible in this view.

    def _print_formats(self, formats):
        # Pretty-print itag / extension / dimensions (used by --list-formats).
        print('Available formats:')
        # NOTE(review): the `for x in formats:` line is not visible here.
        print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))

    def _real_initialize(self):
        # Sets interface language, optionally logs in (explicit
        # username/password or .netrc), then confirms age. Heavily
        # truncated in this view (try: lines and several guards missing).
        if self._downloader is None:

        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))

        # Set interface language to English; a failure here is non-fatal.
        request = compat_urllib_request.Request(self._LANG_URL)
            compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to set language: %s' % compat_str(err))

        # No authentication to be performed

        # Fetch the login page to scrape hidden form tokens.
        request = compat_urllib_request.Request(self._LOGIN_URL)
            login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err))

        # GALX / dsh are anti-forgery tokens embedded in the login form.
        match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page)
            galx = match.group(1)
        match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page)

        # NOTE(review): opening of the login_form_strs dict not visible.
            u'continue': u'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
            u'PersistentCookie': u'yes',
            u'bgresponse': u'js_disabled',
            u'checkConnection': u'',
            u'checkedDomains': u'youtube',
            u'signIn': u'Sign in',
            u'service': u'youtube',
        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
        request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
            login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
            # If the login form is still present, authentication failed.
            if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
                self._downloader.report_warning(u'unable to log in: bad username or password')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))

        # Age confirmation form; failure here IS fatal (raises).
            'action_confirm': 'Confirm',
        request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
        self.report_age_confirmation()
        age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to confirm age: %s' % compat_str(err))

    def _extract_id(self, url):
        """Extract the video ID from a URL via _VALID_URL (group 2)."""
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        # NOTE(review): `if mobj is None:` guard not visible in this view.
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group(2)
        # NOTE(review): return statement not visible in this view.

    def _real_extract(self, url):
        # Extract original video URL from URL with redirection, like age verification, using next_url parameter
        mobj = re.search(self._NEXT_URL_RE, url)
            url = 'https://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
        video_id = self._extract_id(url)

        # Download the watch page (has_verified=1 skips age interstitial).
        self.report_video_webpage_download(video_id)
        url = 'https://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
        request = compat_urllib_request.Request(url)
            video_webpage_bytes = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to download video webpage: %s' % compat_str(err))

        video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')

        # Attempt to extract SWF player URL
        mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
            # Un-escape the JS-escaped URL (\\x -> x).
            player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))

        # Try the get_video_info endpoint with several 'el' values until
        # one response contains a token.
        self.report_video_info_webpage_download(video_id)
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (video_id, el_type))
            video_info_webpage = self._download_webpage(video_info_url, video_id,
                    errnote='unable to download video info webpage')
            video_info = compat_parse_qs(video_info_webpage)
            if 'token' in video_info:
        if 'token' not in video_info:
            if 'reason' in video_info:
                raise ExtractorError(u'YouTube said: %s' % video_info['reason'][0])
                raise ExtractorError(u'"token" parameter not in video info for unknown reason')

        # Check for "rental" videos
        if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
            raise ExtractorError(u'"rental" videos not supported')

        # Start extracting information
        self.report_information_extraction(video_id)

        # uploader (mandatory)
        if 'author' not in video_info:
            raise ExtractorError(u'Unable to extract uploader name')
        video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

        # uploader_id (optional, scraped from the webpage)
        video_uploader_id = None
        mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
            video_uploader_id = mobj.group(1)
            self._downloader.report_warning(u'unable to extract uploader nickname')

        # title (mandatory)
        if 'title' not in video_info:
            raise ExtractorError(u'Unable to extract video title')
        video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])

        # thumbnail (optional)
        if 'thumbnail_url' not in video_info:
            self._downloader.report_warning(u'unable to extract video thumbnail')
        else: # don't panic if we can't find it
            video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

        # upload date, normalized to YYYYMMDD via unified_strdate
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
            upload_date = unified_strdate(upload_date)

        # description: DOM element first, then the <meta> tag fallback
        video_description = get_element_by_id("eow-description", video_webpage)
        if video_description:
            video_description = clean_html(video_description)
            fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
                video_description = unescapeHTML(fd_mobj.group(1))
                video_description = u''

        # subtitles
        video_subtitles = None

        if self._downloader.params.get('writesubtitles', False):
            video_subtitles = self._extract_subtitle(video_id)
                (sub_error, sub_lang, sub) = video_subtitles[0]
                self._downloader.report_error(sub_error)

        if self._downloader.params.get('allsubtitles', False):
            video_subtitles = self._extract_all_subtitles(video_id)
            for video_subtitle in video_subtitles:
                (sub_error, sub_lang, sub) = video_subtitle
                self._downloader.report_error(sub_error)

        if self._downloader.params.get('listsubtitles', False):
            sub_lang_list = self._list_available_subtitles(video_id)

        # duration (optional)
        if 'length_seconds' not in video_info:
            self._downloader.report_warning(u'unable to extract video duration')
            video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])

        video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])

        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            self.report_rtmp_download()
            video_url_list = [(None, video_info['conn'][0])]
        elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
            url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
            url_data = [compat_parse_qs(uds) for uds in url_data_strs]
            url_data = [ud for ud in url_data if 'itag' in ud and 'url' in ud]
            # NOTE(review): 'sig' is read below but is not part of the
            # filter above — a stream entry without 'sig' would raise
            # KeyError here. Confirm against the full file.
            url_map = dict((ud['itag'][0], ud['url'][0] + '&signature=' + ud['sig'][0]) for ud in url_data)

            format_limit = self._downloader.params.get('format_limit', None)
            available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
            if format_limit is not None and format_limit in available_formats:
                format_list = available_formats[available_formats.index(format_limit):]
                format_list = available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                raise ExtractorError(u'no known formats available for video')
            if self._downloader.params.get('listformats', None):
                self._print_formats(existing_formats)
            if req_format is None or req_format == 'best':
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
            elif req_format in ('-1', 'all'):
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
                # Specific formats. We pick the first in a slash-delimeted sequence.
                # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
                req_formats = req_format.split('/')
                video_url_list = None
                for rf in req_formats:
                        video_url_list = [(rf, url_map[rf])]
                if video_url_list is None:
                    raise ExtractorError(u'requested format not available')
            raise ExtractorError(u'no conn or url_encoded_fmt_stream_map information found in video info')

        # Build one info dict per selected format.
        for format_param, video_real_url in video_url_list:
            # Extension
            video_extension = self._video_extensions.get(format_param, 'flv')

            video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
                                              self._video_dimensions.get(format_param, '???'))

            # NOTE(review): the dict opening and the id/url keys preceding
            # these entries are not fully visible in this view.
                'url': video_real_url,
                'uploader': video_uploader,
                'uploader_id': video_uploader_id,
                'upload_date': upload_date,
                'title': video_title,
                'ext': video_extension,
                'format': video_format,
                'thumbnail': video_thumbnail,
                'description': video_description,
                'player_url': player_url,
                'subtitles': video_subtitles,
                'duration': video_duration
        # NOTE(review): results accumulation and final return not visible.
# NOTE(review): several interleaved lines (try:, `if mobj is None:`
# guards, dict/return openings) are missing from this view of the class.
# Comments only — no code has been changed.
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com."""

    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    IE_NAME = u'metacafe'

    def report_disclaimer(self):
        """Report disclaimer retrieval."""
        self.to_screen(u'Retrieving disclaimer')

    def _real_initialize(self):
        # Fetch the disclaimer page, then POST the family-filter form to
        # confirm age. Both failures are fatal (raise ExtractorError).
        # Retrieve disclaimer
        request = compat_urllib_request.Request(self._DISCLAIMER)
            self.report_disclaimer()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to retrieve disclaimer: %s' % compat_str(err))

        # NOTE(review): opening of the disclaimer_form dict not visible.
            'submit': "Continue - I'm over 18",
        request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
            self.report_age_confirmation()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to confirm age: %s' % compat_str(err))

    def _real_extract(self, url):
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group(1)

        # Check if video comes from YouTube
        mobj2 = re.match(r'^yt-(.*)$', video_id)
        if mobj2 is not None:
            # Delegate 'yt-' prefixed IDs to the YouTube extractor.
            return [self.url_result('http://www.youtube.com/watch?v=%s' % mobj2.group(1), 'Youtube')]

        # Retrieve video webpage to extract further information
        webpage = self._download_webpage('http://www.metacafe.com/watch/%s/' % video_id, video_id)

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
            mediaURL = compat_urllib_parse.unquote(mobj.group(1))
            video_extension = mediaURL[-3:]

            # Extract gdaKey if available
            mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
                gdaKey = mobj.group(1)
                video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
            # Fallback path: parse the flashvars blob for mediaData.
            mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
                raise ExtractorError(u'Unable to extract media URL')
            vardict = compat_parse_qs(mobj.group(1))
            if 'mediaData' not in vardict:
                raise ExtractorError(u'Unable to extract media URL')
            mobj = re.search(r'"mediaURL":"(?P<mediaURL>http.*?)",(.*?)"key":"(?P<key>.*?)"', vardict['mediaData'][0])
                raise ExtractorError(u'Unable to extract media URL')
            mediaURL = mobj.group('mediaURL').replace('\\/', '/')
            video_extension = mediaURL[-3:]
            video_url = '%s?__gda__=%s' % (mediaURL, mobj.group('key'))

        mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
            raise ExtractorError(u'Unable to extract title')
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'submitter=(.*?);', webpage)
            raise ExtractorError(u'Unable to extract uploader nickname')
        video_uploader = mobj.group(1)

        # NOTE(review): the `return [{` opening of the result dict is not
        # visible in this view.
            'id': video_id.decode('utf-8'),
            'url': video_url.decode('utf-8'),
            'uploader': video_uploader.decode('utf-8'),
            'title': video_title,
            'ext': video_extension.decode('utf-8'),
# NOTE(review): several interleaved lines (guards, loop bodies, the
# result-dict opening) are missing from this view of the class.
# Comments only — no code has been changed.
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion"""

    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
    IE_NAME = u'dailymotion'

    def _real_extract(self, url):
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
            raise ExtractorError(u'Invalid URL: %s' % url)

        # Strip the slug and query: "id_title?x" -> "id".
        video_id = mobj.group(1).split('_')[0].split('?')[0]

        video_extension = 'mp4'

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        # Disable the family filter so restricted videos are served.
        request.add_header('Cookie', 'family_filter=off')
        webpage = self._download_webpage(request, video_id)

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'\s*var flashvars = (.*)', webpage)
            raise ExtractorError(u'Unable to extract media URL')
        flashvars = compat_urllib_parse.unquote(mobj.group(1))

        # Probe qualities from best to worst; the loop body that selects
        # max_quality is only partially visible in this view.
        for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
                self.to_screen(u'Using %s' % key)
            raise ExtractorError(u'Unable to extract video URL')

        mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
            raise ExtractorError(u'Unable to extract video URL')

        video_url = compat_urllib_parse.unquote(mobj.group(1)).replace('\\/', '/')

        # TODO: support choosing qualities

        mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
            raise ExtractorError(u'Unable to extract title')
        video_title = unescapeHTML(mobj.group('title'))

        video_uploader = None
        mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
            # lookin for official user
            mobj_official = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
            if mobj_official is None:
                self._downloader.report_warning(u'unable to extract uploader nickname')
                video_uploader = mobj_official.group(1)
            video_uploader = mobj.group(1)

        video_upload_date = None
        mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
            # Page shows DD-MM-YYYY; reassemble as YYYYMMDD.
            video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)

        # NOTE(review): the `return [{` opening and id/url keys of the
        # result dict are not visible in this view.
            'uploader': video_uploader,
            'upload_date': video_upload_date,
            'title': video_title,
            'ext': video_extension,
# NOTE(review): several interleaved lines (guards, the JSON-path return
# opening, the fallback video_url assignment) are missing from this view.
# Comments only — no code has been changed.
class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com."""

    # TODO: the original _VALID_URL was:
    # r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
    # Check if it's necessary to keep the old extracion process
    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*(([\?\&]current=)|_)(?P<id>.*)\.(?P<ext>(flv)|(mp4))'
    IE_NAME = u'photobucket'

    def _real_extract(self, url):
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')

        video_extension = mobj.group('ext')

        # Retrieve video webpage to extract further information
        webpage = self._download_webpage(url, video_id)

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        # We try first by looking the javascript code:
        mobj = re.search(r'Pb\.Data\.Shared\.put\(Pb\.Data\.Shared\.MEDIA, (?P<json>.*?)\);', webpage)
            info = json.loads(mobj.group('json'))
            # NOTE(review): the `return [{` opening and id/title keys of
            # this dict are not fully visible in this view.
                'url': info[u'downloadUrl'],
                'uploader': info[u'username'],
                'upload_date': datetime.date.fromtimestamp(info[u'creationDate']).strftime('%Y%m%d'),
                'title': info[u'title'],
                'ext': video_extension,
                'thumbnail': info[u'thumbUrl'],

        # We try looking in other parts of the webpage
        mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
            raise ExtractorError(u'Unable to extract media URL')
        mediaURL = compat_urllib_parse.unquote(mobj.group(1))
        # NOTE(review): the assignment of video_url from mediaURL is not
        # visible in this view.

        mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
            raise ExtractorError(u'Unable to extract title')
        video_title = mobj.group(1).decode('utf-8')

        video_uploader = mobj.group(2).decode('utf-8')

        # NOTE(review): the `return [{` opening of the result dict is not
        # visible in this view.
            'id': video_id.decode('utf-8'),
            'url': video_url.decode('utf-8'),
            'uploader': video_uploader,
            'title': video_title,
            'ext': video_extension.decode('utf-8'),
911 class YahooIE(InfoExtractor):
912 """Information extractor for video.yahoo.com."""
915 # _VALID_URL matches all Yahoo! Video URLs
916 # _VPAGE_URL matches only the extractable '/watch/' URLs
917 _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
918 _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
919 IE_NAME = u'video.yahoo'
    def _real_extract(self, url, new_video=True):
        """Extract downloadable-video info for a video.yahoo.com URL.

        Non-/watch/ URLs are first rewritten to the canonical English
        /watch/ form and the method recurses once (new_video=False).

        NOTE(review): this copy of the source has lost several lines in
        extraction (the ``if mobj is None:`` guards, ``try:`` openers and
        the final ``return`` of the info dict) — restore them from the
        upstream file before running.
        """
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): `if mobj is None:` guard missing above this raise.
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group(2)
        video_extension = 'flv'

        # Rewrite valid but non-extractable URLs as
        # extractable English language /watch/ URLs
        if re.match(self._VPAGE_URL, url) is None:
            request = compat_urllib_request.Request(url)
            # NOTE(review): `try:` opener missing before the urlopen call.
                webpage = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to retrieve video webpage: %s' % compat_str(err))

            # Pull the numeric id/vid pair out of the page scripts.
            mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
                raise ExtractorError(u'Unable to extract id field')
            yahoo_id = mobj.group(1)

            mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
                raise ExtractorError(u'Unable to extract vid field')
            yahoo_vid = mobj.group(1)

            # Recurse once on the canonical /watch/ URL.
            url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
            return self._real_extract(url, new_video=False)

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to retrieve video webpage: %s' % compat_str(err))

        # Extract uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
            raise ExtractorError(u'Unable to extract video title')
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
            raise ExtractorError(u'Unable to extract video uploader')
        video_uploader = mobj.group(1).decode('utf-8')

        # Extract video thumbnail
        mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
            raise ExtractorError(u'Unable to extract video thumbnail')
        video_thumbnail = mobj.group(1).decode('utf-8')

        # Extract video description
        mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
            raise ExtractorError(u'Unable to extract video description')
        video_description = mobj.group(1).decode('utf-8')
        if not video_description:
            video_description = 'No description available.'

        # Extract video height and width
        mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
            raise ExtractorError(u'Unable to extract video height')
        yv_video_height = mobj.group(1)

        mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
            raise ExtractorError(u'Unable to extract video width')
        yv_video_width = mobj.group(1)

        # Retrieve video playlist to extract media URL
        # I'm not completely sure what all these options are, but we
        # seem to need most of them, otherwise the server sends a 401.
        yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
        yv_bitrate = '700' # according to Wikipedia this is hard-coded
        request = compat_urllib_request.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
                '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
                '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to retrieve video webpage: %s' % compat_str(err))

        # Extract media URL from playlist XML
        mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
            raise ExtractorError(u'Unable to extract media URL')
        video_url = compat_urllib_parse.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
        video_url = unescapeHTML(video_url)

        # NOTE(review): `return [{` opener missing before the info fields.
            'id': video_id.decode('utf-8'),
            'uploader': video_uploader,
            'upload_date': None,
            'title': video_title,
            'ext': video_extension.decode('utf-8'),
            'thumbnail': video_thumbnail.decode('utf-8'),
            'description': video_description,
class VimeoIE(InfoExtractor):
    """Information extractor for vimeo.com.

    NOTE(review): several lines of this class were lost in extraction
    (`if mobj is None:` guard, the try/except around the config JSON,
    `else:`/`break` in the codec-selection loops, and the final
    `return [{...}]`) — restore from upstream before running.
    """

    # _VALID_URL matches Vimeo URLs
    _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|player)\.)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)'

    def _real_extract(self, url, new_video=True):
        """Extract video info from a Vimeo page's embedded config JSON."""
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')
        if not mobj.group('proto'):
            url = 'https://' + url
        if mobj.group('direct_link'):
            url = 'https://vimeo.com/' + video_id

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url, None, std_headers)
        webpage = self._download_webpage(request, video_id)

        # Now we begin extracting as much information as we can from what we
        # retrieved. First we extract the information common to all extractors,
        # and latter we extract those that are Vimeo specific.
        self.report_extraction(video_id)

        # Extract the config JSON
        # NOTE(review): `try:` opener missing before the split/loads pair.
            config = webpage.split(' = {config:')[1].split(',assets:')[0]
            config = json.loads(config)
        # NOTE(review): the matching `except:` line is missing here.
            if re.search('The creator of this video has not given you permission to embed it on this domain.', webpage):
                raise ExtractorError(u'The author has restricted the access to this video, try with the "--referer" option')
                raise ExtractorError(u'Unable to extract info section')

        # Extract title
        video_title = config["video"]["title"]

        # Extract uploader and uploader_id
        video_uploader = config["video"]["owner"]["name"]
        video_uploader_id = config["video"]["owner"]["url"].split('/')[-1]

        # Extract video thumbnail
        video_thumbnail = config["video"]["thumbnail"]

        # Extract video description
        video_description = get_element_by_attribute("itemprop", "description", webpage)
        if video_description: video_description = clean_html(video_description)
        else: video_description = u''

        # Extract upload date
        video_upload_date = None
        mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
        if mobj is not None:
            video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)

        # Vimeo specific: extract request signature and timestamp
        sig = config['request']['signature']
        timestamp = config['request']['timestamp']

        # Vimeo specific: extract video codec and quality information
        # First consider quality, then codecs, then take everything
        # TODO bind to format param
        codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
        files = { 'hd': [], 'sd': [], 'other': []}
        for codec_name, codec_extension in codecs:
            if codec_name in config["video"]["files"]:
                if 'hd' in config["video"]["files"][codec_name]:
                    files['hd'].append((codec_name, codec_extension, 'hd'))
                elif 'sd' in config["video"]["files"][codec_name]:
                    files['sd'].append((codec_name, codec_extension, 'sd'))
                # NOTE(review): `else:` line missing above the 'other' append.
                    files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))

        # Pick the first available quality bucket, best first.
        for quality in ('hd', 'sd', 'other'):
            if len(files[quality]) > 0:
                video_quality = files[quality][0][2]
                video_codec = files[quality][0][0]
                video_extension = files[quality][0][1]
                self.to_screen(u'%s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
        # NOTE(review): `break` and the for-`else:` lines are missing here.
            raise ExtractorError(u'No known codec found')

        video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
                    %(video_id, sig, timestamp, video_quality, video_codec.upper())

        # NOTE(review): `return [{` opener (with 'id'/'url' entries) missing.
            'uploader': video_uploader,
            'uploader_id': video_uploader_id,
            'upload_date': video_upload_date,
            'title': video_title,
            'ext': video_extension,
            'thumbnail': video_thumbnail,
            'description': video_description,
class ArteTvIE(InfoExtractor):
    """arte.tv information extractor.

    Scrapes videos.arte.tv pages (fr/de). Live streams and "Plus 7"
    catch-up streams are handled by separate helper methods.

    NOTE(review): this copy is missing lines throughout (grep_webpage's
    `info = {}` / `return info`, several call arguments, and return
    statements) — restore from upstream before running.
    """

    _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
    _LIVE_URL = r'index-[0-9]+\.html$'

    IE_NAME = u'arte.tv'

    def fetch_webpage(self, url):
        """Download *url* and return the raw page body."""
        request = compat_urllib_request.Request(url)
        # NOTE(review): `try:` opener missing before the download.
            self.report_download_webpage(url)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to retrieve video webpage: %s' % compat_str(err))
        except ValueError as err:
            raise ExtractorError(u'Invalid URL: %s' % url)
        # NOTE(review): `return webpage` missing here.

    def grep_webpage(self, url, regex, regexFlags, matchTuples):
        """Fetch *url*, apply *regex*, and map groups to a dict.

        matchTuples is a list of (group_index, key, error_message); each
        named group is stored under *key*, or *error_message* is raised.
        """
        page = self.fetch_webpage(url)
        mobj = re.search(regex, page, regexFlags)
        # NOTE(review): `info = {}` and the `if mobj is None:` guard missing.
            raise ExtractorError(u'Invalid URL: %s' % url)

        for (i, key, err) in matchTuples:
            if mobj.group(i) is None:
                raise ExtractorError(err)
            # NOTE(review): `else:` line missing above the assignment.
                info[key] = mobj.group(i)
        # NOTE(review): `return info` missing here.

    def extractLiveStream(self, url):
        """Resolve a live-stream page to its RTMP path/player/url triple."""
        video_lang = url.split('/')[-4]
        info = self.grep_webpage(
            r'src="(.*?/videothek_js.*?\.js)',
            # NOTE(review): positional arguments (url, flags, list opener)
            # are missing around this tuple in this copy.
            (1, 'url', u'Invalid URL: %s' % url)

        http_host = url.split('/')[2]
        next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
        info = self.grep_webpage(
            r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
                '(http://.*?\.swf).*?' +
            (1, 'path', u'could not extract video path: %s' % url),
            (2, 'player', u'could not extract video player: %s' % url),
            (3, 'url', u'could not extract video url: %s' % url)

        video_url = u'%s/%s' % (info.get('url'), info.get('path'))

    def extractPlus7Stream(self, url):
        """Follow the Plus-7 redirect chain down to the HD media URL."""
        video_lang = url.split('/')[-3]
        info = self.grep_webpage(
            r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
            (1, 'url', u'Invalid URL: %s' % url)

        next_url = compat_urllib_parse.unquote(info.get('url'))
        info = self.grep_webpage(
            r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
            (1, 'url', u'Could not find <video> tag: %s' % url)

        next_url = compat_urllib_parse.unquote(info.get('url'))

        info = self.grep_webpage(
            r'<video id="(.*?)".*?>.*?' +
                '<name>(.*?)</name>.*?' +
                '<dateVideo>(.*?)</dateVideo>.*?' +
                '<url quality="hd">(.*?)</url>',
            (1, 'id', u'could not extract video id: %s' % url),
            (2, 'title', u'could not extract video title: %s' % url),
            (3, 'date', u'could not extract video date: %s' % url),
            (4, 'url', u'could not extract video url: %s' % url)

        # NOTE(review): `return {` opener missing above the info fields.
            'id': info.get('id'),
            'url': compat_urllib_parse.unquote(info.get('url')),
            'uploader': u'arte.tv',
            'upload_date': unified_strdate(info.get('date')),
            'title': info.get('title').decode('utf-8'),

    def _real_extract(self, url):
        """Dispatch to live-stream or Plus-7 extraction by URL shape."""
        video_id = url.split('/')[-1]
        self.report_extraction(video_id)

        if re.search(self._LIVE_URL, video_id) is not None:
            self.extractLiveStream(url)
        # NOTE(review): `return` / `else:` lines missing around here.
            info = self.extractPlus7Stream(url)
        # NOTE(review): final `return [info]` missing here.
class GenericIE(InfoExtractor):
    """Generic last-resort information extractor.

    Follows URL-shortener redirects (via HEAD requests), then scrapes
    the page for common flash-player 'file='/'source=' patterns.

    NOTE(review): this copy is missing lines (HeadRequest's return,
    keyword arguments of the rebuilt requests, redirect early-returns,
    and the final `return [{...}]`) — restore from upstream before use.
    """

    IE_NAME = u'generic'

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        # Warn (outside of tests) that the generic fallback is in use.
        if not self._downloader.params.get('test', False):
            self._downloader.report_warning(u'Falling back on generic information extractor.')
        super(GenericIE, self).report_download_webpage(video_id)

    def report_following_redirect(self, new_url):
        """Report information extraction."""
        self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)

    def _test_redirect(self, url):
        """Check if it is a redirect, like url shorteners, in case return the new url."""
        class HeadRequest(compat_urllib_request.Request):
            # Force the HTTP method to HEAD so only headers are fetched.
            def get_method(self):
                # NOTE(review): `return "HEAD"` body line missing.

        class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
            # NOTE(review): docstring delimiters lost in this copy:
            # Subclass the HTTPRedirectHandler to make it use our
            # HeadRequest also on the redirected URL
            def redirect_request(self, req, fp, code, msg, headers, newurl):
                if code in (301, 302, 303, 307):
                    newurl = newurl.replace(' ', '%20')
                    newheaders = dict((k,v) for k,v in req.headers.items()
                                      if k.lower() not in ("content-length", "content-type"))
                    return HeadRequest(newurl,
                                       # NOTE(review): headers= kwarg line missing.
                                       origin_req_host=req.get_origin_req_host(),
                # NOTE(review): unverifiable= kwarg and `else:` lines missing.
                    raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)

        class HTTPMethodFallback(compat_urllib_request.BaseHandler):
            # NOTE(review): docstring delimiters lost in this copy:
            # Fallback to GET if HEAD is not allowed (405 HTTP error)
            def http_error_405(self, req, fp, code, msg, headers):
                # NOTE(review): fp.read()/fp.close() lines missing here.
                newheaders = dict((k,v) for k,v in req.headers.items()
                                  if k.lower() not in ("content-length", "content-type"))
                return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
                                        # NOTE(review): headers= kwarg line missing.
                                        origin_req_host=req.get_origin_req_host(),

        # Build a minimal opener with HEAD + fallback handlers.
        opener = compat_urllib_request.OpenerDirector()
        for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
                        HTTPMethodFallback, HEADRedirectHandler,
                        compat_urllib_request.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
            opener.add_handler(handler())

        response = opener.open(HeadRequest(url))
        new_url = response.geturl()

        # NOTE(review): `if url == new_url: return False` missing here.
        self.report_following_redirect(new_url)
        # NOTE(review): `return new_url` missing here.

    def _real_extract(self, url):
        new_url = self._test_redirect(url)
        if new_url: return [self.url_result(new_url)]

        video_id = url.split('/')[-1]
        # NOTE(review): `try:` opener missing before the download.
            webpage = self._download_webpage(url, video_id)
        except ValueError as err:
            # since this is the last-resort InfoExtractor, if
            # this error is thrown, it'll be thrown here
            raise ExtractorError(u'Invalid URL: %s' % url)

        self.report_extraction(video_id)
        # Start with something easy: JW Player in SWFObject
        mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
        # NOTE(review): `if mobj is None:` guards missing between these searches.
            # Broaden the search a little bit
            mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
            # Broaden the search a little bit: JWPlayer JS loader
            mobj = re.search(r'[^A-Za-z0-9]?file:\s*["\'](http[^\'"&]*)', webpage)
            raise ExtractorError(u'Invalid URL: %s' % url)

        # It's possible that one of the regexes
        # matched, but returned an empty group:
        if mobj.group(1) is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_url = compat_urllib_parse.unquote(mobj.group(1))
        video_id = os.path.basename(video_url)

        # here's a fun little line of code for you:
        video_extension = os.path.splitext(video_id)[1][1:]
        video_id = os.path.splitext(video_id)[0]

        # it's tempting to parse this further, but you would
        # have to take into account all the variations like
        #   Video Title - Site Name
        #   Site Name | Video Title
        #   Video Title - Tagline | Site Name
        # and so on and so forth; it's just not practical
        mobj = re.search(r'<title>(.*)</title>', webpage)
            raise ExtractorError(u'Unable to extract title')
        video_title = mobj.group(1)

        # video uploader is domain name
        mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
            raise ExtractorError(u'Unable to extract title')
        video_uploader = mobj.group(1)

        # NOTE(review): `return [{` opener with 'id'/'url' entries missing.
            'uploader': video_uploader,
            'upload_date': None,
            'title': video_title,
            'ext': video_extension,
class YoutubeSearchIE(InfoExtractor):
    """Information Extractor for YouTube search queries.

    Handles ytsearch[N|all]:<query> pseudo-URLs via the GData API.

    NOTE(review): lines lost in this copy include the `if mobj is None:`
    guard, the `if prefix == '':` / `else: try: n = int(prefix)` frame,
    loop-counter initialisation in _get_n_results and its final return —
    restore from upstream before running.
    """

    _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _max_youtube_results = 1000
    IE_NAME = u'youtube:search'

    def report_download_page(self, query, pagenum):
        """Report attempt to download search page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse the ytsearch prefix and delegate to _get_n_results."""
        mobj = re.match(self._VALID_URL, query)
            raise ExtractorError(u'Invalid search query "%s"' % query)

        prefix, query = query.split(':')
        query = query.encode('utf-8')
            return self._get_n_results(query, 1)
        elif prefix == 'all':
            self._get_n_results(query, self._max_youtube_results)
                raise ExtractorError(u'Invalid download number %s for query "%s"' % (n, query))
            elif n > self._max_youtube_results:
                self._downloader.report_warning(u'ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
                n = self._max_youtube_results
            return self._get_n_results(query, n)
        except ValueError: # parsing prefix as integer fails
            return self._get_n_results(query, 1)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""

        # NOTE(review): `video_ids = []`, `pagenum = 0`, `limit = n`
        # initialisation lines are missing above this loop.
        while (50 * pagenum) < limit:
            self.report_download_page(query, pagenum+1)
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
            request = compat_urllib_request.Request(result_url)
            # NOTE(review): `try:` opener missing before the download.
                data = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to download API page: %s' % compat_str(err))
            api_response = json.loads(data)['data']

            if not 'items' in api_response:
                raise ExtractorError(u'[youtube] No video results')

            new_ids = list(video['id'] for video in api_response['items'])
            video_ids += new_ids

            # Clamp to what the API says actually exists.
            limit = min(n, api_response['totalItems'])
            # NOTE(review): `pagenum += 1` missing here.

        if len(video_ids) > n:
            video_ids = video_ids[:n]
        videos = [self.url_result('http://www.youtube.com/watch?v=%s' % id, 'Youtube') for id in video_ids]
        # NOTE(review): `return videos` missing here.
class GoogleSearchIE(InfoExtractor):
    """Information Extractor for Google Video search queries.

    Handles gvsearch[N|all]:<query> pseudo-URLs; returns a playlist dict.

    NOTE(review): lines lost in this copy include the prefix-dispatch
    frame (`if prefix == '':` / `else:` / `n = int(prefix)`), the result
    dict's 'id'/'entries' keys, the entry dict opener, and the final
    `return res` — restore from upstream before running.
    """

    _VALID_URL = r'gvsearch(?P<prefix>|\d+|all):(?P<query>[\s\S]+)'
    _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
    _max_google_results = 1000
    IE_NAME = u'video.google:search'

    def _real_extract(self, query):
        """Parse the gvsearch prefix and delegate to _get_n_results."""
        mobj = re.match(self._VALID_URL, query)

        prefix = mobj.group('prefix')
        query = mobj.group('query')
            return self._get_n_results(query, 1)
        elif prefix == 'all':
            return self._get_n_results(query, self._max_google_results)
                raise ExtractorError(u'invalid download number %s for query "%s"' % (n, query))
            elif n > self._max_google_results:
                self._downloader.report_warning(u'gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
                n = self._max_google_results
            return self._get_n_results(query, n)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""

        # NOTE(review): `res = {` opener and 'id'/'entries' keys missing.
            '_type': 'playlist',

        for pagenum in itertools.count(1):
            result_url = u'http://video.google.com/videosearch?q=%s&start=%s&hl=en' % (compat_urllib_parse.quote_plus(query), pagenum*10)
            webpage = self._download_webpage(result_url, u'gvsearch:' + query,
                                             note='Downloading result page ' + str(pagenum))

            for mobj in re.finditer(r'<h3 class="r"><a href="([^"]+)"', webpage):
                # NOTE(review): `e = {` opener with '_type': 'url' missing.
                    'url': mobj.group(1)
                res['entries'].append(e)

            # Stop when enough results gathered or no next-page link.
            if (pagenum * 10 > n) or not re.search(self._MORE_PAGES_INDICATOR, webpage):
                # NOTE(review): `return res` missing here.
class YahooSearchIE(InfoExtractor):
    """Information Extractor for Yahoo! Video search queries.

    Handles yvsearch[N|all]:<query> pseudo-URLs; pages through result
    pages and queues each found /watch/ URL for download.

    NOTE(review): lines lost in this copy include the prefix-dispatch
    frame, loop initialisation (`video_ids = []`, `pagenum`, `while`),
    and the `return` statements after the download loops — restore from
    upstream before running.
    """

    _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
    _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
    _MORE_PAGES_INDICATOR = r'\s*Next'
    _max_yahoo_results = 1000
    IE_NAME = u'video.yahoo:search'

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self.to_screen(u'query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse the yvsearch prefix and delegate to _download_n_results."""
        mobj = re.match(self._VALID_URL, query)
            raise ExtractorError(u'Invalid search query "%s"' % query)

        prefix, query = query.split(':')
        query = query.encode('utf-8')
            self._download_n_results(query, 1)
        elif prefix == 'all':
            self._download_n_results(query, self._max_yahoo_results)
                raise ExtractorError(u'Invalid download number %s for query "%s"' % (n, query))
            elif n > self._max_yahoo_results:
                self._downloader.report_warning(u'yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
                n = self._max_yahoo_results
            self._download_n_results(query, n)

        except ValueError: # parsing prefix as integer fails
            self._download_n_results(query, 1)

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        # NOTE(review): `video_ids = []`, `pagenum = 1` and `while True:`
        # initialisation lines are missing above.
        already_seen = set()

            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum)
            request = compat_urllib_request.Request(result_url)
            # NOTE(review): `try:` opener missing before the download.
                page = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to download webpage: %s' % compat_str(err))

            # Extract video identifiers
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                video_id = mobj.group(1)
                if video_id not in already_seen:
                    video_ids.append(video_id)
                    already_seen.add(video_id)
                    if len(video_ids) == n:
                        # Specified n videos reached
                        for id in video_ids:
                            self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
                        # NOTE(review): `return` missing here.

            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                for id in video_ids:
                    self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
                # NOTE(review): `return` missing here.

            pagenum = pagenum + 1
class YoutubePlaylistIE(InfoExtractor):
    """Information Extractor for YouTube playlists.

    NOTE(review): this copy is missing lines: several branches of the
    verbose _VALID_URL regex, the `_MAX_RESULTS` class attribute that
    _real_extract uses, loop initialisation and `break` statements —
    restore from upstream before running.
    """

    _VALID_URL = r"""(?:
                        (?:course|view_play_list|my_playlists|artist|playlist|watch)
                        \? (?:.*?&)*? (?:p|a|list)=
                        ((?:PL|EC|UU)?[0-9A-Za-z-_]{10,})
                        ((?:PL|EC|UU)[0-9A-Za-z-_]{10,})
    _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json'
    IE_NAME = u'youtube:playlist'

    # NOTE(review): the `@classmethod` decorator for suitable() appears
    # to be missing in this copy.
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # re.VERBOSE is required because _VALID_URL is a verbose pattern.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        """Page through the GData playlist feed and emit a playlist result."""
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            raise ExtractorError(u'Invalid URL: %s' % url)

        # Download playlist videos from API
        playlist_id = mobj.group(1) or mobj.group(2)

        # NOTE(review): `page_num`/`videos` initialisation and the
        # `while True:` opener are missing above this loop body.
            url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, self._MAX_RESULTS * (page_num - 1) + 1)
            page = self._download_webpage(url, playlist_id, u'Downloading page #%s' % page_num)

            # NOTE(review): `try:` opener missing here.
                response = json.loads(page)
            except ValueError as err:
                raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))

            if 'feed' not in response:
                raise ExtractorError(u'Got a malformed response from YouTube API')
            playlist_title = response['feed']['title']['$t']
            if 'entry' not in response['feed']:
                # Number of videos is a multiple of self._MAX_RESULTS
                # NOTE(review): `break` missing here.

            # Keep (position, url) pairs so order can be restored below.
            videos += [ (entry['yt$position']['$t'], entry['content']['src'])
                        for entry in response['feed']['entry']
                        if 'content' in entry ]

            if len(response['feed']['entry']) < self._MAX_RESULTS:
                # NOTE(review): `break` and `page_num += 1` missing here.

        # Sort by playlist position, then drop the position component.
        videos = [v[1] for v in sorted(videos)]

        url_results = [self.url_result(url, 'Youtube') for url in videos]
        return [self.playlist_result(url_results, playlist_id, playlist_title)]
class YoutubeChannelIE(InfoExtractor):
    """Information Extractor for YouTube channels.

    Downloads the channel's list view, then pages through the
    channel_ajax JSON endpoint while a load-more widget is present.

    NOTE(review): lines lost in this copy: `ids_in_page = []` /
    `return ids_in_page` in extract_videos_from_page, the
    `video_ids`/`pagenum` initialisation, the `while True:` opener and
    its `break` — restore from upstream before running.
    """

    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
    _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
    _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
    _MORE_PAGES_URL = 'http://www.youtube.com/channel_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
    IE_NAME = u'youtube:channel'

    def extract_videos_from_page(self, page):
        """Return the unique watch?v= ids found in *page*, in order."""
        # NOTE(review): `ids_in_page = []` initialisation missing here.
        for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
            if mobj.group(1) not in ids_in_page:
                ids_in_page.append(mobj.group(1))
        # NOTE(review): `return ids_in_page` missing here.

    def _real_extract(self, url):
        """Collect all video ids of a channel and emit a playlist result."""
        # Extract channel id
        mobj = re.match(self._VALID_URL, url)
            raise ExtractorError(u'Invalid URL: %s' % url)

        # Download channel page
        channel_id = mobj.group(1)
        # NOTE(review): `video_ids = []` and `pagenum = 1` missing here.
        url = self._TEMPLATE_URL % (channel_id, pagenum)
        page = self._download_webpage(url, channel_id,
                                      u'Downloading page #%s' % pagenum)

        # Extract video identifiers
        ids_in_page = self.extract_videos_from_page(page)
        video_ids.extend(ids_in_page)

        # Download any subsequent channel pages using the json-based channel_ajax query
        if self._MORE_PAGES_INDICATOR in page:
            # NOTE(review): `while True:` opener missing here.
                pagenum = pagenum + 1

                url = self._MORE_PAGES_URL % (pagenum, channel_id)
                page = self._download_webpage(url, channel_id,
                                              u'Downloading page #%s' % pagenum)

                page = json.loads(page)

                ids_in_page = self.extract_videos_from_page(page['content_html'])
                video_ids.extend(ids_in_page)

                if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']:
                    # NOTE(review): `break` missing here.

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        urls = ['http://www.youtube.com/watch?v=%s' % id for id in video_ids]
        url_entries = [self.url_result(url, 'Youtube') for url in urls]
        return [self.playlist_result(url_entries, channel_id)]
class YoutubeUserIE(InfoExtractor):
    """Information Extractor for YouTube users.

    Pages through the user's GData uploads feed in _GDATA_PAGE_SIZE
    chunks until a short page signals the end.

    NOTE(review): lines lost in this copy: the `if mobj is None:` guard,
    `video_ids`/`pagenum` initialisation, the `while True:` opener,
    `ids_in_page = []`, `break`, and `pagenum += 1` — restore from
    upstream before running.
    """

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
    _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
    IE_NAME = u'youtube:user'

    def _real_extract(self, url):
        """Collect all upload ids of a user and emit a playlist result."""
        mobj = re.match(self._VALID_URL, url)
            raise ExtractorError(u'Invalid URL: %s' % url)

        username = mobj.group(1)

        # Download video ids using YouTube Data API. Result size per
        # query is limited (currently to 50 videos) so we need to query
        # page by page until there are no video ids - it means we got

        # NOTE(review): loop initialisation and `while True:` missing here.
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1

            gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
            page = self._download_webpage(gdata_url, username,
                                          u'Downloading video ids from %d to %d' % (start_index, start_index + self._GDATA_PAGE_SIZE))

            # Extract video identifiers

            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(mobj.group(1))

            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query

            if len(ids_in_page) < self._GDATA_PAGE_SIZE:
                # NOTE(review): `break` missing here.

        urls = ['http://www.youtube.com/watch?v=%s' % video_id for video_id in video_ids]
        url_results = [self.url_result(url, 'Youtube') for url in urls]
        return [self.playlist_result(url_results, playlist_title = username)]
class BlipTVUserIE(InfoExtractor):
    """Information Extractor for blip.tv users.

    Resolves the numeric users_id from the mobile page, then pages
    through the episode-list AJAX endpoint.

    NOTE(review): lines lost in this copy: the `_PAGE_SIZE` class
    attribute referenced below, the `if mobj is None:` guard, loop
    initialisation (`video_ids`, `pagenum`, `while True:`),
    `ids_in_page = []`, `break`, and `pagenum += 1` — restore from
    upstream before running.
    """

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
    IE_NAME = u'blip.tv:user'

    def _real_extract(self, url):
        """Collect all video page slugs of a user; emit a playlist result."""
        mobj = re.match(self._VALID_URL, url)
            raise ExtractorError(u'Invalid URL: %s' % url)

        username = mobj.group(1)

        page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'

        page = self._download_webpage(url, username, u'Downloading user page')
        mobj = re.search(r'data-users-id="([^"]+)"', page)
        page_base = page_base % mobj.group(1)

        # Download video ids using BlipTV Ajax calls. Result size per
        # query is limited (currently to 12 videos) so we need to query
        # page by page until there are no video ids - it means we got

        # NOTE(review): loop initialisation and `while True:` missing here.
            url = page_base + "&page=" + str(pagenum)
            page = self._download_webpage(url, username,
                                          u'Downloading video ids from page %d' % pagenum)

            # Extract video identifiers

            for mobj in re.finditer(r'href="/([^"]+)"', page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(unescapeHTML(mobj.group(1)))

            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query

            if len(ids_in_page) < self._PAGE_SIZE:
                # NOTE(review): `break` missing here.

        urls = [u'http://blip.tv/%s' % video_id for video_id in video_ids]
        url_entries = [self.url_result(url, 'BlipTV') for url in urls]
        return [self.playlist_result(url_entries, playlist_title = username)]
class DepositFilesIE(InfoExtractor):
    """Information extractor for depositfiles.com

    Rewrites the URL to the English locale, simulates pressing the
    'Free download' button, and scrapes the real fileshare URL.

    NOTE(review): lines lost in this copy: the `try:` opener, an
    `else:` branch, the `if mobj is None:` guard before the title
    extraction, and the `return [{` opener — restore from upstream
    before running.
    """

    _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'

    def _real_extract(self, url):
        """Extract the direct download URL and title for a hosted file."""
        file_id = url.split('/')[-1]
        # Rebuild url in english locale
        url = 'http://depositfiles.com/en/files/' + file_id

        # Retrieve file webpage with 'Free download' button pressed
        free_download_indication = { 'gateway_result' : '1' }
        request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
        # NOTE(review): `try:` opener missing before the download.
            self.report_download_webpage(file_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to retrieve file webpage: %s' % compat_str(err))

        # Search for the real file URL
        mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
        if (mobj is None) or (mobj.group(1) is None):
            # Try to figure out reason of the error.
            mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
            if (mobj is not None) and (mobj.group(1) is not None):
                restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
                raise ExtractorError(u'%s' % restriction_message)
            # NOTE(review): `else:` line missing above this raise.
                raise ExtractorError(u'Unable to extract download URL from: %s' % url)

        file_url = mobj.group(1)
        file_extension = os.path.splitext(file_url)[1][1:]

        # Search for file title
        mobj = re.search(r'<b title="(.*?)">', webpage)
            raise ExtractorError(u'Unable to extract title')
        file_title = mobj.group(1).decode('utf-8')

        # NOTE(review): `return [{` opener missing above the info fields.
            'id': file_id.decode('utf-8'),
            'url': file_url.decode('utf-8'),
            'upload_date': None,
            'title': file_title,
            'ext': file_extension.decode('utf-8'),
class FacebookIE(InfoExtractor):
    """Information Extractor for Facebook

    Optionally logs in with --username/--password or .netrc, then
    parses the swf parameter JSON out of the video page.

    NOTE(review): lines lost in this copy: early `return`s in
    _real_initialize, the `useremail`/`password` initialisation, the
    `login_form` dict, several `try:` openers and `if` guards, and the
    `info = {` opener / `return [info]` in _real_extract — restore
    from upstream before running.
    """

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
    _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
    _NETRC_MACHINE = 'facebook'
    IE_NAME = u'facebook'

    def report_login(self):
        """Report attempt to log in."""
        self.to_screen(u'Logging in')

    def _real_initialize(self):
        """Best-effort login; failures only produce warnings."""
        if self._downloader is None:
            # NOTE(review): `return` missing here.

        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            useremail = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            # NOTE(review): `try:` opener missing here.
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    # NOTE(review): useremail/password assignments and
                    # the `else:` line are missing here.
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))

        if useremail is None:
            # NOTE(review): `return` and the `login_form = {...}` dict
            # are missing between here and the request below.
        request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
        # NOTE(review): `try:` / report_login() lines missing here.
            login_results = compat_urllib_request.urlopen(request).read()
            if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
                self._downloader.report_warning(u'unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))

    def _real_extract(self, url):
        """Extract title, URL and metadata from a Facebook video page."""
        mobj = re.match(self._VALID_URL, url)
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('ID')

        url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
        webpage = self._download_webpage(url, video_id)

        # The swf parameters are serialized between these two markers.
        BEFORE = '{swf.addParam(param[0], param[1]);});\n'
        AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
        m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
            raise ExtractorError(u'Cannot parse data')
        data = dict(json.loads(m.group(1)))
        params_raw = compat_urllib_parse.unquote(data['params'])
        params = json.loads(params_raw)
        video_data = params['video_data'][0]
        # Prefer the HD source, falling back to SD.
        video_url = video_data.get('hd_src')
            video_url = video_data['sd_src']
            raise ExtractorError(u'Cannot find video URL')
        video_duration = int(video_data['video_duration'])
        thumbnail = video_data['thumbnail_src']

        m = re.search('<h2 class="uiHeaderTitle">([^<]+)</h2>', webpage)
            raise ExtractorError(u'Cannot find title in webpage')
        video_title = unescapeHTML(m.group(1))

        # NOTE(review): `info = {` opener (with 'id'/'url') and the
        # closing `return [info]` are missing around these fields.
            'title': video_title,
            'duration': video_duration,
            'thumbnail': thumbnail,
# Extractor for blip.tv: resolves /play/ redirect URLs, detects direct video
# downloads via the Content-Type header, and otherwise parses the site's JSON
# API response. NOTE(review): sampled listing with elided lines (matching
# `try:` statements and return/info-dict openers are missing); code kept
# byte-identical, comments only.
1975 class BlipTVIE(InfoExtractor):
1976 """Information extractor for blip.tv"""
1978 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
1979 _URL_EXT = r'^.*\.([a-z0-9]+)$'
1980 IE_NAME = u'blip.tv'
1982 def report_direct_download(self, title):
1983 """Report information extraction."""
1984 self.to_screen(u'%s: Direct download detected' % title)
1986 def _real_extract(self, url):
1987 mobj = re.match(self._VALID_URL, url)
1989 raise ExtractorError(u'Invalid URL: %s' % url)
# /play/ URLs redirect to a player page whose URL fragment carries the file
# id; rewrite to the canonical /a/a-<id> URL and re-run extraction on it.
1991 urlp = compat_urllib_parse_urlparse(url)
1992 if urlp.path.startswith('/play/'):
1993 request = compat_urllib_request.Request(url)
1994 response = compat_urllib_request.urlopen(request)
1995 redirecturl = response.geturl()
1996 rurlp = compat_urllib_parse_urlparse(redirecturl)
1997 file_id = compat_parse_qs(rurlp.fragment)['file'][0].rpartition('/')[2]
1998 url = 'http://blip.tv/a/a-' + file_id
1999 return self._real_extract(url)
2006 json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
# The iTunes user agent makes the server answer with a direct download for
# some URLs (detected below via Content-Type).
2007 request = compat_urllib_request.Request(json_url)
2008 request.add_header('User-Agent', 'iTunes/10.6.1')
2009 self.report_extraction(mobj.group(1))
2012 urlh = compat_urllib_request.urlopen(request)
2013 if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
2014 basename = url.split('/')[-1]
2015 title,ext = os.path.splitext(basename)
2016 title = title.decode('UTF-8')
2017 ext = ext.replace('.', '')
2018 self.report_direct_download(title)
2023 'upload_date': None,
2028 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2029 raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
2030 if info is None: # Regular URL
2032 json_code_bytes = urlh.read()
2033 json_code = json_code_bytes.decode('utf-8')
2034 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2035 raise ExtractorError(u'Unable to read video info webpage: %s' % compat_str(err))
2038 json_data = json.loads(json_code)
2039 if 'Post' in json_data:
2040 data = json_data['Post']
# API datestamp like '05-31-12 10:04PM' is normalized to YYYYMMDD.
2044 upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
2045 video_url = data['media']['url']
2046 umobj = re.match(self._URL_EXT, video_url)
2048 raise ValueError('Can not determine filename extension')
2049 ext = umobj.group(1)
2052 'id': data['item_id'],
2054 'uploader': data['display_name'],
2055 'upload_date': upload_date,
2056 'title': data['title'],
2058 'format': data['media']['mimeType'],
2059 'thumbnail': data['thumbnailUrl'],
2060 'description': data['description'],
2061 'player_url': data['embedUrl'],
2062 'user_agent': 'iTunes/10.6.1',
2064 except (ValueError,KeyError) as err:
2065 raise ExtractorError(u'Unable to parse video information: %s' % repr(err))
# Extractor for myvideo.de: derives the FLV URL from the thumbnail image_src
# link on the watch page. NOTE(review): sampled listing with elided guard and
# return lines; code kept byte-identical, comments only.
2070 class MyVideoIE(InfoExtractor):
2071 """Information Extractor for myvideo.de."""
2073 _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
2074 IE_NAME = u'myvideo'
2076 def _real_extract(self,url):
2077 mobj = re.match(self._VALID_URL, url)
2079 raise ExtractorError(u'Invalid URL: %s' % url)
2081 video_id = mobj.group(1)
2084 webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
2085 webpage = self._download_webpage(webpage_url, video_id)
2087 self.report_extraction(video_id)
# The media base URL is embedded in the page's image_src thumbnail link.
2088 mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/.*?\.jpg\'',
2091 raise ExtractorError(u'Unable to extract media URL')
2092 video_url = mobj.group(1) + ('/%s.flv' % video_id)
2094 mobj = re.search('<title>([^<]+)</title>', webpage)
2096 raise ExtractorError(u'Unable to extract title')
2098 video_title = mobj.group(1)
2104 'upload_date': None,
2105 'title': video_title,
# Extractor for The Daily Show / The Colbert Report: expands :tds/:colbert
# shortcuts, follows redirects to a concrete episode, reads the MRSS show
# index, then a per-item mediagen config, and finally rewrites the RTMP URL
# to a plain HTTP one. NOTE(review): sampled listing — many elided lines
# (loop/`try:` openers, format-selection loop, returns); code kept
# byte-identical, comments only.
2109 class ComedyCentralIE(InfoExtractor):
2110 """Information extractor for The Daily Show and Colbert Report """
2112 # urls can be abbreviations like :thedailyshow or :colbert
2113 # urls for episodes like:
2114 # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
2115 # or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
2116 # or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
2117 _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
2118 |(https?://)?(www\.)?
2119 (?P<showname>thedailyshow|colbertnation)\.com/
2120 (full-episodes/(?P<episode>.*)|
2122 (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
2123 |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
2126 _available_formats = ['3500', '2200', '1700', '1200', '750', '400']
2128 _video_extensions = {
2136 _video_dimensions = {
# Overrides the base-class suitable() because _VALID_URL is a verbose regex.
2146 def suitable(cls, url):
2147 """Receives a URL and returns True if suitable for this IE."""
2148 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
2150 def _print_formats(self, formats):
2151 print('Available formats:')
2153 print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))
2156 def _real_extract(self, url):
2157 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2159 raise ExtractorError(u'Invalid URL: %s' % url)
# :tds / :colbert shortcuts expand to the show's full-episodes front page.
2161 if mobj.group('shortname'):
2162 if mobj.group('shortname') in ('tds', 'thedailyshow'):
2163 url = u'http://www.thedailyshow.com/full-episodes/'
2165 url = u'http://www.colbertnation.com/full-episodes/'
2166 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2167 assert mobj is not None
2169 if mobj.group('clip'):
2170 if mobj.group('showname') == 'thedailyshow':
2171 epTitle = mobj.group('tdstitle')
2173 epTitle = mobj.group('cntitle')
2176 dlNewest = not mobj.group('episode')
2178 epTitle = mobj.group('showname')
2180 epTitle = mobj.group('episode')
2182 self.report_extraction(epTitle)
# The front page redirects to the newest episode; re-match the final URL.
2183 webpage,htmlHandle = self._download_webpage_handle(url, epTitle)
2185 url = htmlHandle.geturl()
2186 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2188 raise ExtractorError(u'Invalid redirected URL: ' + url)
2189 if mobj.group('episode') == '':
2190 raise ExtractorError(u'Redirected URL is still not specific: ' + url)
2191 epTitle = mobj.group('episode')
2193 mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)
2195 if len(mMovieParams) == 0:
2196 # The Colbert Report embeds the information in a without
2197 # a URL prefix; so extract the alternate reference
2198 # and then add the URL prefix manually.
2200 altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
2201 if len(altMovieParams) == 0:
2202 raise ExtractorError(u'unable to find Flash URL in webpage ' + url)
2204 mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]
2206 uri = mMovieParams[0][1]
2207 indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
2208 indexXml = self._download_webpage(indexUrl, epTitle,
2209 u'Downloading show index',
2210 u'unable to download episode index')
2214 idoc = xml.etree.ElementTree.fromstring(indexXml)
2215 itemEls = idoc.findall('.//item')
# Each <item> is one part of the episode; guid is like uri:...:<showid>.com:<id>.
2216 for partNum,itemEl in enumerate(itemEls):
2217 mediaId = itemEl.findall('./guid')[0].text
2218 shortMediaId = mediaId.split(':')[-1]
2219 showId = mediaId.split(':')[-2].replace('.com', '')
2220 officialTitle = itemEl.findall('./title')[0].text
2221 officialDate = unified_strdate(itemEl.findall('./pubDate')[0].text)
2223 configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
2224 compat_urllib_parse.urlencode({'uri': mediaId}))
2225 configXml = self._download_webpage(configUrl, epTitle,
2226 u'Downloading configuration for %s' % shortMediaId)
2228 cdoc = xml.etree.ElementTree.fromstring(configXml)
2230 for rendition in cdoc.findall('.//rendition'):
2231 finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
2235 self._downloader.report_error(u'unable to download ' + mediaId + ': No videos found')
2238 if self._downloader.params.get('listformats', None):
2239 self._print_formats([i[0] for i in turls])
2242 # For now, just pick the highest bitrate
2243 format,rtmp_video_url = turls[-1]
2245 # Get the format arg from the arg stream
2246 req_format = self._downloader.params.get('format', None)
2248 # Select format if we can find one
2251 format, rtmp_video_url = f, v
# Rewrite the RTMP path to the known HTTP mirror (mtvnmobile CDN).
2254 m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
2256 raise ExtractorError(u'Cannot transform RTMP url')
2257 base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
2258 video_url = base + m.group('finalid')
2260 effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
2265 'upload_date': officialDate,
2270 'description': officialTitle,
2272 results.append(info)
# Extractor for The Escapist: reads og: meta tags, follows the player's
# `config=` query parameter to a JS-flavored JSON config, and takes the
# second playlist entry as the video. NOTE(review): sampled listing with
# elided lines; code kept byte-identical, comments only.
2277 class EscapistIE(InfoExtractor):
2278 """Information extractor for The Escapist """
2280 _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
2281 IE_NAME = u'escapist'
2283 def _real_extract(self, url):
2284 mobj = re.match(self._VALID_URL, url)
2286 raise ExtractorError(u'Invalid URL: %s' % url)
2287 showName = mobj.group('showname')
2288 videoId = mobj.group('episode')
2290 self.report_extraction(showName)
2291 webPage = self._download_webpage(url, showName)
2293 descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
2294 description = unescapeHTML(descMatch.group(1))
2295 imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
2296 imgUrl = unescapeHTML(imgMatch.group(1))
2297 playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
2298 playerUrl = unescapeHTML(playerUrlMatch.group(1))
2299 configUrlMatch = re.search('config=(.*)$', playerUrl)
2300 configUrl = compat_urllib_parse.unquote(configUrlMatch.group(1))
2302 configJSON = self._download_webpage(configUrl, showName,
2303 u'Downloading configuration',
2304 u'unable to download configuration')
2306 # Technically, it's JavaScript, not JSON
2307 configJSON = configJSON.replace("'", '"')
2310 config = json.loads(configJSON)
2311 except (ValueError,) as err:
2312 raise ExtractorError(u'Invalid JSON in configuration file: ' + compat_str(err))
# Playlist index 1 holds the actual video entry (index 0 is presumably an
# ad or intro — TODO confirm).
2314 playlist = config['playlist']
2315 videoUrl = playlist[1]['url']
2320 'uploader': showName,
2321 'upload_date': None,
2324 'thumbnail': imgUrl,
2325 'description': description,
2326 'player_url': playerUrl,
# Extractor for collegehumor.com: fetches the moogaloop metadata XML, then
# the Adobe F4M manifest, and builds the final segment URL from manifest
# fields. NOTE(review): sampled listing with elided `try:`/info-dict lines;
# code kept byte-identical, comments only.
2331 class CollegeHumorIE(InfoExtractor):
2332 """Information extractor for collegehumor.com"""
2335 _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
2336 IE_NAME = u'collegehumor'
2338 def report_manifest(self, video_id):
2339 """Report information extraction."""
2340 self.to_screen(u'%s: Downloading XML manifest' % video_id)
2342 def _real_extract(self, url):
2343 mobj = re.match(self._VALID_URL, url)
2345 raise ExtractorError(u'Invalid URL: %s' % url)
2346 video_id = mobj.group('videoid')
2351 'upload_date': None,
2354 self.report_extraction(video_id)
2355 xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
2357 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
2358 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2359 raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
2361 mdoc = xml.etree.ElementTree.fromstring(metaXml)
2363 videoNode = mdoc.findall('./video')[0]
2364 info['description'] = videoNode.findall('./description')[0].text
2365 info['title'] = videoNode.findall('./caption')[0].text
2366 info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
2367 manifest_url = videoNode.findall('./file')[0].text
2369 raise ExtractorError(u'Invalid metadata XML file')
2371 manifest_url += '?hdcore=2.10.3'
2372 self.report_manifest(video_id)
2374 manifestXml = compat_urllib_request.urlopen(manifest_url).read()
2375 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2376 raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
# The F4M manifest uses the Adobe namespace; media/@url plus the id element
# are combined into the final Seg1-Frag1 URL below.
2378 adoc = xml.etree.ElementTree.fromstring(manifestXml)
2380 media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
2381 node_id = media_node.attrib['url']
2382 video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
2383 except IndexError as err:
2384 raise ExtractorError(u'Invalid manifest file')
2386 url_pr = compat_urllib_parse_urlparse(manifest_url)
2387 url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'
# Extractor for xvideos.com: scrapes flv_url, the <title>, and the thumbnail
# straight from the page HTML. NOTE(review): sampled listing with elided
# guard/return lines; code kept byte-identical, comments only.
2394 class XVideosIE(InfoExtractor):
2395 """Information extractor for xvideos.com"""
2397 _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
2398 IE_NAME = u'xvideos'
2400 def _real_extract(self, url):
2401 mobj = re.match(self._VALID_URL, url)
2403 raise ExtractorError(u'Invalid URL: %s' % url)
2404 video_id = mobj.group(1)
2406 webpage = self._download_webpage(url, video_id)
2408 self.report_extraction(video_id)
# The flash player receives the (url-encoded) video URL as flv_url=...
2412 mobj = re.search(r'flv_url=(.+?)&', webpage)
2414 raise ExtractorError(u'Unable to extract video url')
2415 video_url = compat_urllib_parse.unquote(mobj.group(1))
2419 mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
2421 raise ExtractorError(u'Unable to extract video title')
2422 video_title = mobj.group(1)
2425 # Extract video thumbnail
2426 mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
2428 raise ExtractorError(u'Unable to extract video thumbnail')
2429 video_thumbnail = mobj.group(0)
2435 'upload_date': None,
2436 'title': video_title,
2438 'thumbnail': video_thumbnail,
2439 'description': None,
# Extractor for a single soundcloud.com track: resolves the pretty URL via
# the public resolve.json API, then fetches stream definitions and picks the
# 128kbps MP3 HTTP stream. NOTE(review): sampled listing with elided lines;
# code kept byte-identical, comments only.
2445 class SoundcloudIE(InfoExtractor):
2446 """Information extractor for soundcloud.com
2447 To access the media, the uid of the song and a stream token
2448 must be extracted from the page source and the script must make
2449 a request to media.soundcloud.com/crossdomain.xml. Then
2450 the media can be grabbed by requesting from an url composed
2451 of the stream token and uid
2454 _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
2455 IE_NAME = u'soundcloud'
2457 def report_resolve(self, video_id):
2458 """Report information extraction."""
2459 self.to_screen(u'%s: Resolving id' % video_id)
2461 def _real_extract(self, url):
2462 mobj = re.match(self._VALID_URL, url)
2464 raise ExtractorError(u'Invalid URL: %s' % url)
2466 # extract uploader (which is in the url)
2467 uploader = mobj.group(1)
2468 # extract simple title (uploader + slug of song title)
2469 slug_title = mobj.group(2)
2470 simple_title = uploader + u'-' + slug_title
2471 full_title = '%s/%s' % (uploader, slug_title)
2473 self.report_resolve(full_title)
# resolve.json maps the human-readable URL to the numeric track id.
2475 url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
2476 resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2477 info_json = self._download_webpage(resolv_url, full_title, u'Downloading info JSON')
2479 info = json.loads(info_json)
2480 video_id = info['id']
2481 self.report_extraction(full_title)
2483 streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2484 stream_json = self._download_webpage(streams_url, full_title,
2485 u'Downloading stream definitions',
2486 u'unable to download stream definitions')
2488 streams = json.loads(stream_json)
2489 mediaURL = streams['http_mp3_128_url']
2490 upload_date = unified_strdate(info['created_at'])
2495 'uploader': info['user']['username'],
2496 'upload_date': upload_date,
2497 'title': info['title'],
2499 'description': info['description'],
# Extractor for soundcloud.com sets (playlists): resolves the set URL, then
# iterates its tracks and fetches each track's 128kbps MP3 stream URL.
# NOTE(review): sampled listing with elided lines (e.g. return statements);
# code kept byte-identical, comments only.
2502 class SoundcloudSetIE(InfoExtractor):
2503 """Information extractor for soundcloud.com sets
2504 To access the media, the uid of the song and a stream token
2505 must be extracted from the page source and the script must make
2506 a request to media.soundcloud.com/crossdomain.xml. Then
2507 the media can be grabbed by requesting from an url composed
2508 of the stream token and uid
2511 _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/sets/([\w\d-]+)'
2512 IE_NAME = u'soundcloud:set'
2514 def report_resolve(self, video_id):
2515 """Report information extraction."""
2516 self.to_screen(u'%s: Resolving id' % video_id)
2518 def _real_extract(self, url):
2519 mobj = re.match(self._VALID_URL, url)
2521 raise ExtractorError(u'Invalid URL: %s' % url)
2523 # extract uploader (which is in the url)
2524 uploader = mobj.group(1)
2525 # extract simple title (uploader + slug of song title)
2526 slug_title = mobj.group(2)
2527 simple_title = uploader + u'-' + slug_title
2528 full_title = '%s/sets/%s' % (uploader, slug_title)
2530 self.report_resolve(full_title)
2532 url = 'http://soundcloud.com/%s/sets/%s' % (uploader, slug_title)
2533 resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2534 info_json = self._download_webpage(resolv_url, full_title)
2537 info = json.loads(info_json)
# The resolve API reports problems in an 'errors' list rather than HTTP codes.
2538 if 'errors' in info:
2539 for err in info['errors']:
2540 self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err['error_message']))
2543 self.report_extraction(full_title)
2544 for track in info['tracks']:
2545 video_id = track['id']
2547 streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2548 stream_json = self._download_webpage(streams_url, video_id, u'Downloading track info JSON')
2550 self.report_extraction(video_id)
2551 streams = json.loads(stream_json)
2552 mediaURL = streams['http_mp3_128_url']
2557 'uploader': track['user']['username'],
2558 'upload_date': unified_strdate(track['created_at']),
2559 'title': track['title'],
2561 'description': track['description'],
# Extractor for infoq.com: decodes the base64 `jsclassref` page variable into
# an RTMPE path and scrapes title/description from the page. NOTE(review):
# sampled listing with elided guard/return lines; code kept byte-identical,
# comments only.
2566 class InfoQIE(InfoExtractor):
2567 """Information extractor for infoq.com"""
2568 _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
2570 def _real_extract(self, url):
2571 mobj = re.match(self._VALID_URL, url)
2573 raise ExtractorError(u'Invalid URL: %s' % url)
2575 webpage = self._download_webpage(url, video_id=url)
2576 self.report_extraction(url)
# jsclassref holds the base64+url-encoded real media id.
2579 mobj = re.search(r"jsclassref ?= ?'([^']*)'", webpage)
2581 raise ExtractorError(u'Unable to extract video url')
2582 real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
2583 video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id
2586 mobj = re.search(r'contentTitle = "(.*?)";', webpage)
2588 raise ExtractorError(u'Unable to extract video title')
2589 video_title = mobj.group(1)
2591 # Extract description
2592 video_description = u'No description available.'
2593 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
2594 if mobj is not None:
2595 video_description = mobj.group(1)
# Derive the id/extension from the media file name in the RTMP path.
2597 video_filename = video_url.split('/')[-1]
2598 video_id, extension = video_filename.split('.')
2604 'upload_date': None,
2605 'title': video_title,
2606 'ext': extension, # Extension is always(?) mp4, but seems to be flv
2608 'description': video_description,
# Extractor for www.mixcloud.com (marked broken via _WORKING = False): reads
# the site's JSON API, picks a format/bitrate, and probes candidate URLs for
# the first live one. NOTE(review): sampled listing with elided `try:`/loop/
# return lines; code kept byte-identical, comments only.
2613 class MixcloudIE(InfoExtractor):
2614 """Information extractor for www.mixcloud.com"""
2616 _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
2617 _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
2618 IE_NAME = u'mixcloud'
2620 def report_download_json(self, file_id):
2621 """Report JSON download."""
2622 self.to_screen(u'Downloading json')
2624 def get_urls(self, jsonData, fmt, bitrate='best'):
2625 """Get urls from 'audio_formats' section in json"""
2628 bitrate_list = jsonData[fmt]
2629 if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
2630 bitrate = max(bitrate_list) # select highest
2632 url_list = jsonData[fmt][bitrate]
2633 except TypeError: # we have no bitrate info.
2634 url_list = jsonData[fmt]
2637 def check_urls(self, url_list):
2638 """Returns 1st active url from list"""
2639 for url in url_list:
2641 compat_urllib_request.urlopen(url)
2643 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2648 def _print_formats(self, formats):
2649 print('Available formats:')
2650 for fmt in formats.keys():
2651 for b in formats[fmt]:
2653 ext = formats[fmt][b][0]
2654 print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
2655 except TypeError: # we have no bitrate info
2656 ext = formats[fmt][0]
2657 print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
2660 def _real_extract(self, url):
2661 mobj = re.match(self._VALID_URL, url)
2663 raise ExtractorError(u'Invalid URL: %s' % url)
2664 # extract uploader & filename from url
2665 uploader = mobj.group(1).decode('utf-8')
2666 file_id = uploader + "-" + mobj.group(2).decode('utf-8')
2668 # construct API request
2669 file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
2670 # retrieve .json file with links to files
2671 request = compat_urllib_request.Request(file_url)
2673 self.report_download_json(file_url)
2674 jsonData = compat_urllib_request.urlopen(request).read()
2675 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2676 raise ExtractorError(u'Unable to retrieve file: %s' % compat_str(err))
2679 json_data = json.loads(jsonData)
2680 player_url = json_data['player_swf_url']
2681 formats = dict(json_data['audio_formats'])
2683 req_format = self._downloader.params.get('format', None)
2686 if self._downloader.params.get('listformats', None):
2687 self._print_formats(formats)
# 'best' (or no preference): try each format until a live URL is found;
# otherwise honor the explicitly requested format.
2690 if req_format is None or req_format == 'best':
2691 for format_param in formats.keys():
2692 url_list = self.get_urls(formats, format_param)
2694 file_url = self.check_urls(url_list)
2695 if file_url is not None:
2698 if req_format not in formats:
2699 raise ExtractorError(u'Format is not available')
2701 url_list = self.get_urls(formats, req_format)
2702 file_url = self.check_urls(url_list)
2703 format_param = req_format
2706 'id': file_id.decode('utf-8'),
2707 'url': file_url.decode('utf-8'),
2708 'uploader': uploader.decode('utf-8'),
2709 'upload_date': None,
2710 'title': json_data['name'],
2711 'ext': file_url.split('.')[-1].decode('utf-8'),
2712 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
2713 'thumbnail': json_data['thumbnail_url'],
2714 'description': json_data['description'],
2715 'player_url': player_url.decode('utf-8'),
# Extractor for Stanford Open ClassRoom: handles three URL shapes — a single
# video (course+video), a course page (list of video references), and the
# root page (list of course references). List entries are recursively fed
# back through self.extract(). NOTE(review): sampled listing with elided
# `try:`/loop/return lines; code kept byte-identical, comments only.
2718 class StanfordOpenClassroomIE(InfoExtractor):
2719 """Information extractor for Stanford's Open ClassRoom"""
2721 _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
2722 IE_NAME = u'stanfordoc'
2724 def _real_extract(self, url):
2725 mobj = re.match(self._VALID_URL, url)
2727 raise ExtractorError(u'Invalid URL: %s' % url)
2729 if mobj.group('course') and mobj.group('video'): # A specific video
2730 course = mobj.group('course')
2731 video = mobj.group('video')
2733 'id': course + '_' + video,
2735 'upload_date': None,
2738 self.report_extraction(info['id'])
2739 baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
2740 xmlUrl = baseUrl + video + '.xml'
2742 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
2743 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2744 raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
2745 mdoc = xml.etree.ElementTree.fromstring(metaXml)
2747 info['title'] = mdoc.findall('./title')[0].text
2748 info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
2750 raise ExtractorError(u'Invalid metadata XML file')
2751 info['ext'] = info['url'].rpartition('.')[2]
2753 elif mobj.group('course'): # A course page
2754 course = mobj.group('course')
2759 'upload_date': None,
2762 coursepage = self._download_webpage(url, info['id'],
2763 note='Downloading course info page',
2764 errnote='Unable to download course info page')
2766 m = re.search('<h1>([^<]+)</h1>', coursepage)
2768 info['title'] = unescapeHTML(m.group(1))
2770 info['title'] = info['id']
2772 m = re.search('<description>([^<]+)</description>', coursepage)
2774 info['description'] = unescapeHTML(m.group(1))
2776 links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
2779 'type': 'reference',
2780 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
# Each reference entry is resolved recursively via the extractor entry point.
2784 for entry in info['list']:
2785 assert entry['type'] == 'reference'
2786 results += self.extract(entry['url'])
2790 'id': 'Stanford OpenClassroom',
2793 'upload_date': None,
2796 self.report_download_webpage(info['id'])
2797 rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
2799 rootpage = compat_urllib_request.urlopen(rootURL).read()
2800 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2801 raise ExtractorError(u'Unable to download course info page: ' + compat_str(err))
2803 info['title'] = info['id']
2805 links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
2808 'type': 'reference',
2809 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
2814 for entry in info['list']:
2815 assert entry['type'] == 'reference'
2816 results += self.extract(entry['url'])
# Extractor for MTV.com: scrapes song/performer/uri/content-id meta tags,
# fetches the mediaGen XML, and takes the last (highest-quality) rendition.
# NOTE(review): sampled listing with elided guard/`try:`/return lines; code
# kept byte-identical, comments only.
2819 class MTVIE(InfoExtractor):
2820 """Information extractor for MTV.com"""
2822 _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
2825 def _real_extract(self, url):
2826 mobj = re.match(self._VALID_URL, url)
2828 raise ExtractorError(u'Invalid URL: %s' % url)
2829 if not mobj.group('proto'):
2830 url = 'http://' + url
2831 video_id = mobj.group('videoid')
2833 webpage = self._download_webpage(url, video_id)
2835 mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
2837 raise ExtractorError(u'Unable to extract song name')
2838 song_name = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
2839 mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
2841 raise ExtractorError(u'Unable to extract performer')
2842 performer = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
2843 video_title = performer + ' - ' + song_name
2845 mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
2847 raise ExtractorError(u'Unable to mtvn_uri')
2848 mtvn_uri = mobj.group(1)
2850 mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
2852 raise ExtractorError(u'Unable to extract content id')
2853 content_id = mobj.group(1)
2855 videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
2856 self.report_extraction(video_id)
2857 request = compat_urllib_request.Request(videogen_url)
2859 metadataXml = compat_urllib_request.urlopen(request).read()
2860 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2861 raise ExtractorError(u'Unable to download video metadata: %s' % compat_str(err))
2863 mdoc = xml.etree.ElementTree.fromstring(metadataXml)
2864 renditions = mdoc.findall('.//rendition')
2866 # For now, always pick the highest quality.
2867 rendition = renditions[-1]
# Format label is built from MIME subtype + dimensions + bitrate.
2870 _,_,ext = rendition.attrib['type'].partition('/')
2871 format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
2872 video_url = rendition.find('./src').text
2874 raise ExtractorError('Invalid rendition field.')
2879 'uploader': performer,
2880 'upload_date': None,
2881 'title': video_title,
# Extractor for v.youku.com: fetches the getPlayList JSON, descrambles the
# seed-based file id, and emits one info dict per video segment.
# NOTE(review): sampled listing — the `def` lines for _gen_sid and parts of
# other helpers are elided; code kept byte-identical, comments only.
2889 class YoukuIE(InfoExtractor):
2890 _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'
# Session id: millisecond timestamp plus two random suffixes.
2893 nowTime = int(time.time() * 1000)
2894 random1 = random.randint(1000,1998)
2895 random2 = random.randint(1000,9999)
2897 return "%d%d%d" %(nowTime,random1,random2)
2899 def _get_file_ID_mix_string(self, seed):
# Deterministic pseudo-random shuffle of the alphabet, driven by the seed
# delivered with the playlist (a linear-congruential step per character).
2901 source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
2903 for i in range(len(source)):
2904 seed = (seed * 211 + 30031 ) % 65536
2905 index = math.floor(seed / 65536 * len(source) )
2906 mixed.append(source[int(index)])
2907 source.remove(source[int(index)])
2908 #return ''.join(mixed)
2911 def _get_file_id(self, fileId, seed):
2912 mixed = self._get_file_ID_mix_string(seed)
2913 ids = fileId.split('*')
2917 realId.append(mixed[int(ch)])
2918 return ''.join(realId)
2920 def _real_extract(self, url):
2921 mobj = re.match(self._VALID_URL, url)
2923 raise ExtractorError(u'Invalid URL: %s' % url)
2924 video_id = mobj.group('ID')
2926 info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id
2928 jsondata = self._download_webpage(info_url, video_id)
2930 self.report_extraction(video_id)
2932 config = json.loads(jsondata)
2934 video_title = config['data'][0]['title']
2935 seed = config['data'][0]['seed']
2937 format = self._downloader.params.get('format', None)
2938 supported_format = list(config['data'][0]['streamfileids'].keys())
2940 if format is None or format == 'best':
2941 if 'hd2' in supported_format:
2946 elif format == 'worst':
2954 fileid = config['data'][0]['streamfileids'][format]
2955 keys = [s['k'] for s in config['data'][0]['segs'][format]]
2956 except (UnicodeDecodeError, ValueError, KeyError):
2957 raise ExtractorError(u'Unable to extract info section')
2960 sid = self._gen_sid()
2961 fileid = self._get_file_id(fileid, seed)
2963 #column 8,9 of fileid represent the segment number
2964 #fileid[7:9] should be changed
2965 for index, key in enumerate(keys):
2967 temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
2968 download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)
2971 'id': '%s_part%02d' % (video_id, index),
2972 'url': download_url,
2974 'upload_date': None,
2975 'title': video_title,
2978 files_info.append(info)
# Extractor for video.xnxx.com: scrapes flv_url, title and thumbnail with the
# class-level regex constants. NOTE(review): sampled listing with elided
# guard/return lines; code kept byte-identical, comments only.
2983 class XNXXIE(InfoExtractor):
2984 """Information extractor for xnxx.com"""
2986 _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
2988 VIDEO_URL_RE = r'flv_url=(.*?)&'
2989 VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
2990 VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&'
2992 def _real_extract(self, url):
2993 mobj = re.match(self._VALID_URL, url)
2995 raise ExtractorError(u'Invalid URL: %s' % url)
2996 video_id = mobj.group(1)
2998 # Get webpage content
2999 webpage = self._download_webpage(url, video_id)
3001 result = re.search(self.VIDEO_URL_RE, webpage)
3003 raise ExtractorError(u'Unable to extract video url')
# flv_url is url-encoded inside the player parameters.
3004 video_url = compat_urllib_parse.unquote(result.group(1))
3006 result = re.search(self.VIDEO_TITLE_RE, webpage)
3008 raise ExtractorError(u'Unable to extract video title')
3009 video_title = result.group(1)
3011 result = re.search(self.VIDEO_THUMB_RE, webpage)
3013 raise ExtractorError(u'Unable to extract video thumbnail')
3014 video_thumbnail = result.group(1)
3020 'upload_date': None,
3021 'title': video_title,
3023 'thumbnail': video_thumbnail,
3024 'description': None,
# Extractor for public Google+ posts containing a video: scrapes the post
# page for date/uploader/title, follows the embedded photo-viewer page,
# then picks the highest-resolution googlevideo redirector link.
3028 class GooglePlusIE(InfoExtractor):
3029 """Information extractor for plus.google.com."""
3031 _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
3032 IE_NAME = u'plus.google'
3034 def report_extract_entry(self, url):
3035 """Report downloading entry."""
3036 self.to_screen(u'Downloading entry: %s' % url)
3038 def report_date(self, upload_date):
3039 """Report the entry's upload date."""
3040 self.to_screen(u'Entry date: %s' % upload_date)
3042 def report_uploader(self, uploader):
3043 """Report the entry's uploader."""
3044 self.to_screen(u'Uploader: %s' % uploader)
3046 def report_title(self, video_title):
3047 """Report the entry's title."""
3048 self.to_screen(u'Title: %s' % video_title)
3050 def report_extract_vid_page(self, video_page):
3051 """Report information extraction."""
3052 self.to_screen(u'Extracting video page: %s' % video_page)
3054 def _real_extract(self, url):
3055 # Extract id from URL
3056 mobj = re.match(self._VALID_URL, url)
3058 raise ExtractorError(u'Invalid URL: %s' % url)
3060 post_url = mobj.group(0)
3061 video_id = mobj.group(1)
3063 video_extension = 'flv'
3065 # Step 1, Retrieve post webpage to extract further information
3066 self.report_extract_entry(post_url)
3067 webpage = self._download_webpage(post_url, video_id, u'Downloading entry webpage')
3069 # Extract update date
3071 pattern = 'title="Timestamp">(.*?)</a>'
3072 mobj = re.search(pattern, webpage)
3074 upload_date = mobj.group(1)
3075 # Convert timestring to a format suitable for filename
3076 upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
3077 upload_date = upload_date.strftime('%Y%m%d')
3078 self.report_date(upload_date)
# Uploader is the text of the rel="author" anchor on the post page.
3082 pattern = r'rel\="author".*?>(.*?)</a>'
3083 mobj = re.search(pattern, webpage)
3085 uploader = mobj.group(1)
3086 self.report_uploader(uploader)
3089 # Get the first line for title
3091 pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
3092 mobj = re.search(pattern, webpage)
3094 video_title = mobj.group(1)
3095 self.report_title(video_title)
3097 # Step 2, Stimulate clicking the image box to launch video
3098 pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
3099 mobj = re.search(pattern, webpage)
3101 raise ExtractorError(u'Unable to extract video page URL')
3103 video_page = mobj.group(1)
3104 webpage = self._download_webpage(video_page, video_id, u'Downloading video page')
3105 self.report_extract_vid_page(video_page)
3108 # Extract video links on video page
3109 """Extract video links of all sizes"""
3110 pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
3111 mobj = re.findall(pattern, webpage)
3113 raise ExtractorError(u'Unable to extract video links')
3115 # Sort in resolution
3116 links = sorted(mobj)
3118 # Choose the lowest of the sort, i.e. highest resolution
3119 video_url = links[-1]
3120 # Only get the url. The resolution part in the tuple has no use anymore
3121 video_url = video_url[-1]
# Python 2 str has .decode; on Python 3 the AttributeError path re-encodes
# to bytes first so unicode-escape decoding works on both versions.
3122 # Treat escaped \u0026 style hex
3124 video_url = video_url.decode("unicode_escape")
3125 except AttributeError: # Python 3
3126 video_url = bytes(video_url, 'ascii').decode('unicode-escape')
3132 'uploader': uploader,
3133 'upload_date': upload_date,
3134 'title': video_title,
3135 'ext': video_extension,
# Extractor for nba.com videos: the media file lives at a predictable
# Turner CDN URL derived from the page path, so only metadata is scraped.
3138 class NBAIE(InfoExtractor):
3139 _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*)(\?.*)?$'
3142 def _real_extract(self, url):
3143 mobj = re.match(self._VALID_URL, url)
3145 raise ExtractorError(u'Invalid URL: %s' % url)
3147 video_id = mobj.group(1)
# Normalize directory-style URLs so the id maps onto the CDN path.
3148 if video_id.endswith('/index.html'):
3149 video_id = video_id[:-len('/index.html')]
3151 webpage = self._download_webpage(url, video_id)
3153 video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'
# Closure helper: first regex group from the page, HTML-unescaped,
# or `default` when the pattern does not match.
3154 def _findProp(rexp, default=None):
3155 m = re.search(rexp, webpage)
3157 return unescapeHTML(m.group(1))
3161 shortened_video_id = video_id.rpartition('/')[2]
3162 title = _findProp(r'<meta property="og:title" content="(.*?)"', shortened_video_id).replace('NBA.com: ', '')
3164 'id': shortened_video_id,
# NOTE(review): key is spelled 'uploader_date'; the field documented
# elsewhere in this file is 'upload_date' — looks like a typo, confirm.
3168 'uploader_date': _findProp(r'<b>Date:</b> (.*?)</div>'),
3169 'description': _findProp(r'<div class="description">(.*?)</h1>'),
# Extractor for justin.tv / twitch.tv: channels are fetched via the paged
# JSON archive API, chapters via an XML lookup plus the Kraken JSON API,
# and single broadcasts via the by_archive JSON endpoint.
3173 class JustinTVIE(InfoExtractor):
3174 """Information extractor for justin.tv and twitch.tv"""
3175 # TODO: One broadcast may be split into multiple videos. The key
3176 # 'broadcast_id' is the same for all parts, and 'broadcast_part'
3177 # starts at 1 and increases. Can we treat all parts as one video?
3179 _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
3181 (?P<channelid>[^/]+)|
3182 (?:(?:[^/]+)/b/(?P<videoid>[^/]+))|
3183 (?:(?:[^/]+)/c/(?P<chapterid>[^/]+))
# API page size for channel archive listings.
3187 _JUSTIN_PAGE_LIMIT = 100
3188 IE_NAME = u'justin.tv'
3190 def report_download_page(self, channel, offset):
3191 """Report attempt to download a single page of videos."""
3192 self.to_screen(u'%s: Downloading video information from %d to %d' %
3193 (channel, offset, offset + self._JUSTIN_PAGE_LIMIT))
3195 # Return count of items, list of *valid* items
3196 def _parse_page(self, url, video_id):
3197 webpage = self._download_webpage(url, video_id,
3198 u'Downloading video info JSON',
3199 u'unable to download video info JSON')
# A non-list response is the API's error envelope (a dict with 'error').
3201 response = json.loads(webpage)
3202 if type(response) != list:
3203 error_text = response.get('error', 'unknown error')
3204 raise ExtractorError(u'Justin.tv API: %s' % error_text)
3206 for clip in response:
3207 video_url = clip['video_file_url']
# start_time looks like an ISO date; strip the dashes for YYYYMMDD.
3209 video_extension = os.path.splitext(video_url)[1][1:]
3210 video_date = re.sub('-', '', clip['start_time'][:10])
3211 video_uploader_id = clip.get('user_id', clip.get('channel_id'))
3212 video_id = clip['id']
3213 video_title = clip.get('title', video_id)
3217 'title': video_title,
3218 'uploader': clip.get('channel_name', video_uploader_id),
3219 'uploader_id': video_uploader_id,
3220 'upload_date': video_date,
3221 'ext': video_extension,
3223 return (len(response), info)
3225 def _real_extract(self, url):
3226 mobj = re.match(self._VALID_URL, url)
3228 raise ExtractorError(u'invalid URL: %s' % url)
3230 api_base = 'http://api.justin.tv'
# Channel URL: page through the whole archive below.
3232 if mobj.group('channelid'):
3234 video_id = mobj.group('channelid')
3235 api = api_base + '/channel/archives/%s.json' % video_id
# Chapter URL: resolve the chapter to its parent archive first.
3236 elif mobj.group('chapterid'):
3237 chapter_id = mobj.group('chapterid')
3239 webpage = self._download_webpage(url, chapter_id)
3240 m = re.search(r'PP\.archive_id = "([0-9]+)";', webpage)
3242 raise ExtractorError(u'Cannot find archive of a chapter')
3243 archive_id = m.group(1)
3245 api = api_base + '/broadcast/by_chapter/%s.xml' % chapter_id
3246 chapter_info_xml = self._download_webpage(api, chapter_id,
3247 note=u'Downloading chapter information',
3248 errnote=u'Chapter information download failed')
# Find the <archive> element whose <id> matches the page's archive_id.
3249 doc = xml.etree.ElementTree.fromstring(chapter_info_xml)
3250 for a in doc.findall('.//archive'):
3251 if archive_id == a.find('./id').text:
3254 raise ExtractorError(u'Could not find chapter in chapter information')
3256 video_url = a.find('./video_file_url').text
3257 video_ext = video_url.rpartition('.')[2] or u'flv'
# Chapter metadata (title/thumbnail/uploader) comes from the newer
# Twitch Kraken API rather than the XML.
3259 chapter_api_url = u'https://api.twitch.tv/kraken/videos/c' + chapter_id
3260 chapter_info_json = self._download_webpage(chapter_api_url, u'c' + chapter_id,
3261 note='Downloading chapter metadata',
3262 errnote='Download of chapter metadata failed')
3263 chapter_info = json.loads(chapter_info_json)
3265 bracket_start = int(doc.find('.//bracket_start').text)
3266 bracket_end = int(doc.find('.//bracket_end').text)
3268 # TODO determine start (and probably fix up file)
3269 # youtube-dl -v http://www.twitch.tv/firmbelief/c/1757457
3270 #video_url += u'?start=' + TODO:start_timestamp
3271 # bracket_start is 13290, but we want 51670615
3272 self._downloader.report_warning(u'Chapter detected, but we can just download the whole file. '
3273 u'Chapter starts at %s and ends at %s' % (formatSeconds(bracket_start), formatSeconds(bracket_end)))
3276 'id': u'c' + chapter_id,
3279 'title': chapter_info['title'],
3280 'thumbnail': chapter_info['preview'],
3281 'description': chapter_info['description'],
3282 'uploader': chapter_info['channel']['display_name'],
3283 'uploader_id': chapter_info['channel']['name'],
# Single broadcast URL (the /b/ form).
3287 video_id = mobj.group('videoid')
3288 api = api_base + '/broadcast/by_archive/%s.json' % video_id
3290 self.report_extraction(video_id)
# Page through the API; a short page (count != limit) ends the loop.
3294 limit = self._JUSTIN_PAGE_LIMIT
3297 self.report_download_page(video_id, offset)
3298 page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
3299 page_count, page_info = self._parse_page(page_url, video_id)
3300 info.extend(page_info)
3301 if not paged or page_count != limit:
# Extractor for funnyordie.com: video URL from the second <source> tag,
# title from the player header with a <title>-tag fallback.
3306 class FunnyOrDieIE(InfoExtractor):
3307 _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'
3309 def _real_extract(self, url):
3310 mobj = re.match(self._VALID_URL, url)
3312 raise ExtractorError(u'invalid URL: %s' % url)
3314 video_id = mobj.group('id')
3315 webpage = self._download_webpage(url, video_id)
3317 m = re.search(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"', webpage, re.DOTALL)
3319 raise ExtractorError(u'Unable to find video information')
3320 video_url = unescapeHTML(m.group('url'))
# Preferred title source is the player page <h1>; fall back to <title>.
3322 m = re.search(r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>", webpage, flags=re.DOTALL)
3324 m = re.search(r'<title>(?P<title>[^<]+?)</title>', webpage)
3326 raise ExtractorError(u'Cannot find video title')
3327 title = clean_html(m.group('title'))
3329 m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
3331 desc = unescapeHTML(m.group('desc'))
3340 'description': desc,
# Extractor for Steam store pages: bypasses the age gate with fixed form
# values, then zips movie-URL / title / thumbnail regex hits into a
# playlist of the game's trailer videos.
3344 class SteamIE(InfoExtractor):
3345 _VALID_URL = r"""http://store\.steampowered\.com/
3347 (?P<urltype>video|app)/ #If the page is only for videos or for a game
3349 (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
# _VALID_URL is a verbose regex, so matching here needs re.VERBOSE —
# hence the override of the base-class suitable().
3353 def suitable(cls, url):
3354 """Receives a URL and returns True if suitable for this IE."""
3355 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
3357 def _real_extract(self, url):
3358 m = re.match(self._VALID_URL, url, re.VERBOSE)
3359 gameID = m.group('gameID')
# Request the agecheck variant with a fixed 1970 birth date to skip
# the age gate.
3360 videourl = 'http://store.steampowered.com/agecheck/video/%s/?snr=1_agecheck_agecheck__age-gate&ageDay=1&ageMonth=January&ageYear=1970' % gameID
3361 self.report_age_confirmation()
3362 webpage = self._download_webpage(videourl, gameID)
3363 game_title = re.search(r'<h2 class="pageheader">(?P<game_title>.*?)</h2>', webpage).group('game_title')
3365 urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
3366 mweb = re.finditer(urlRE, webpage)
3367 namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
3368 titles = re.finditer(namesRE, webpage)
3369 thumbsRE = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'
3370 thumbs = re.finditer(thumbsRE, webpage)
# zip() pairs the three independent scans; this assumes the page lists
# movies, titles and thumbs in the same order — TODO confirm.
3372 for vid,vtitle,thumb in zip(mweb,titles,thumbs):
3373 video_id = vid.group('videoID')
3374 title = vtitle.group('videoName')
3375 video_url = vid.group('videoURL')
3376 video_thumb = thumb.group('thumbnail')
3378 raise ExtractorError(u'Cannot find video url for %s' % video_id)
3383 'title': unescapeHTML(title),
3384 'thumbnail': video_thumb
3387 return [self.playlist_result(videos, gameID, game_title)]
# Extractor for ustream.tv recordings: the media URL is derived directly
# from the numeric video id; title and uploader are scraped from the page.
3389 class UstreamIE(InfoExtractor):
3390 _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
3391 IE_NAME = u'ustream'
3393 def _real_extract(self, url):
3394 m = re.match(self._VALID_URL, url)
3395 video_id = m.group('videoID')
3396 video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
3397 webpage = self._download_webpage(url, video_id)
3398 m = re.search(r'data-title="(?P<title>.+)"',webpage)
3399 title = m.group('title')
# "uploader" here is the numeric channel content id from the state
# anchor, not a display name — presumably resolved downstream; confirm.
3400 m = re.search(r'<a class="state" data-content-type="channel" data-content-id="(?P<uploader>\d+)"',webpage)
3401 uploader = m.group('uploader')
3407 'uploader': uploader
# Extractor for worldstarhiphop.com / worldstarcandy.com: video URL comes
# from the flash player's addVariable("file", ...) call; candy pages get
# their title re-read from a dedicated span.
3411 class WorldStarHipHopIE(InfoExtractor):
3412 _VALID_URL = r'https?://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
3413 IE_NAME = u'WorldStarHipHop'
3415 def _real_extract(self, url):
# Pattern for the flash-player source assignment.
3416 _src_url = r'so\.addVariable\("file","(.*?)"\)'
3418 m = re.match(self._VALID_URL, url)
3419 video_id = m.group('id')
3421 webpage_src = self._download_webpage(url, video_id)
3423 mobj = re.search(_src_url, webpage_src)
3425 if mobj is not None:
3426 video_url = mobj.group(1)
# Extension is chosen by sniffing 'mp4' in the URL (flv otherwise,
# per the elided branch — TODO confirm against full file).
3427 if 'mp4' in video_url:
3432 raise ExtractorError(u'Cannot find video url for %s' % video_id)
3434 mobj = re.search(r"<title>(.*)</title>", webpage_src)
3437 raise ExtractorError(u'Cannot determine title')
3438 title = mobj.group(1)
3440 mobj = re.search(r'rel="image_src" href="(.*)" />', webpage_src)
3441 # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video.
3442 if mobj is not None:
3443 thumbnail = mobj.group(1)
3445 _title = r"""candytitles.*>(.*)</span>"""
3446 mobj = re.search(_title, webpage_src)
3447 if mobj is not None:
3448 title = mobj.group(1)
3455 'thumbnail' : thumbnail,
# Extractor for rbmaradio.com shows: all metadata lives in a JSON blob
# assigned to gon.show inside an inline <script>.
3460 class RBMARadioIE(InfoExtractor):
3461 _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'
3463 def _real_extract(self, url):
3464 m = re.match(self._VALID_URL, url)
3465 video_id = m.group('videoID')
3467 webpage = self._download_webpage(url, video_id)
3468 m = re.search(r'<script>window.gon = {.*?};gon\.show=(.+?);</script>', webpage)
3470 raise ExtractorError(u'Cannot find metadata')
3471 json_data = m.group(1)
3474 data = json.loads(json_data)
3475 except ValueError as e:
3476 raise ExtractorError(u'Invalid JSON: ' + str(e))
# Force the 256 kbps constant-bitrate stream variant.
3478 video_url = data['akamai_url'] + '&cbr=256'
3479 url_parts = compat_urllib_parse_urlparse(video_url)
3480 video_ext = url_parts.path.rpartition('.')[2]
# Optional fields use dict.get so missing JSON keys become None
# instead of raising.
3485 'title': data['title'],
3486 'description': data.get('teaser_text'),
3487 'location': data.get('country_of_origin'),
3488 'uploader': data.get('host', {}).get('name'),
3489 'uploader_id': data.get('host', {}).get('slug'),
3490 'thumbnail': data.get('image', {}).get('large_url_2x'),
3491 'duration': data.get('duration'),
# Extractor for youporn.com: sets the age_verified cookie, scrapes the
# download list for all format links, and applies the downloader's
# format selection (best/worst/all/specific).
3496 class YouPornIE(InfoExtractor):
3497 """Information extractor for youporn.com."""
3498 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'
3500 def _print_formats(self, formats):
3501 """Print all available formats"""
3502 print(u'Available formats:')
3503 print(u'ext\t\tformat')
3504 print(u'---------------------------------')
3505 for format in formats:
3506 print(u'%s\t\t%s' % (format['ext'], format['format']))
# Return the entry matching a specific requested format string.
3508 def _specific(self, req_format, formats):
3510 if(x["format"]==req_format):
3514 def _real_extract(self, url):
3515 mobj = re.match(self._VALID_URL, url)
3517 raise ExtractorError(u'Invalid URL: %s' % url)
3519 video_id = mobj.group('videoid')
# The age gate is bypassed with a cookie rather than a form post.
3521 req = compat_urllib_request.Request(url)
3522 req.add_header('Cookie', 'age_verified=1')
3523 webpage = self._download_webpage(req, video_id)
3525 # Get the video title
3526 result = re.search(r'<h1.*?>(?P<title>.*)</h1>', webpage)
3528 raise ExtractorError(u'Unable to extract video title')
3529 video_title = result.group('title').strip()
# Date and uploader are best-effort: a miss is only a warning.
3531 # Get the video date
3532 result = re.search(r'Date:</label>(?P<date>.*) </li>', webpage)
3534 self._downloader.report_warning(u'unable to extract video date')
3537 upload_date = unified_strdate(result.group('date').strip())
3539 # Get the video uploader
3540 result = re.search(r'Submitted:</label>(?P<uploader>.*)</li>', webpage)
3542 self._downloader.report_warning(u'unable to extract uploader')
3543 video_uploader = None
3545 video_uploader = result.group('uploader').strip()
3546 video_uploader = clean_html( video_uploader )
3548 # Get all of the formats available
3549 DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
3550 result = re.search(DOWNLOAD_LIST_RE, webpage)
3552 raise ExtractorError(u'Unable to extract download list')
3553 download_list_html = result.group('download_list').strip()
3555 # Get all of the links from the page
3556 LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
3557 links = re.findall(LINK_RE, download_list_html)
3558 if(len(links) == 0):
3559 raise ExtractorError(u'ERROR: no known formats available for video')
3561 self.to_screen(u'Links found: %d' % len(links))
3566 # A link looks like this:
3567 # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
3568 # A path looks like this:
3569 # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
3570 video_url = unescapeHTML( link )
3571 path = compat_urllib_parse_urlparse( video_url ).path
3572 extension = os.path.splitext( path )[1][1:]
# Path segment 4 encodes resolution and bitrate (e.g. 480p_370k_...).
3573 format = path.split('/')[4].split('_')[:2]
3576 format = "-".join( format )
3577 title = u'%s-%s-%s' % (video_title, size, bitrate)
3582 'uploader': video_uploader,
3583 'upload_date': upload_date,
3588 'description': None,
3592 if self._downloader.params.get('listformats', None):
3593 self._print_formats(formats)
# Format selection mirrors the downloader's --format semantics.
3596 req_format = self._downloader.params.get('format', None)
3597 self.to_screen(u'Format: %s' % req_format)
3599 if req_format is None or req_format == 'best':
3601 elif req_format == 'worst':
3602 return [formats[-1]]
3603 elif req_format in ('-1', 'all'):
3606 format = self._specific( req_format, formats )
3608 raise ExtractorError(u'Requested format not available')
# Extractor for pornotube.com: video id and title come from the URL
# itself; the flv URL and upload date are scraped from the page.
3613 class PornotubeIE(InfoExtractor):
3614 """Information extractor for pornotube.com."""
3615 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'
3617 def _real_extract(self, url):
3618 mobj = re.match(self._VALID_URL, url)
3620 raise ExtractorError(u'Invalid URL: %s' % url)
3622 video_id = mobj.group('videoid')
3623 video_title = mobj.group('title')
3625 # Get webpage content
3626 webpage = self._download_webpage(url, video_id)
3629 VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
3630 result = re.search(VIDEO_URL_RE, webpage)
3632 raise ExtractorError(u'Unable to extract video url')
3633 video_url = compat_urllib_parse.unquote(result.group('url'))
3635 #Get the uploaded date
3636 VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
3637 result = re.search(VIDEO_UPLOADED_RE, webpage)
# NOTE(review): this failure is about the upload date, but the error
# message says "video title" — looks like a copy-paste mistake.
3639 raise ExtractorError(u'Unable to extract video title')
3640 upload_date = unified_strdate(result.group('date'))
3642 info = {'id': video_id,
3645 'upload_date': upload_date,
3646 'title': video_title,
# Extractor for youjizz.com: follows the watch page to the embed page,
# where the flash player's encoded source URL is extracted.
3652 class YouJizzIE(InfoExtractor):
3653 """Information extractor for youjizz.com."""
3654 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'
3656 def _real_extract(self, url):
3657 mobj = re.match(self._VALID_URL, url)
3659 raise ExtractorError(u'Invalid URL: %s' % url)
3661 video_id = mobj.group('videoid')
3663 # Get webpage content
3664 webpage = self._download_webpage(url, video_id)
3666 # Get the video title
3667 result = re.search(r'<title>(?P<title>.*)</title>', webpage)
3669 raise ExtractorError(u'ERROR: unable to extract video title')
3670 video_title = result.group('title').strip()
3672 # Get the embed page
3673 result = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
3675 raise ExtractorError(u'ERROR: unable to extract embed page')
# The numeric embed id replaces the slug-style id from the watch URL.
3677 embed_page_url = result.group(0).strip()
3678 video_id = result.group('videoid')
3680 webpage = self._download_webpage(embed_page_url, video_id)
3683 result = re.search(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);', webpage)
3685 raise ExtractorError(u'ERROR: unable to extract video url')
3686 video_url = result.group('source')
# player_url is the embed page, used as the SWF player reference.
3688 info = {'id': video_id,
3690 'title': video_title,
3693 'player_url': embed_page_url}
# Extractor for 8tracks.com mixes: reads the PAGE.mix JSON from the page,
# then walks the play/next API with a random session id until the API
# reports the last track.
3697 class EightTracksIE(InfoExtractor):
3699 _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'
3701 def _real_extract(self, url):
3702 mobj = re.match(self._VALID_URL, url)
3704 raise ExtractorError(u'Invalid URL: %s' % url)
3705 playlist_id = mobj.group('id')
3707 webpage = self._download_webpage(url, playlist_id)
3709 m = re.search(r"PAGE.mix = (.*?);\n", webpage, flags=re.DOTALL)
3711 raise ExtractorError(u'Cannot find trax information')
3712 json_like = m.group(1)
3713 data = json.loads(json_like)
# The play API requires a client-chosen numeric session token.
3715 session = str(random.randint(0, 1000000000))
3717 track_count = data['tracks_count']
# mix_id is taken from the parsed mix data (assignment elided in this
# view — TODO confirm against full file).
3718 first_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
3719 next_url = first_url
3721 for i in itertools.count():
3722 api_json = self._download_webpage(next_url, playlist_id,
3723 note=u'Downloading song information %s/%s' % (str(i+1), track_count),
3724 errnote=u'Failed to download song information')
3725 api_data = json.loads(api_json)
3726 track_data = api_data[u'set']['track']
3728 'id': track_data['id'],
3729 'url': track_data['track_file_stream_url'],
3730 'title': track_data['performer'] + u' - ' + track_data['name'],
3731 'raw_title': track_data['name'],
3732 'uploader_id': data['user']['login'],
# Stop once the API flags the final track; otherwise request the next.
3736 if api_data['set']['at_last_track']:
3738 next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
# Extractor for keek.com: media and thumbnail URLs are derived directly
# from the video id; title and uploader are scraped from the page.
3741 class KeekIE(InfoExtractor):
3742 _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
3745 def _real_extract(self, url):
3746 m = re.match(self._VALID_URL, url)
3747 video_id = m.group('videoID')
3748 video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
3749 thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id
3750 webpage = self._download_webpage(url, video_id)
3751 m = re.search(r'<meta property="og:title" content="(?P<title>.*?)"', webpage)
3752 title = unescapeHTML(m.group('title'))
3753 m = re.search(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>', webpage)
3754 uploader = clean_html(m.group('uploader'))
3760 'thumbnail': thumbnail,
3761 'uploader': uploader
# Extractor for ted.com: handles both single talks (direct mp4 download
# derived from the talk's mediaSlug) and playlists (expanded into url
# results that re-enter this extractor per talk).
3765 class TEDIE(InfoExtractor):
3766 _VALID_URL=r'''http://www\.ted\.com/
3768 ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
3770 ((?P<type_talk>talks)) # We have a simple talk
3772 (/lang/(.*?))? # The url may contain the language
3773 /(?P<name>\w+) # Here goes the name and then ".html"
# _VALID_URL is a verbose regex, so suitable() must pass re.VERBOSE.
3777 def suitable(cls, url):
3778 """Receives a URL and returns True if suitable for this IE."""
3779 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
3781 def _real_extract(self, url):
3782 m=re.match(self._VALID_URL, url, re.VERBOSE)
3783 if m.group('type_talk'):
3784 return [self._talk_info(url)]
3786 playlist_id=m.group('playlist_id')
3787 name=m.group('name')
3788 self.to_screen(u'Getting info of playlist %s: "%s"' % (playlist_id,name))
3789 return [self._playlist_videos_info(url,name,playlist_id)]
3791 def _talk_video_link(self,mediaSlug):
3792 '''Returns the video link for that mediaSlug'''
3793 return 'http://download.ted.com/talks/%s.mp4' % mediaSlug
3795 def _playlist_videos_info(self,url,name,playlist_id=0):
3796 '''Returns the videos of the playlist'''
3798 <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
3799 ([.\s]*?)data-playlist_item_id="(\d+)"
3800 ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
3802 video_name_RE=r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
3803 webpage=self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
3804 m_videos=re.finditer(video_RE,webpage,re.VERBOSE)
3805 m_names=re.finditer(video_name_RE,webpage)
3807 playlist_RE = r'div class="headline">(\s*?)<h1>(\s*?)<span>(?P<playlist_title>.*?)</span>'
3808 m_playlist = re.search(playlist_RE, webpage)
3809 playlist_title = m_playlist.group('playlist_title')
# Each playlist entry becomes a url_result handled by TED again.
3811 playlist_entries = []
3812 for m_video, m_name in zip(m_videos,m_names):
3813 video_id=m_video.group('video_id')
3814 talk_url='http://www.ted.com%s' % m_name.group('talk_url')
3815 playlist_entries.append(self.url_result(talk_url, 'TED'))
3816 return self.playlist_result(playlist_entries, playlist_id = playlist_id, playlist_title = playlist_title)
3818 def _talk_info(self, url, video_id=0):
3819 """Return the video for the talk in the url"""
3820 m=re.match(self._VALID_URL, url,re.VERBOSE)
3821 videoName=m.group('name')
3822 webpage=self._download_webpage(url, video_id, 'Downloading \"%s\" page' % videoName)
3823 # If the url includes the language we get the title translated
3824 title_RE=r'<span id="altHeadline" >(?P<title>.*)</span>'
3825 title=re.search(title_RE, webpage).group('title')
# talkDetails JSON embedded in the page carries the real id and the
# mediaSlug used to build the download URL.
3826 info_RE=r'''<script\ type="text/javascript">var\ talkDetails\ =(.*?)
3827 "id":(?P<videoID>[\d]+).*?
3828 "mediaSlug":"(?P<mediaSlug>[\w\d]+?)"'''
3829 thumb_RE=r'</span>[\s.]*</div>[\s.]*<img src="(?P<thumbnail>.*?)"'
3830 thumb_match=re.search(thumb_RE,webpage)
3831 info_match=re.search(info_RE,webpage,re.VERBOSE)
3832 video_id=info_match.group('videoID')
3833 mediaSlug=info_match.group('mediaSlug')
3834 video_url=self._talk_video_link(mediaSlug)
3840 'thumbnail': thumb_match.group('thumbnail')
# Extractor for myspass.de: the video id is the last (or second-to-last,
# when the URL has a trailing slash) path element; all metadata comes
# from a server-side XML endpoint.
3844 class MySpassIE(InfoExtractor):
3845 _VALID_URL = r'http://www.myspass.de/.*'
3847 def _real_extract(self, url):
3848 META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'
3850 # video id is the last path element of the URL
3851 # usually there is a trailing slash, so also try the second but last
3852 url_path = compat_urllib_parse_urlparse(url).path
3853 url_parent_path, video_id = os.path.split(url_path)
3855 _, video_id = os.path.split(url_parent_path)
3858 metadata_url = META_DATA_URL_TEMPLATE % video_id
3859 metadata_text = self._download_webpage(metadata_url, video_id)
# Re-encode to UTF-8 bytes for ElementTree (Python 2 compatibility).
3860 metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))
3862 # extract values from metadata
# url_flv and title are mandatory; format/description/thumbnail are
# optional XML elements with fallbacks.
3863 url_flv_el = metadata.find('url_flv')
3864 if url_flv_el is None:
3865 raise ExtractorError(u'Unable to extract download url')
3866 video_url = url_flv_el.text
3867 extension = os.path.splitext(video_url)[1][1:]
3868 title_el = metadata.find('title')
3869 if title_el is None:
3870 raise ExtractorError(u'Unable to extract title')
3871 title = title_el.text
3872 format_id_el = metadata.find('format_id')
3873 if format_id_el is None:
3876 format = format_id_el.text
3877 description_el = metadata.find('description')
3878 if description_el is not None:
3879 description = description_el.text
3882 imagePreview_el = metadata.find('imagePreview')
3883 if imagePreview_el is not None:
3884 thumbnail = imagePreview_el.text
3893 'thumbnail': thumbnail,
3894 'description': description
# Extractor for spiegel.de videos: title from the article page, media
# details from a per-video XML descriptor; the last <type> entry in the
# XML is taken as the stream to download.
3898 class SpiegelIE(InfoExtractor):
3899 _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'
3901 def _real_extract(self, url):
3902 m = re.match(self._VALID_URL, url)
3903 video_id = m.group('videoID')
3905 webpage = self._download_webpage(url, video_id)
3906 m = re.search(r'<div class="spVideoTitle">(.*?)</div>', webpage)
3908 raise ExtractorError(u'Cannot find title')
3909 video_title = unescapeHTML(m.group(1))
3911 xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
3912 xml_code = self._download_webpage(xml_url, video_id,
3913 note=u'Downloading XML', errnote=u'Failed to download XML')
# idoc[-1]: the last child element of the XML root; presumably the
# highest-quality variant — TODO confirm against the feed format.
3915 idoc = xml.etree.ElementTree.fromstring(xml_code)
3916 last_type = idoc[-1]
3917 filename = last_type.findall('./filename')[0].text
3918 duration = float(last_type.findall('./duration')[0].text)
3920 video_url = 'http://video2.spiegel.de/flash/' + filename
3921 video_ext = filename.rpartition('.')[2]
3926 'title': video_title,
3927 'duration': duration,
# Extractor for liveleak.com: video URL from the player's "file" config,
# title/description from OpenGraph meta tags (title has the site prefix
# stripped); description and uploader are optional.
3931 class LiveLeakIE(InfoExtractor):
3933 _VALID_URL = r'^(?:http?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
3934 IE_NAME = u'liveleak'
3936 def _real_extract(self, url):
3937 mobj = re.match(self._VALID_URL, url)
3939 raise ExtractorError(u'Invalid URL: %s' % url)
3941 video_id = mobj.group('video_id')
3943 webpage = self._download_webpage(url, video_id)
3945 m = re.search(r'file: "(.*?)",', webpage)
3947 raise ExtractorError(u'Unable to find video url')
3948 video_url = m.group(1)
3950 m = re.search(r'<meta property="og:title" content="(?P<title>.*?)"', webpage)
3952 raise ExtractorError(u'Cannot find video title')
3953 title = unescapeHTML(m.group('title')).replace('LiveLeak.com -', '').strip()
3955 m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
3957 desc = unescapeHTML(m.group('desc'))
3961 m = re.search(r'By:.*?(\w+)</a>', webpage)
3963 uploader = clean_html(m.group(1))
3972 'description': desc,
3973 'uploader': uploader
# Extractor for ARD Mediathek / daserste.de: collects all
# mediaCollection.addMediaStream(...) entries from the page and picks the
# highest-quality stream of the default media type; the result is either
# an RTMP stream (url + play_path) or a direct HTTP mp4.
3978 class ARDIE(InfoExtractor):
3979 _VALID_URL = r'^(?:https?://)?(?:(?:www\.)?ardmediathek\.de|mediathek\.daserste\.de)/(?:.*/)(?P<video_id>[^/\?]+)(?:\?.*)?'
3980 _TITLE = r'<h1(?: class="boxTopHeadline")?>(?P<title>.*)</h1>'
3981 _MEDIA_STREAM = r'mediaCollection\.addMediaStream\((?P<media_type>\d+), (?P<quality>\d+), "(?P<rtmp_url>[^"]*)", "(?P<video_url>[^"]*)", "[^"]*"\)'
3983 def _real_extract(self, url):
3984 # determine video id from url
# Prefer the numeric documentId query parameter when present; fall
# back to the last path component.
3985 m = re.match(self._VALID_URL, url)
3987 numid = re.search(r'documentId=([0-9]+)', url)
3989 video_id = numid.group(1)
3991 video_id = m.group('video_id')
3993 # determine title and media streams from webpage
3994 html = self._download_webpage(url, video_id)
3995 title = re.search(self._TITLE, html).group('title')
3996 streams = [m.groupdict() for m in re.finditer(self._MEDIA_STREAM, html)]
# No streams + an "fsk" marker means an age-restricted broadcast.
3998 assert '"fsk"' in html
3999 raise ExtractorError(u'This video is only available after 8:00 pm')
4001 # choose default media type and highest quality for now
4002 stream = max([s for s in streams if int(s["media_type"]) == 0],
4003 key=lambda s: int(s["quality"]))
4005 # there's two possibilities: RTMP stream or HTTP download
4006 info = {'id': video_id, 'title': title, 'ext': 'mp4'}
4007 if stream['rtmp_url']:
4008 self.to_screen(u'RTMP download detected')
4009 assert stream['video_url'].startswith('mp4:')
4010 info["url"] = stream["rtmp_url"]
4011 info["play_path"] = stream['video_url']
4013 assert stream["video_url"].endswith('.mp4')
4014 info["url"] = stream["video_url"]
# Extractor for Tumblr video posts: normalizes the URL to the canonical
# post form, then pulls the \x22-escaped video_file URL, extension,
# first poster image and the <title> tag out of the page.
4017 class TumblrIE(InfoExtractor):
4018 _VALID_URL = r'http://(?P<blog_name>.*?)\.tumblr\.com/((post)|(video))/(?P<id>\d*)/(.*?)'
4020 def _real_extract(self, url):
4021 m_url = re.match(self._VALID_URL, url)
4022 video_id = m_url.group('id')
4023 blog = m_url.group('blog_name')
# Rebuild the canonical post URL regardless of the matched variant.
4025 url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id)
4026 webpage = self._download_webpage(url, video_id)
# The embed markup is JS-escaped, hence the literal \x22 (") tokens.
4028 re_video = r'src=\\x22(?P<video_url>http://%s\.tumblr\.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P<ext>.*?)\\x22' % (blog, video_id)
4029 video = re.search(re_video, webpage)
4031 self.to_screen("No video founded")
4033 video_url = video.group('video_url')
4034 ext = video.group('ext')
4036 re_thumb = r'posters(.*?)\[\\x22(?P<thumb>.*?)\\x22' # We pick the first poster
4037 thumb = re.search(re_thumb, webpage).group('thumb').replace('\\', '')
4039 # The only place where you can get a title, it's not complete,
4040 # but searching in other places doesn't work for all videos
4041 re_title = r'<title>(?P<title>.*?)</title>'
4042 title = unescapeHTML(re.search(re_title, webpage, re.DOTALL).group('title'))
4044 return [{'id': video_id,
# Extractor for Bandcamp tracks offered as free downloads: walks from the
# track page to the free-download page, reads the mp3-320 entry from the
# embedded items JSON, then asks the statdownload endpoint for the final
# (retry) URL since the initial one expires.
4051 class BandcampIE(InfoExtractor):
4052 _VALID_URL = r'http://.*?\.bandcamp\.com/track/(?P<title>.*)'
4054 def _real_extract(self, url):
4055 mobj = re.match(self._VALID_URL, url)
4056 title = mobj.group('title')
4057 webpage = self._download_webpage(url, title)
4058 # We get the link to the free download page
4059 m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage)
4060 if m_download is None:
4061 raise ExtractorError(u'No free songs founded')
4063 download_link = m_download.group(1)
# Track id comes from the TralbumData JS object on the track page.
4064 id = re.search(r'var TralbumData = {(.*?)id: (?P<id>\d*?)$',
4065 webpage, re.MULTILINE|re.DOTALL).group('id')
4067 download_webpage = self._download_webpage(download_link, id,
4068 'Downloading free downloads page')
4069 # We get the dictionary of the track from some javascrip code
4070 info = re.search(r'items: (.*?),$',
4071 download_webpage, re.MULTILINE).group(1)
4072 info = json.loads(info)[0]
4073 # We pick mp3-320 for now, until format selection can be easily implemented.
4074 mp3_info = info[u'downloads'][u'mp3-320']
4075 # If we try to use this url it says the link has expired
4076 initial_url = mp3_info[u'url']
4077 re_url = r'(?P<server>http://(.*?)\.bandcamp\.com)/download/track\?enc=mp3-320&fsig=(?P<fsig>.*?)&id=(?P<id>.*?)&ts=(?P<ts>.*)$'
4078 m_url = re.match(re_url, initial_url)
4079 #We build the url we will use to get the final track url
4080 # This url is build in Bandcamp in the script download_bunde_*.js
# The .rand value is hard-coded; the server answers with a working
# "retry_url" even when .rand is not correctly generated (see below).
4081 request_url = '%s/statdownload/track?enc=mp3-320&fsig=%s&id=%s&ts=%s&.rand=665028774616&.vrs=1' % (m_url.group('server'), m_url.group('fsig'), id, m_url.group('ts'))
4082 final_url_webpage = self._download_webpage(request_url, id, 'Requesting download url')
4083 # If we could correctly generate the .rand field the url would be
4084 #in the "download_url" key
4085 final_url = re.search(r'"retry_url":"(.*?)"', final_url_webpage).group(1)
4087 track_info = {'id':id,
4088 'title' : info[u'title'],
4091 'thumbnail' : info[u'thumb_url'],
4092 'uploader' : info[u'artist']
class RedTubeIE(InfoExtractor):
    """Information Extractor for redtube"""
    _VALID_URL = r'(?:http://)?(?:www\.)?redtube\.com/(?P<id>[0-9]+)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')
        video_extension = 'mp4'
        webpage = self._download_webpage(url, video_id)
        self.report_extraction(video_id)

        # The mp4 source is exposed directly in a <source> tag.
        mobj = re.search(r'<source src="(.+)" type="video/mp4">', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract media URL')
        video_url = mobj.group(1)

        mobj = re.search('<h1 class="videoTitle slidePanelMovable">(.+)</h1>', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract title')
        video_title = mobj.group(1)

        return [{
            'id': video_id,
            'url': video_url,
            'ext': video_extension,
            'title': video_title,
        }]
class InaIE(InfoExtractor):
    """Information Extractor for Ina.fr"""
    # Fixed: the original pattern used an unescaped dot in (?:www.)?,
    # which matched any single character, not just 'www.'.
    _VALID_URL = r'(?:http://)?(?:www\.)?ina\.fr/video/(?P<id>I[0-9]+)/.*'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')
        # The MRSS feed carries both the media URL and the title.
        mrss_url = 'http://player.ina.fr/notices/%s.mrss' % video_id
        video_extension = 'mp4'
        webpage = self._download_webpage(mrss_url, video_id)

        mobj = re.search(r'<media:player url="(?P<mp4url>http://mp4.ina.fr/[^"]+\.mp4)', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract media URL')
        video_url = mobj.group(1)

        mobj = re.search(r'<title><!\[CDATA\[(?P<titre>.*?)]]></title>', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract title')
        video_title = mobj.group(1)

        return [{
            'id': video_id,
            'url': video_url,
            'ext': video_extension,
            'title': video_title,
        }]
def gen_extractors():
    """ Return a list of an instance of every supported extractor.
    The order does matter; the first extractor matched is the one handling the URL.
    """
    # NOTE(review): this chunk appears truncated — the enclosing `return [`
    # and most of the extractor list are not visible here; only a few
    # representative entries remain. Verify against the full file before
    # editing the ordering, since matching is first-hit-wins.
        YoutubePlaylistIE(),
        StanfordOpenClassroomIE(),
        WorldStarHipHopIE(),
def get_info_extractor(ie_name):
    """Look up and return the extractor class named ``<ie_name>IE``."""
    class_name = '%sIE' % ie_name
    return globals()[class_name]