2 # -*- coding: utf-8 -*-
4 from __future__ import absolute_import
15 import xml.etree.ElementTree
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information about the video (or videos) the URL refers to. This
    information includes the real video URL, the video title, author and
    others. The information is stored in a dictionary which is then
    passed to the FileDownloader. The FileDownloader processes this
    information possibly downloading the video to the file system, among
    other possible outcomes.

    The dictionaries must include the following fields:

    id:             Video identifier.
    url:            Final video URL.
    title:          Video title, unescaped.
    ext:            Video filename extension.

    The following fields are optional:

    format:         The video format, defaults to ext (used for --get-format)
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    uploader:       Full name of the video uploader.
    upload_date:    Video upload date (YYYYMMDD).
    uploader_id:    Nickname or id of the video uploader.
    location:       Physical location of the video.
    player_url:     SWF Player URL (used for rtmpdump).
    subtitles:      The subtitle file contents.
    urlhandle:      [internal] The urlHandle to be used to download the file,
                    like returned by urllib.request.urlopen

    The fields should all be Unicode strings.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.

    _real_extract() must return a *list* of information dictionaries as
    described above.

    Finally, the _WORKING attribute should be set to False for broken IEs
    in order to warn the users and skip the tests.
    """

    # Whether initialize() has already run for this instance.
    _ready = False
    # The FileDownloader this IE reports progress to (set via set_downloader).
    _downloader = None
    # Set to False in subclasses that are known to be broken.
    _WORKING = True

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        self._ready = False
        self.set_downloader(downloader)

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        return re.match(cls._VALID_URL, url) is not None

    def working(self):
        """Getter method for _WORKING."""
        return self._WORKING

    def initialize(self):
        """Initializes an instance (authentication, etc)."""
        if not self._ready:
            self._real_initialize()
            self._ready = True

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        self.initialize()
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        pass

    @property
    def IE_NAME(self):
        """Name of the extractor: the class name minus the trailing 'IE'."""
        return type(self).__name__[:-2]

    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the response handle """
        if note is None:
            self.report_download_webpage(video_id)
        elif note is not False:
            self.to_screen(u'%s: %s' % (video_id, note))
        try:
            return compat_urllib_request.urlopen(url_or_request)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            if errnote is None:
                errnote = u'Unable to download webpage'
            raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2])

    def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns a tuple (page content as string, URL handle) """
        urlh = self._request_webpage(url_or_request, video_id, note, errnote)
        content_type = urlh.headers.get('Content-Type', '')
        # Honour the charset advertised in the Content-Type header, if any.
        m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
        if m:
            encoding = m.group(1)
        else:
            encoding = 'utf-8'
        webpage_bytes = urlh.read()
        if self._downloader.params.get('dump_intermediate_pages', False):
            try:
                # url_or_request may be a Request object or a plain string
                url = url_or_request.get_full_url()
            except AttributeError:
                url = url_or_request
            self.to_screen(u'Dumping request to ' + url)
            dump = base64.b64encode(webpage_bytes).decode('ascii')
            self._downloader.to_screen(dump)
        content = webpage_bytes.decode(encoding, 'replace')
        return (content, urlh)

    def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the data of the page as a string """
        return self._download_webpage_handle(url_or_request, video_id, note, errnote)[0]

    def to_screen(self, msg):
        """Print msg to screen, prefixing it with '[ie_name]'"""
        self._downloader.to_screen(u'[%s] %s' % (self.IE_NAME, msg))

    def report_extraction(self, id_or_name):
        """Report information extraction."""
        self.to_screen(u'%s: Extracting information' % id_or_name)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self.to_screen(u'%s: Downloading webpage' % video_id)

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self.to_screen(u'Confirming age')

    #Methods for following #608
    #They set the correct value of the '_type' key
    def video_result(self, video_info):
        """Returns a video"""
        video_info['_type'] = 'video'
        return video_info

    def url_result(self, url, ie=None):
        """Returns a url that points to a page that should be processed"""
        #TODO: ie should be the class used for getting the info
        video_info = {'_type': 'url',
                      'url': url,
                      'ie_key': ie}
        return video_info

    def playlist_result(self, entries, playlist_id=None, playlist_title=None):
        """Returns a playlist"""
        video_info = {'_type': 'playlist',
                      'entries': entries}
        if playlist_id:
            video_info['id'] = playlist_id
        if playlist_title:
            video_info['title'] = playlist_title
        return video_info
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""

    # Verbose regexp: group 1 is the optional URL prefix, group 2 the video ID.
    # NOTE(review): the assignment opener and several alternation lines were
    # missing from the corrupted copy and have been reconstructed.
    _VALID_URL = r"""^
                     (
                         (?:https?://)?                                       # http(s):// (optional)
                         (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
                            tube\.majestyc\.net/)                             # the various hostnames, with wildcard subdomains
                         (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
                         (?:                                                  # the various things that can precede the ID:
                             (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
                             |(?:                                             # or the v= param in all its forms
                                 (?:watch(?:_popup)?(?:\.php)?)?              # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                                 (?:\?|\#!?)                                  # the params delimiter ? or # or #!
                                 (?:.*?&)?                                    # any other preceding param (like /?s=tuff&v=xxxx)
                                 v=
                             )
                         )?                                                   # optional -> youtube.com/xxxx is OK
                     )?                                                       # all until now is optional -> you can pass the naked ID
                     ([0-9A-Za-z_-]+)                                         # here is it! the YouTube video ID
                     (?(1).+)?                                                # if we found the ID, everything can follow
                     $"""
    _LANG_URL = r'https://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    _NETRC_MACHINE = 'youtube'
    # Listed in order of quality
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
    # itag -> container extension (reconstructed; entries besides '38' were
    # missing from the corrupted copy — TODO confirm against upstream)
    _video_extensions = {
        '13': '3gp',
        '17': 'mp4',
        '18': 'mp4',
        '22': 'mp4',
        '37': 'mp4',
        '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
        '43': 'webm',
        '44': 'webm',
        '45': 'webm',
        '46': 'webm',
    }
    # itag -> "HxW" display string used by _print_formats (reconstructed)
    _video_dimensions = {
        '5': '240x400',
        '6': '???',
        '13': '???',
        '17': '144x176',
        '18': '360x640',
        '22': '720x1280',
        '34': '360x640',
        '35': '480x854',
        '37': '1080x1920',
        '38': '3072x4096',
        '43': '360x640',
        '44': '480x854',
        '45': '720x1280',
        '46': '1080x1920',
    }
    IE_NAME = u'youtube'
253 def suitable(cls, url):
254 """Receives a URL and returns True if suitable for this IE."""
255 if YoutubePlaylistIE.suitable(url): return False
256 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
    # --- progress-reporting helpers (all funnel through to_screen) ---------

    def report_lang(self):
        """Report attempt to set language."""
        self.to_screen(u'Setting language')

    def report_login(self):
        """Report attempt to log in."""
        self.to_screen(u'Logging in')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self.to_screen(u'%s: Downloading video webpage' % video_id)

    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download video info webpage."""
        self.to_screen(u'%s: Downloading video info webpage' % video_id)

    def report_video_subtitles_download(self, video_id):
        """Report that the list of available subtitles is being checked."""
        self.to_screen(u'%s: Checking available subtitles' % video_id)

    def report_video_subtitles_request(self, video_id, sub_lang, format):
        """Report attempt to download one subtitle track."""
        self.to_screen(u'%s: Downloading video subtitles for %s.%s' % (video_id, sub_lang, format))

    def report_video_subtitles_available(self, video_id, sub_lang_list):
        """Report available subtitles."""
        sub_lang = ",".join(list(sub_lang_list.keys()))
        self.to_screen(u'%s: Available subtitles for video: %s' % (video_id, sub_lang))

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self.to_screen(u'%s: Extracting video information' % video_id)

    def report_unavailable_format(self, video_id, format):
        """Report that the requested format is not available."""
        self.to_screen(u'%s: Format %s not available' % (video_id, format))

    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self.to_screen(u'RTMP download detected')
299 def _get_available_subtitles(self, video_id):
300 self.report_video_subtitles_download(video_id)
301 request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
303 sub_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
304 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
305 return (u'unable to download video subtitles: %s' % compat_str(err), None)
306 sub_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
307 sub_lang_list = dict((l[1], l[0]) for l in sub_lang_list)
308 if not sub_lang_list:
309 return (u'video doesn\'t have subtitles', None)
312 def _list_available_subtitles(self, video_id):
313 sub_lang_list = self._get_available_subtitles(video_id)
314 self.report_video_subtitles_available(video_id, sub_lang_list)
316 def _request_subtitle(self, sub_lang, sub_name, video_id, format):
319 (error_message, sub_lang, sub)
321 self.report_video_subtitles_request(video_id, sub_lang, format)
322 params = compat_urllib_parse.urlencode({
328 url = 'http://www.youtube.com/api/timedtext?' + params
330 sub = compat_urllib_request.urlopen(url).read().decode('utf-8')
331 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
332 return (u'unable to download video subtitles: %s' % compat_str(err), None, None)
334 return (u'Did not fetch video subtitles', None, None)
335 return (None, sub_lang, sub)
337 def _extract_subtitle(self, video_id):
339 Return a list with a tuple:
340 [(error_message, sub_lang, sub)]
342 sub_lang_list = self._get_available_subtitles(video_id)
343 sub_format = self._downloader.params.get('subtitlesformat')
344 if isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles
345 return [(sub_lang_list[0], None, None)]
346 if self._downloader.params.get('subtitleslang', False):
347 sub_lang = self._downloader.params.get('subtitleslang')
348 elif 'en' in sub_lang_list:
351 sub_lang = list(sub_lang_list.keys())[0]
352 if not sub_lang in sub_lang_list:
353 return [(u'no closed captions found in the specified language "%s"' % sub_lang, None, None)]
355 subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
358 def _extract_all_subtitles(self, video_id):
359 sub_lang_list = self._get_available_subtitles(video_id)
360 sub_format = self._downloader.params.get('subtitlesformat')
361 if isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles
362 return [(sub_lang_list[0], None, None)]
364 for sub_lang in sub_lang_list:
365 subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
366 subtitles.append(subtitle)
369 def _print_formats(self, formats):
370 print('Available formats:')
372 print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))
374 def _real_initialize(self):
375 if self._downloader is None:
380 downloader_params = self._downloader.params
382 # Attempt to use provided username and password or .netrc data
383 if downloader_params.get('username', None) is not None:
384 username = downloader_params['username']
385 password = downloader_params['password']
386 elif downloader_params.get('usenetrc', False):
388 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
393 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
394 except (IOError, netrc.NetrcParseError) as err:
395 self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
399 request = compat_urllib_request.Request(self._LANG_URL)
402 compat_urllib_request.urlopen(request).read()
403 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
404 self._downloader.report_warning(u'unable to set language: %s' % compat_str(err))
407 # No authentication to be performed
411 request = compat_urllib_request.Request(self._LOGIN_URL)
413 login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
414 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
415 self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err))
420 match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page)
422 galx = match.group(1)
424 match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page)
430 u'continue': u'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
434 u'PersistentCookie': u'yes',
436 u'bgresponse': u'js_disabled',
437 u'checkConnection': u'',
438 u'checkedDomains': u'youtube',
444 u'signIn': u'Sign in',
446 u'service': u'youtube',
450 # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
452 login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
453 login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
454 request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
457 login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
458 if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
459 self._downloader.report_warning(u'unable to log in: bad username or password')
461 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
462 self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
468 'action_confirm': 'Confirm',
470 request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
472 self.report_age_confirmation()
473 age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
474 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
475 raise ExtractorError(u'Unable to confirm age: %s' % compat_str(err))
477 def _extract_id(self, url):
478 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
480 raise ExtractorError(u'Invalid URL: %s' % url)
481 video_id = mobj.group(2)
484 def _real_extract(self, url):
485 # Extract original video URL from URL with redirection, like age verification, using next_url parameter
486 mobj = re.search(self._NEXT_URL_RE, url)
488 url = 'https://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
489 video_id = self._extract_id(url)
492 self.report_video_webpage_download(video_id)
493 url = 'https://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
494 request = compat_urllib_request.Request(url)
496 video_webpage_bytes = compat_urllib_request.urlopen(request).read()
497 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
498 raise ExtractorError(u'Unable to download video webpage: %s' % compat_str(err))
500 video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')
502 # Attempt to extract SWF player URL
503 mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
505 player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
510 self.report_video_info_webpage_download(video_id)
511 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
512 video_info_url = ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
513 % (video_id, el_type))
514 video_info_webpage = self._download_webpage(video_info_url, video_id,
516 errnote='unable to download video info webpage')
517 video_info = compat_parse_qs(video_info_webpage)
518 if 'token' in video_info:
520 if 'token' not in video_info:
521 if 'reason' in video_info:
522 raise ExtractorError(u'YouTube said: %s' % video_info['reason'][0])
524 raise ExtractorError(u'"token" parameter not in video info for unknown reason')
526 # Check for "rental" videos
527 if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
528 raise ExtractorError(u'"rental" videos not supported')
530 # Start extracting information
531 self.report_information_extraction(video_id)
534 if 'author' not in video_info:
535 raise ExtractorError(u'Unable to extract uploader name')
536 video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])
539 video_uploader_id = None
540 mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
542 video_uploader_id = mobj.group(1)
544 self._downloader.report_warning(u'unable to extract uploader nickname')
547 if 'title' not in video_info:
548 raise ExtractorError(u'Unable to extract video title')
549 video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])
552 if 'thumbnail_url' not in video_info:
553 self._downloader.report_warning(u'unable to extract video thumbnail')
555 else: # don't panic if we can't find it
556 video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])
560 mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
562 upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
563 upload_date = unified_strdate(upload_date)
566 video_description = get_element_by_id("eow-description", video_webpage)
567 if video_description:
568 video_description = clean_html(video_description)
570 fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
572 video_description = unescapeHTML(fd_mobj.group(1))
574 video_description = u''
577 video_subtitles = None
579 if self._downloader.params.get('writesubtitles', False):
580 video_subtitles = self._extract_subtitle(video_id)
582 (sub_error, sub_lang, sub) = video_subtitles[0]
584 self._downloader.report_error(sub_error)
586 if self._downloader.params.get('allsubtitles', False):
587 video_subtitles = self._extract_all_subtitles(video_id)
588 for video_subtitle in video_subtitles:
589 (sub_error, sub_lang, sub) = video_subtitle
591 self._downloader.report_error(sub_error)
593 if self._downloader.params.get('listsubtitles', False):
594 sub_lang_list = self._list_available_subtitles(video_id)
597 if 'length_seconds' not in video_info:
598 self._downloader.report_warning(u'unable to extract video duration')
601 video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])
604 video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])
606 # Decide which formats to download
607 req_format = self._downloader.params.get('format', None)
609 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
610 self.report_rtmp_download()
611 video_url_list = [(None, video_info['conn'][0])]
612 elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
614 for url_data_str in video_info['url_encoded_fmt_stream_map'][0].split(','):
615 url_data = compat_parse_qs(url_data_str)
616 if 'itag' in url_data and 'url' in url_data:
617 url = url_data['url'][0] + '&signature=' + url_data['sig'][0]
618 if not 'ratebypass' in url: url += '&ratebypass=yes'
619 url_map[url_data['itag'][0]] = url
621 format_limit = self._downloader.params.get('format_limit', None)
622 available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
623 if format_limit is not None and format_limit in available_formats:
624 format_list = available_formats[available_formats.index(format_limit):]
626 format_list = available_formats
627 existing_formats = [x for x in format_list if x in url_map]
628 if len(existing_formats) == 0:
629 raise ExtractorError(u'no known formats available for video')
630 if self._downloader.params.get('listformats', None):
631 self._print_formats(existing_formats)
633 if req_format is None or req_format == 'best':
634 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
635 elif req_format == 'worst':
636 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
637 elif req_format in ('-1', 'all'):
638 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
640 # Specific formats. We pick the first in a slash-delimeted sequence.
641 # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
642 req_formats = req_format.split('/')
643 video_url_list = None
644 for rf in req_formats:
646 video_url_list = [(rf, url_map[rf])]
648 if video_url_list is None:
649 raise ExtractorError(u'requested format not available')
651 raise ExtractorError(u'no conn or url_encoded_fmt_stream_map information found in video info')
654 for format_param, video_real_url in video_url_list:
656 video_extension = self._video_extensions.get(format_param, 'flv')
658 video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
659 self._video_dimensions.get(format_param, '???'))
663 'url': video_real_url,
664 'uploader': video_uploader,
665 'uploader_id': video_uploader_id,
666 'upload_date': upload_date,
667 'title': video_title,
668 'ext': video_extension,
669 'format': video_format,
670 'thumbnail': video_thumbnail,
671 'description': video_description,
672 'player_url': player_url,
673 'subtitles': video_subtitles,
674 'duration': video_duration
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com."""

    # Group 1 is the video id, group 2 the simplified title.
    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    # Page fetched to prime the disclaimer cookies before filtering is disabled.
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    # Endpoint POSTed to in order to turn the family filter off.
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    IE_NAME = u'metacafe'
    def report_disclaimer(self):
        """Report disclaimer retrieval."""
        # Progress message printed before fetching the family-filter disclaimer page.
        self.to_screen(u'Retrieving disclaimer')
691 def _real_initialize(self):
692 # Retrieve disclaimer
693 request = compat_urllib_request.Request(self._DISCLAIMER)
695 self.report_disclaimer()
696 disclaimer = compat_urllib_request.urlopen(request).read()
697 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
698 raise ExtractorError(u'Unable to retrieve disclaimer: %s' % compat_str(err))
703 'submit': "Continue - I'm over 18",
705 request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
707 self.report_age_confirmation()
708 disclaimer = compat_urllib_request.urlopen(request).read()
709 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
710 raise ExtractorError(u'Unable to confirm age: %s' % compat_str(err))
712 def _real_extract(self, url):
713 # Extract id and simplified title from URL
714 mobj = re.match(self._VALID_URL, url)
716 raise ExtractorError(u'Invalid URL: %s' % url)
718 video_id = mobj.group(1)
720 # Check if video comes from YouTube
721 mobj2 = re.match(r'^yt-(.*)$', video_id)
722 if mobj2 is not None:
723 return [self.url_result('http://www.youtube.com/watch?v=%s' % mobj2.group(1), 'Youtube')]
725 # Retrieve video webpage to extract further information
726 webpage = self._download_webpage('http://www.metacafe.com/watch/%s/' % video_id, video_id)
728 # Extract URL, uploader and title from webpage
729 self.report_extraction(video_id)
730 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
732 mediaURL = compat_urllib_parse.unquote(mobj.group(1))
733 video_extension = mediaURL[-3:]
735 # Extract gdaKey if available
736 mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
740 gdaKey = mobj.group(1)
741 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
743 mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
745 raise ExtractorError(u'Unable to extract media URL')
746 vardict = compat_parse_qs(mobj.group(1))
747 if 'mediaData' not in vardict:
748 raise ExtractorError(u'Unable to extract media URL')
749 mobj = re.search(r'"mediaURL":"(?P<mediaURL>http.*?)",(.*?)"key":"(?P<key>.*?)"', vardict['mediaData'][0])
751 raise ExtractorError(u'Unable to extract media URL')
752 mediaURL = mobj.group('mediaURL').replace('\\/', '/')
753 video_extension = mediaURL[-3:]
754 video_url = '%s?__gda__=%s' % (mediaURL, mobj.group('key'))
756 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
758 raise ExtractorError(u'Unable to extract title')
759 video_title = mobj.group(1).decode('utf-8')
761 mobj = re.search(r'submitter=(.*?);', webpage)
763 raise ExtractorError(u'Unable to extract uploader nickname')
764 video_uploader = mobj.group(1)
767 'id': video_id.decode('utf-8'),
768 'url': video_url.decode('utf-8'),
769 'uploader': video_uploader.decode('utf-8'),
771 'title': video_title,
772 'ext': video_extension.decode('utf-8'),
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion"""

    # Group 1 captures the raw id segment; the extractor later strips a
    # '_title' suffix and query string from it.
    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
    IE_NAME = u'dailymotion'
781 def _real_extract(self, url):
782 # Extract id and simplified title from URL
783 mobj = re.match(self._VALID_URL, url)
785 raise ExtractorError(u'Invalid URL: %s' % url)
787 video_id = mobj.group(1).split('_')[0].split('?')[0]
789 video_extension = 'mp4'
791 # Retrieve video webpage to extract further information
792 request = compat_urllib_request.Request(url)
793 request.add_header('Cookie', 'family_filter=off')
794 webpage = self._download_webpage(request, video_id)
796 # Extract URL, uploader and title from webpage
797 self.report_extraction(video_id)
798 mobj = re.search(r'\s*var flashvars = (.*)', webpage)
800 raise ExtractorError(u'Unable to extract media URL')
801 flashvars = compat_urllib_parse.unquote(mobj.group(1))
803 for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
806 self.to_screen(u'Using %s' % key)
809 raise ExtractorError(u'Unable to extract video URL')
811 mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
813 raise ExtractorError(u'Unable to extract video URL')
815 video_url = compat_urllib_parse.unquote(mobj.group(1)).replace('\\/', '/')
817 # TODO: support choosing qualities
819 mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
821 raise ExtractorError(u'Unable to extract title')
822 video_title = unescapeHTML(mobj.group('title'))
824 video_uploader = None
825 mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
827 # lookin for official user
828 mobj_official = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
829 if mobj_official is None:
830 self._downloader.report_warning(u'unable to extract uploader nickname')
832 video_uploader = mobj_official.group(1)
834 video_uploader = mobj.group(1)
836 video_upload_date = None
837 mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
839 video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)
844 'uploader': video_uploader,
845 'upload_date': video_upload_date,
846 'title': video_title,
847 'ext': video_extension,
class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com."""

    # TODO: the original _VALID_URL was:
    # r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
    # Check if it's necessary to keep the old extraction process
    # Named groups: 'id' is the media id, 'ext' is flv or mp4.
    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*(([\?\&]current=)|_)(?P<id>.*)\.(?P<ext>(flv)|(mp4))'
    IE_NAME = u'photobucket'
860 def _real_extract(self, url):
861 # Extract id from URL
862 mobj = re.match(self._VALID_URL, url)
864 raise ExtractorError(u'Invalid URL: %s' % url)
866 video_id = mobj.group('id')
868 video_extension = mobj.group('ext')
870 # Retrieve video webpage to extract further information
871 webpage = self._download_webpage(url, video_id)
873 # Extract URL, uploader, and title from webpage
874 self.report_extraction(video_id)
875 # We try first by looking the javascript code:
876 mobj = re.search(r'Pb\.Data\.Shared\.put\(Pb\.Data\.Shared\.MEDIA, (?P<json>.*?)\);', webpage)
878 info = json.loads(mobj.group('json'))
881 'url': info[u'downloadUrl'],
882 'uploader': info[u'username'],
883 'upload_date': datetime.date.fromtimestamp(info[u'creationDate']).strftime('%Y%m%d'),
884 'title': info[u'title'],
885 'ext': video_extension,
886 'thumbnail': info[u'thumbUrl'],
889 # We try looking in other parts of the webpage
890 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
892 raise ExtractorError(u'Unable to extract media URL')
893 mediaURL = compat_urllib_parse.unquote(mobj.group(1))
897 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
899 raise ExtractorError(u'Unable to extract title')
900 video_title = mobj.group(1).decode('utf-8')
902 video_uploader = mobj.group(2).decode('utf-8')
905 'id': video_id.decode('utf-8'),
906 'url': video_url.decode('utf-8'),
907 'uploader': video_uploader,
909 'title': video_title,
910 'ext': video_extension.decode('utf-8'),
class YahooIE(InfoExtractor):
    """Information extractor for video.yahoo.com."""

    # _VALID_URL matches all Yahoo! Video URLs
    # _VPAGE_URL matches only the extractable '/watch/' URLs
    # Group 1 is a numeric page/user id, group 2 the numeric video id.
    _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
    _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
    IE_NAME = u'video.yahoo'
    def _real_extract(self, url, new_video=True):
        """Return the info dict for a Yahoo Video URL.

        Non-canonical URLs are first rewritten to an English-language
        /watch/ URL and extraction is retried once (new_video=False
        marks the retry).
        """
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group(2)
        video_extension = 'flv'  # the playlist service below serves flv streams

        # Rewrite valid but non-extractable URLs as
        # extractable English language /watch/ URLs
        if re.match(self._VPAGE_URL, url) is None:
            request = compat_urllib_request.Request(url)
                webpage = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to retrieve video webpage: %s' % compat_str(err))

            # The page embeds JS calls of the form ("id", "...") / ("vid", "...")
            # carrying the canonical identifiers.
            mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
                raise ExtractorError(u'Unable to extract id field')
            yahoo_id = mobj.group(1)

            mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
                raise ExtractorError(u'Unable to extract vid field')
            yahoo_vid = mobj.group(1)

            # Retry once against the canonical watch URL.
            url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
            return self._real_extract(url, new_video=False)

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to retrieve video webpage: %s' % compat_str(err))

        # Extract uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
            raise ExtractorError(u'Unable to extract video title')
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
            raise ExtractorError(u'Unable to extract video uploader')
        video_uploader = mobj.group(1).decode('utf-8')

        # Extract video thumbnail
        mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
            raise ExtractorError(u'Unable to extract video thumbnail')
        video_thumbnail = mobj.group(1).decode('utf-8')

        # Extract video description
        mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
            raise ExtractorError(u'Unable to extract video description')
        video_description = mobj.group(1).decode('utf-8')
        if not video_description:
            video_description = 'No description available.'

        # Extract video height and width (required by the playlist request below)
        mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
            raise ExtractorError(u'Unable to extract video height')
        yv_video_height = mobj.group(1)

        mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
            raise ExtractorError(u'Unable to extract video width')
        yv_video_width = mobj.group(1)

        # Retrieve video playlist to extract media URL
        # I'm not completely sure what all these options are, but we
        # seem to need most of them, otherwise the server sends a 401.
        yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
        yv_bitrate = '700' # according to Wikipedia this is hard-coded
        request = compat_urllib_request.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
                '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
                '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to retrieve video webpage: %s' % compat_str(err))

        # Extract media URL from playlist XML
        mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
            raise ExtractorError(u'Unable to extract media URL')
        # APP + FULLPATH are concatenated, percent-decoded and HTML-unescaped.
        video_url = compat_urllib_parse.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
        video_url = unescapeHTML(video_url)

            'id': video_id.decode('utf-8'),
            'uploader': video_uploader,
            'upload_date': None,
            'title': video_title,
            'ext': video_extension.decode('utf-8'),
            'thumbnail': video_thumbnail.decode('utf-8'),
            'description': video_description,
class VimeoIE(InfoExtractor):
    """Information extractor for vimeo.com."""

    # _VALID_URL matches Vimeo URLs
    _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|player)\.)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)'

    def _real_extract(self, url, new_video=True):
        """Extract the info dict for a single Vimeo video."""
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')
        # Normalize the URL: force https, and resolve direct player links
        # to the canonical video page.
        if not mobj.group('proto'):
            url = 'https://' + url
        if mobj.group('direct_link'):
            url = 'https://vimeo.com/' + video_id

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url, None, std_headers)
        webpage = self._download_webpage(request, video_id)

        # Now we begin extracting as much information as we can from what we
        # retrieved. First we extract the information common to all extractors,
        # and latter we extract those that are Vimeo specific.
        self.report_extraction(video_id)

        # Extract the config JSON embedded in the page's player setup code.
            config = webpage.split(' = {config:')[1].split(',assets:')[0]
            config = json.loads(config)
            if re.search('The creator of this video has not given you permission to embed it on this domain.', webpage):
                raise ExtractorError(u'The author has restricted the access to this video, try with the "--referer" option')
                raise ExtractorError(u'Unable to extract info section')

        # Extract title
        video_title = config["video"]["title"]

        # Extract uploader and uploader_id
        video_uploader = config["video"]["owner"]["name"]
        video_uploader_id = config["video"]["owner"]["url"].split('/')[-1]

        # Extract video thumbnail
        video_thumbnail = config["video"]["thumbnail"]

        # Extract video description
        video_description = get_element_by_attribute("itemprop", "description", webpage)
        if video_description: video_description = clean_html(video_description)
        else: video_description = u''

        # Extract upload date
        video_upload_date = None
        mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
        if mobj is not None:
            # Collapse YYYY-MM-DD into the YYYYMMDD upload_date format.
            video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)

        # Vimeo specific: extract request signature and timestamp
        sig = config['request']['signature']
        timestamp = config['request']['timestamp']

        # Vimeo specific: extract video codec and quality information
        # First consider quality, then codecs, then take everything
        # TODO bind to format param
        codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
        files = { 'hd': [], 'sd': [], 'other': []}
        for codec_name, codec_extension in codecs:
            if codec_name in config["video"]["files"]:
                if 'hd' in config["video"]["files"][codec_name]:
                    files['hd'].append((codec_name, codec_extension, 'hd'))
                elif 'sd' in config["video"]["files"][codec_name]:
                    files['sd'].append((codec_name, codec_extension, 'sd'))
                    files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))

        # Pick the best available bucket, in fixed preference order.
        for quality in ('hd', 'sd', 'other'):
            if len(files[quality]) > 0:
                video_quality = files[quality][0][2]
                video_codec = files[quality][0][0]
                video_extension = files[quality][0][1]
                self.to_screen(u'%s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
            raise ExtractorError(u'No known codec found')

        # Build the redirect URL that resolves to the actual media file.
        video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
                    %(video_id, sig, timestamp, video_quality, video_codec.upper())

            'uploader': video_uploader,
            'uploader_id': video_uploader_id,
            'upload_date': video_upload_date,
            'title': video_title,
            'ext': video_extension,
            'thumbnail': video_thumbnail,
            'description': video_description,
class ArteTvIE(InfoExtractor):
    """arte.tv information extractor."""

    # Matches French/German video pages on videos.arte.tv
    _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
    # Live-stream pages are recognized by their index file name.
    _LIVE_URL = r'index-[0-9]+\.html$'

    IE_NAME = u'arte.tv'

    def fetch_webpage(self, url):
        """Download url and return the raw page body, reporting progress."""
        request = compat_urllib_request.Request(url)
            self.report_download_webpage(url)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to retrieve video webpage: %s' % compat_str(err))
        except ValueError as err:
            raise ExtractorError(u'Invalid URL: %s' % url)

    def grep_webpage(self, url, regex, regexFlags, matchTuples):
        """Fetch url, apply regex, and build a dict from matchTuples.

        matchTuples is a sequence of (group_index, key, error_message)
        triples: each match group is stored under key, and error_message
        is raised when that group did not participate in the match.
        """
        page = self.fetch_webpage(url)
        mobj = re.search(regex, page, regexFlags)
            raise ExtractorError(u'Invalid URL: %s' % url)

        for (i, key, err) in matchTuples:
            if mobj.group(i) is None:
                raise ExtractorError(err)
                info[key] = mobj.group(i)

    def extractLiveStream(self, url):
        """Resolve the streaming parameters of a live-stream page."""
        # Language code ('fr'/'de') is embedded in the URL path.
        video_lang = url.split('/')[-4]
        info = self.grep_webpage(
            r'src="(.*?/videothek_js.*?\.js)',
            (1, 'url', u'Invalid URL: %s' % url)
        http_host = url.split('/')[2]
        next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
        info = self.grep_webpage(
            r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
                '(http://.*?\.swf).*?' +
            (1, 'path', u'could not extract video path: %s' % url),
            (2, 'player', u'could not extract video player: %s' % url),
            (3, 'url', u'could not extract video url: %s' % url)
        video_url = u'%s/%s' % (info.get('url'), info.get('path'))

    def extractPlus7Stream(self, url):
        """Follow the chain of indirections on an arte+7 page down to the
        final <video> XML description and return its info dict."""
        video_lang = url.split('/')[-3]
        info = self.grep_webpage(
            r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
            (1, 'url', u'Invalid URL: %s' % url)
        next_url = compat_urllib_parse.unquote(info.get('url'))
        info = self.grep_webpage(
            # The ref file lists one <video> element per language.
            r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
            (1, 'url', u'Could not find <video> tag: %s' % url)
        next_url = compat_urllib_parse.unquote(info.get('url'))

        info = self.grep_webpage(
            r'<video id="(.*?)".*?>.*?' +
                '<name>(.*?)</name>.*?' +
                '<dateVideo>(.*?)</dateVideo>.*?' +
                '<url quality="hd">(.*?)</url>',
            (1, 'id', u'could not extract video id: %s' % url),
            (2, 'title', u'could not extract video title: %s' % url),
            (3, 'date', u'could not extract video date: %s' % url),
            (4, 'url', u'could not extract video url: %s' % url)

            'id': info.get('id'),
            'url': compat_urllib_parse.unquote(info.get('url')),
            'uploader': u'arte.tv',
            'upload_date': unified_strdate(info.get('date')),
            'title': info.get('title').decode('utf-8'),

    def _real_extract(self, url):
        """Dispatch to the live-stream or plus7 extraction path."""
        video_id = url.split('/')[-1]
        self.report_extraction(video_id)

        # Live streams and plus7 pages need different handling.
        if re.search(self._LIVE_URL, video_id) is not None:
            self.extractLiveStream(url)
            info = self.extractPlus7Stream(url)
class GenericIE(InfoExtractor):
    """Generic last-resort information extractor."""

    IE_NAME = u'generic'

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        # Warn (outside test mode) that no specialized extractor matched.
        if not self._downloader.params.get('test', False):
            self._downloader.report_warning(u'Falling back on generic information extractor.')
        super(GenericIE, self).report_download_webpage(video_id)

    def report_following_redirect(self, new_url):
        """Report information extraction."""
        self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)

    def _test_redirect(self, url):
        """Check if it is a redirect, like url shorteners, in case return the new url."""
        class HeadRequest(compat_urllib_request.Request):
            # Issue HEAD instead of GET so only headers are fetched.
            def get_method(self):

        class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
            """
            Subclass the HTTPRedirectHandler to make it use our
            HeadRequest also on the redirected URL
            """
            def redirect_request(self, req, fp, code, msg, headers, newurl):
                if code in (301, 302, 303, 307):
                    newurl = newurl.replace(' ', '%20')
                    # Strip body-related headers; a HEAD request has no body.
                    newheaders = dict((k,v) for k,v in req.headers.items()
                                      if k.lower() not in ("content-length", "content-type"))
                    return HeadRequest(newurl,
                                       origin_req_host=req.get_origin_req_host(),
                    raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)

        class HTTPMethodFallback(compat_urllib_request.BaseHandler):
            """
            Fallback to GET if HEAD is not allowed (405 HTTP error)
            """
            def http_error_405(self, req, fp, code, msg, headers):
                newheaders = dict((k,v) for k,v in req.headers.items()
                                  if k.lower() not in ("content-length", "content-type"))
                return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
                                                                      origin_req_host=req.get_origin_req_host(),

        # Build a minimal opener wired with the HEAD-aware handlers above.
        opener = compat_urllib_request.OpenerDirector()
        for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
                        HTTPMethodFallback, HEADRedirectHandler,
                        compat_urllib_request.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
            opener.add_handler(handler())

        response = opener.open(HeadRequest(url))
        new_url = response.geturl()

        self.report_following_redirect(new_url)

    def _real_extract(self, url):
        """Best-effort extraction: follow redirects, then scrape common
        video-embedding patterns from the page."""
        new_url = self._test_redirect(url)
        if new_url: return [self.url_result(new_url)]

        video_id = url.split('/')[-1]
            webpage = self._download_webpage(url, video_id)
        except ValueError as err:
            # since this is the last-resort InfoExtractor, if
            # this error is thrown, it'll be thrown here
            raise ExtractorError(u'Invalid URL: %s' % url)

        self.report_extraction(video_id)
        # Start with something easy: JW Player in SWFObject
        mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
            # Broaden the search a little bit
            mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
            # Broaden the search a little bit: JWPlayer JS loader
            mobj = re.search(r'[^A-Za-z0-9]?file:\s*["\'](http[^\'"&]*)', webpage)
            raise ExtractorError(u'Invalid URL: %s' % url)

        # It's possible that one of the regexes
        # matched, but returned an empty group:
        if mobj.group(1) is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_url = compat_urllib_parse.unquote(mobj.group(1))
        video_id = os.path.basename(video_url)

        # here's a fun little line of code for you:
        video_extension = os.path.splitext(video_id)[1][1:]
        video_id = os.path.splitext(video_id)[0]

        # it's tempting to parse this further, but you would
        # have to take into account all the variations like
        #   Video Title - Site Name
        #   Site Name | Video Title
        #   Video Title - Tagline | Site Name
        # and so on and so forth; it's just not practical
        mobj = re.search(r'<title>(.*)</title>', webpage)
            raise ExtractorError(u'Unable to extract title')
        video_title = mobj.group(1)

        # video uploader is domain name
        mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
            raise ExtractorError(u'Unable to extract title')
        video_uploader = mobj.group(1)

            'uploader': video_uploader,
            'upload_date': None,
            'title': video_title,
            'ext': video_extension,
class YoutubeSearchIE(InfoExtractor):
    """Information Extractor for YouTube search queries."""
    # Accepts 'ytsearch:', 'ytsearchN:' and 'ytsearchall:' prefixes.
    _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _max_youtube_results = 1000  # hard cap applied to 'all' and large N
    IE_NAME = u'youtube:search'

    def report_download_page(self, query, pagenum):
        """Report attempt to download search page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse the ytsearch prefix and dispatch to _get_n_results."""
        mobj = re.match(self._VALID_URL, query)
            raise ExtractorError(u'Invalid search query "%s"' % query)

        prefix, query = query.split(':')
        query = query.encode('utf-8')
        # Bare 'ytsearch:' yields one result; 'all' yields the maximum.
            return self._get_n_results(query, 1)
        elif prefix == 'all':
            self._get_n_results(query, self._max_youtube_results)
                raise ExtractorError(u'Invalid download number %s for query "%s"' % (n, query))
            elif n > self._max_youtube_results:
                self._downloader.report_warning(u'ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
                n = self._max_youtube_results
            return self._get_n_results(query, n)
        except ValueError: # parsing prefix as integer fails
            return self._get_n_results(query, 1)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""

        # Page through the GData API, 50 results per request.
        while (50 * pagenum) < limit:
            self.report_download_page(query, pagenum+1)
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
            request = compat_urllib_request.Request(result_url)
                data = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to download API page: %s' % compat_str(err))
            api_response = json.loads(data)['data']

            if not 'items' in api_response:
                raise ExtractorError(u'[youtube] No video results')

            new_ids = list(video['id'] for video in api_response['items'])
            video_ids += new_ids

            # The API reports the total; never ask for more than available.
            limit = min(n, api_response['totalItems'])

        if len(video_ids) > n:
            video_ids = video_ids[:n]
        videos = [self.url_result('http://www.youtube.com/watch?v=%s' % id, 'Youtube') for id in video_ids]
class GoogleSearchIE(InfoExtractor):
    """Information Extractor for Google Video search queries."""
    # Accepts 'gvsearch:', 'gvsearchN:' and 'gvsearchall:' prefixes.
    _VALID_URL = r'gvsearch(?P<prefix>|\d+|all):(?P<query>[\s\S]+)'
    # Presence of the "next" pagination link marks more result pages.
    _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
    _max_google_results = 1000
    IE_NAME = u'video.google:search'

    def _real_extract(self, query):
        """Parse the gvsearch prefix and dispatch to _get_n_results."""
        mobj = re.match(self._VALID_URL, query)

        prefix = mobj.group('prefix')
        query = mobj.group('query')
            return self._get_n_results(query, 1)
        elif prefix == 'all':
            return self._get_n_results(query, self._max_google_results)
                raise ExtractorError(u'invalid download number %s for query "%s"' % (n, query))
            elif n > self._max_google_results:
                self._downloader.report_warning(u'gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
                n = self._max_google_results
            return self._get_n_results(query, n)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""

        # The result is a playlist dict; its entries are filled below.
            '_type': 'playlist',

        # Scrape result pages until n results or no next-page link.
        for pagenum in itertools.count(1):
            result_url = u'http://video.google.com/videosearch?q=%s&start=%s&hl=en' % (compat_urllib_parse.quote_plus(query), pagenum*10)
            webpage = self._download_webpage(result_url, u'gvsearch:' + query,
                                             note='Downloading result page ' + str(pagenum))

            for mobj in re.finditer(r'<h3 class="r"><a href="([^"]+)"', webpage):
                    'url': mobj.group(1)
                res['entries'].append(e)

            if (pagenum * 10 > n) or not re.search(self._MORE_PAGES_INDICATOR, webpage):
class YahooSearchIE(InfoExtractor):
    """Information Extractor for Yahoo! Video search queries."""

    _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
    _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
    # A "Next" link marks further result pages.
    _MORE_PAGES_INDICATOR = r'\s*Next'
    _max_yahoo_results = 1000
    IE_NAME = u'video.yahoo:search'

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self.to_screen(u'query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse the yvsearch prefix and queue downloads for the results."""
        mobj = re.match(self._VALID_URL, query)
            raise ExtractorError(u'Invalid search query "%s"' % query)

        prefix, query = query.split(':')
        query = query.encode('utf-8')
            self._download_n_results(query, 1)

        elif prefix == 'all':
            self._download_n_results(query, self._max_yahoo_results)

                raise ExtractorError(u'Invalid download number %s for query "%s"' % (n, query))
            elif n > self._max_yahoo_results:
                self._downloader.report_warning(u'yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
                n = self._max_yahoo_results
            self._download_n_results(query, n)

        except ValueError: # parsing prefix as integer fails
            self._download_n_results(query, 1)

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        already_seen = set()  # dedupe ids across result pages

        # Fetch result pages until n unique ids are found or pages run out.
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum)
            request = compat_urllib_request.Request(result_url)
                page = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to download webpage: %s' % compat_str(err))

            # Extract video identifiers
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                video_id = mobj.group(1)
                if video_id not in already_seen:
                    video_ids.append(video_id)
                    already_seen.add(video_id)
                    if len(video_ids) == n:
                        # Specified n videos reached
                        for id in video_ids:
                            self._downloader.download(['http://video.yahoo.com/watch/%s' % id])

            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                for id in video_ids:
                    self._downloader.download(['http://video.yahoo.com/watch/%s' % id])

            pagenum = pagenum + 1
class YoutubePlaylistIE(InfoExtractor):
    """Information Extractor for YouTube playlists."""

    # Verbose (re.VERBOSE) pattern: matches playlist/course/artist pages
    # and bare PL/EC/UU playlist ids. NOTE: no comments may be placed
    # inside the string below - it is the regex itself.
    _VALID_URL = r"""(?:
                        (?:course|view_play_list|my_playlists|artist|playlist|watch)
                        \? (?:.*?&)*? (?:p|a|list)=
                        ((?:PL|EC|UU)?[0-9A-Za-z-_]{10,})
                     ((?:PL|EC|UU)[0-9A-Za-z-_]{10,})
    _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json'
    IE_NAME = u'youtube:playlist'

    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Needs re.VERBOSE because _VALID_URL is a verbose pattern.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        """Fetch every page of the playlist feed and return a playlist result."""
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            raise ExtractorError(u'Invalid URL: %s' % url)

        # Download playlist videos from API
        playlist_id = mobj.group(1) or mobj.group(2)

        # Page through the GData feed, _MAX_RESULTS entries per request.
            url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, self._MAX_RESULTS * (page_num - 1) + 1)
            page = self._download_webpage(url, playlist_id, u'Downloading page #%s' % page_num)

                response = json.loads(page)
            except ValueError as err:
                raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))

            if 'feed' not in response:
                raise ExtractorError(u'Got a malformed response from YouTube API')
            playlist_title = response['feed']['title']['$t']
            if 'entry' not in response['feed']:
                # Number of videos is a multiple of self._MAX_RESULTS

            # Keep (position, url) pairs so entries can be sorted into
            # playlist order afterwards.
            videos += [ (entry['yt$position']['$t'], entry['content']['src'])
                        for entry in response['feed']['entry']
                        if 'content' in entry ]

            if len(response['feed']['entry']) < self._MAX_RESULTS:

        videos = [v[1] for v in sorted(videos)]

        url_results = [self.url_result(url, 'Youtube') for url in videos]
        return [self.playlist_result(url_results, playlist_id, playlist_title)]
class YoutubeChannelIE(InfoExtractor):
    """Information Extractor for YouTube channels."""

    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
    _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
    # Marker string present in pages while further pages exist.
    _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
    _MORE_PAGES_URL = 'http://www.youtube.com/channel_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
    IE_NAME = u'youtube:channel'

    def extract_videos_from_page(self, page):
        """Return the unique video ids found in page, in order of appearance."""
        for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
            if mobj.group(1) not in ids_in_page:
                ids_in_page.append(mobj.group(1))

    def _real_extract(self, url):
        """Collect every video id of a channel and return them as a playlist."""
        # Extract channel id
        mobj = re.match(self._VALID_URL, url)
            raise ExtractorError(u'Invalid URL: %s' % url)

        # Download channel page
        channel_id = mobj.group(1)

        url = self._TEMPLATE_URL % (channel_id, pagenum)
        page = self._download_webpage(url, channel_id,
                                      u'Downloading page #%s' % pagenum)

        # Extract video identifiers
        ids_in_page = self.extract_videos_from_page(page)
        video_ids.extend(ids_in_page)

        # Download any subsequent channel pages using the json-based channel_ajax query
        if self._MORE_PAGES_INDICATOR in page:
                pagenum = pagenum + 1

                url = self._MORE_PAGES_URL % (pagenum, channel_id)
                page = self._download_webpage(url, channel_id,
                                              u'Downloading page #%s' % pagenum)

                page = json.loads(page)

                ids_in_page = self.extract_videos_from_page(page['content_html'])
                video_ids.extend(ids_in_page)

                # Stop once the load-more widget offers no further pages.
                if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']:

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        urls = ['http://www.youtube.com/watch?v=%s' % id for id in video_ids]
        url_entries = [self.url_result(url, 'Youtube') for url in urls]
        return [self.playlist_result(url_entries, channel_id)]
class YoutubeUserIE(InfoExtractor):
    """Information Extractor for YouTube users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50  # GData caps the uploads feed at 50 per request
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
    _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
    IE_NAME = u'youtube:user'

    def _real_extract(self, url):
        """Page through a user's uploads feed and return it as a playlist."""
        # Extract username
        mobj = re.match(self._VALID_URL, url)
            raise ExtractorError(u'Invalid URL: %s' % url)

        username = mobj.group(1)

        # Download video ids using YouTube Data API. Result size per
        # query is limited (currently to 50 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

            # GData start indices are 1-based.
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1

            gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
            page = self._download_webpage(gdata_url, username,
                                          u'Downloading video ids from %d to %d' % (start_index, start_index + self._GDATA_PAGE_SIZE))

            # Extract video identifiers
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(mobj.group(1))

            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.

            if len(ids_in_page) < self._GDATA_PAGE_SIZE:

        urls = ['http://www.youtube.com/watch?v=%s' % video_id for video_id in video_ids]
        url_results = [self.url_result(url, 'Youtube') for url in urls]
        return [self.playlist_result(url_results, playlist_title = username)]
class BlipTVUserIE(InfoExtractor):
    """Information Extractor for blip.tv users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
    IE_NAME = u'blip.tv:user'

    def _real_extract(self, url):
        """List all episodes of a blip.tv user as a playlist."""
        # Extract username
        mobj = re.match(self._VALID_URL, url)
            raise ExtractorError(u'Invalid URL: %s' % url)

        username = mobj.group(1)

        page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'

        page = self._download_webpage(url, username, u'Downloading user page')
        # The numeric users_id is required by the episode-list endpoint.
        mobj = re.search(r'data-users-id="([^"]+)"', page)
        page_base = page_base % mobj.group(1)

        # Download video ids using BlipTV Ajax calls. Result size per
        # query is limited (currently to 12 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

            url = page_base + "&page=" + str(pagenum)
            page = self._download_webpage(url, username,
                                          u'Downloading video ids from page %d' % pagenum)

            # Extract video identifiers
            for mobj in re.finditer(r'href="/([^"]+)"', page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(unescapeHTML(mobj.group(1)))

            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.

            if len(ids_in_page) < self._PAGE_SIZE:

        urls = [u'http://blip.tv/%s' % video_id for video_id in video_ids]
        url_entries = [self.url_result(url, 'BlipTV') for url in urls]
        return [self.playlist_result(url_entries, playlist_title = username)]
class DepositFilesIE(InfoExtractor):
    """Information extractor for depositfiles.com"""

    _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'

    def _real_extract(self, url):
        """Resolve the real download URL of a depositfiles.com file."""
        file_id = url.split('/')[-1]
        # Rebuild url in english locale
        url = 'http://depositfiles.com/en/files/' + file_id

        # Retrieve file webpage with 'Free download' button pressed
        # (POSTing gateway_result=1 simulates the button click).
        free_download_indication = { 'gateway_result' : '1' }
        request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
            self.report_download_webpage(file_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to retrieve file webpage: %s' % compat_str(err))

        # Search for the real file URL
        mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
        if (mobj is None) or (mobj.group(1) is None):
            # Try to figure out reason of the error.
            mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
            if (mobj is not None) and (mobj.group(1) is not None):
                # Surface the site's own restriction message, whitespace-collapsed.
                restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
                raise ExtractorError(u'%s' % restriction_message)
                raise ExtractorError(u'Unable to extract download URL from: %s' % url)

        file_url = mobj.group(1)
        file_extension = os.path.splitext(file_url)[1][1:]

        # Search for file title
        mobj = re.search(r'<b title="(.*?)">', webpage)
            raise ExtractorError(u'Unable to extract title')
        file_title = mobj.group(1).decode('utf-8')

            'id': file_id.decode('utf-8'),
            'url': file_url.decode('utf-8'),
            'upload_date': None,
            'title': file_title,
            'ext': file_extension.decode('utf-8'),
class FacebookIE(InfoExtractor):
    """Information Extractor for Facebook"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
    _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
    _NETRC_MACHINE = 'facebook'  # machine name used for .netrc credential lookup
    IE_NAME = u'facebook'

    def report_login(self):
        """Report attempt to log in."""
        self.to_screen(u'Logging in')

    def _real_initialize(self):
        """Log into Facebook if credentials were supplied.

        Login failures only produce warnings; extraction proceeds anyway.
        """
        if self._downloader is None:

        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            useremail = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))

        if useremail is None:

        request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
            login_results = compat_urllib_request.urlopen(request).read()
            # A login form in the response means the login was rejected.
            if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
                self._downloader.report_warning(u'unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))

    def _real_extract(self, url):
        """Extract the best available (hd, then sd) stream of a video."""
        mobj = re.match(self._VALID_URL, url)
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('ID')

        url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
        webpage = self._download_webpage(url, video_id)

        # The player parameters sit between these two JS fragments.
        BEFORE = '{swf.addParam(param[0], param[1]);});\n'
        AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
        m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
            raise ExtractorError(u'Cannot parse data')
        data = dict(json.loads(m.group(1)))
        # 'params' is URL-encoded JSON carrying the stream descriptors.
        params_raw = compat_urllib_parse.unquote(data['params'])
        params = json.loads(params_raw)
        video_data = params['video_data'][0]
        # Prefer the HD source; fall back to SD.
        video_url = video_data.get('hd_src')
            video_url = video_data['sd_src']
            raise ExtractorError(u'Cannot find video URL')
        video_duration = int(video_data['video_duration'])
        thumbnail = video_data['thumbnail_src']

        m = re.search('<h2 class="uiHeaderTitle">([^<]+)</h2>', webpage)
            raise ExtractorError(u'Cannot find title in webpage')
        video_title = unescapeHTML(m.group(1))

            'title': video_title,
            'duration': video_duration,
            'thumbnail': thumbnail,
# NOTE(review): fused line numbers + elided lines; code kept byte-identical.
1978 class BlipTVIE(InfoExtractor):
1979 """Information extractor for blip.tv"""
1981 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
1982 _URL_EXT = r'^.*\.([a-z0-9]+)$'
1983 IE_NAME = u'blip.tv'
1985 def report_direct_download(self, title):
1986 """Report information extraction."""
1987 self.to_screen(u'%s: Direct download detected' % title)
# Resolve /play/ redirects, then query blip.tv's JSON API (as iTunes) for the
# real media URL and metadata; a video/* Content-Type means a direct file.
1989 def _real_extract(self, url):
1990 mobj = re.match(self._VALID_URL, url)
1992 raise ExtractorError(u'Invalid URL: %s' % url)
# /play/ URLs redirect to a page whose fragment carries the file id; rebuild
# a canonical /a/a-<id> URL and recurse once.
1994 urlp = compat_urllib_parse_urlparse(url)
1995 if urlp.path.startswith('/play/'):
1996 request = compat_urllib_request.Request(url)
1997 response = compat_urllib_request.urlopen(request)
1998 redirecturl = response.geturl()
1999 rurlp = compat_urllib_parse_urlparse(redirecturl)
2000 file_id = compat_parse_qs(rurlp.fragment)['file'][0].rpartition('/')[2]
2001 url = 'http://blip.tv/a/a-' + file_id
2002 return self._real_extract(url)
# Ask for the JSON skin; the iTunes User-Agent is required by the endpoint
# (presumably — kept as-is, see 'user_agent' in the result below).
2009 json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
2010 request = compat_urllib_request.Request(json_url)
2011 request.add_header('User-Agent', 'iTunes/10.6.1')
2012 self.report_extraction(mobj.group(1))
2015 urlh = compat_urllib_request.urlopen(request)
2016 if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
2017 basename = url.split('/')[-1]
2018 title,ext = os.path.splitext(basename)
2019 title = title.decode('UTF-8')
2020 ext = ext.replace('.', '')
2021 self.report_direct_download(title)
2026 'upload_date': None,
2031 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2032 raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
2033 if info is None: # Regular URL
2035 json_code_bytes = urlh.read()
2036 json_code = json_code_bytes.decode('utf-8')
2037 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2038 raise ExtractorError(u'Unable to read video info webpage: %s' % compat_str(err))
# Metadata may be nested under 'Post' or be the top-level object.
2041 json_data = json.loads(json_code)
2042 if 'Post' in json_data:
2043 data = json_data['Post']
# datestamp format observed here: 'MM-DD-YY HH:MMam/pm' -> YYYYMMDD.
2047 upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
2048 video_url = data['media']['url']
2049 umobj = re.match(self._URL_EXT, video_url)
2051 raise ValueError('Can not determine filename extension')
2052 ext = umobj.group(1)
2055 'id': data['item_id'],
2057 'uploader': data['display_name'],
2058 'upload_date': upload_date,
2059 'title': data['title'],
2061 'format': data['media']['mimeType'],
2062 'thumbnail': data['thumbnailUrl'],
2063 'description': data['description'],
2064 'player_url': data['embedUrl'],
2065 'user_agent': 'iTunes/10.6.1',
2067 except (ValueError,KeyError) as err:
2068 raise ExtractorError(u'Unable to parse video information: %s' % repr(err))
# NOTE(review): fused line numbers + elided lines; code kept byte-identical.
2073 class MyVideoIE(InfoExtractor):
2074 """Information Extractor for myvideo.de."""
2076 _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
2077 IE_NAME = u'myvideo'
# Derive the FLV URL from the thumbnail <link rel='image_src'> base path, and
# the title from the page <title>.
2079 def _real_extract(self,url):
2080 mobj = re.match(self._VALID_URL, url)
2082 raise ExtractorError(u'Invalid URL: %s' % url)
2084 video_id = mobj.group(1)
2087 webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
2088 webpage = self._download_webpage(webpage_url, video_id)
2090 self.report_extraction(video_id)
2091 mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/.*?\.jpg\'',
2094 raise ExtractorError(u'Unable to extract media URL')
# Media URL = thumbnail base + /<id>.flv.
2095 video_url = mobj.group(1) + ('/%s.flv' % video_id)
2097 mobj = re.search('<title>([^<]+)</title>', webpage)
2099 raise ExtractorError(u'Unable to extract title')
2101 video_title = mobj.group(1)
# Fragment of the returned info dict (surrounding lines elided).
2107 'upload_date': None,
2108 'title': video_title,
# NOTE(review): fused line numbers + elided lines; code kept byte-identical.
2112 class ComedyCentralIE(InfoExtractor):
2113 """Information extractor for The Daily Show and Colbert Report """
2115 # urls can be abbreviations like :thedailyshow or :colbert
2116 # urls for episodes like:
2117 # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
2118 # or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
2119 # or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
2120 _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
2121 |(https?://)?(www\.)?
2122 (?P<showname>thedailyshow|colbertnation)\.com/
2123 (full-episodes/(?P<episode>.*)|
2125 (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
2126 |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
# Available bitrates, lowest to highest (turls[-1] below picks the highest).
2129 _available_formats = ['3500', '2200', '1700', '1200', '750', '400']
# Format -> extension / dimension tables (entries elided in this dump).
2131 _video_extensions = {
2139 _video_dimensions = {
# Overridden because _VALID_URL is a verbose-mode pattern.
2149 def suitable(cls, url):
2150 """Receives a URL and returns True if suitable for this IE."""
2151 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
2153 def _print_formats(self, formats):
2154 print('Available formats:')
2156 print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))
# Resolve shortname/clip/episode URLs, find the mtvnservices URI in the page,
# walk the MRSS index, pick a rendition, and rewrite the RTMP URL to HTTP.
2159 def _real_extract(self, url):
2160 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2162 raise ExtractorError(u'Invalid URL: %s' % url)
# Shortnames (:tds, :colbert, ...) map to the newest full episode page.
2164 if mobj.group('shortname'):
2165 if mobj.group('shortname') in ('tds', 'thedailyshow'):
2166 url = u'http://www.thedailyshow.com/full-episodes/'
2168 url = u'http://www.colbertnation.com/full-episodes/'
2169 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2170 assert mobj is not None
2172 if mobj.group('clip'):
2173 if mobj.group('showname') == 'thedailyshow':
2174 epTitle = mobj.group('tdstitle')
2176 epTitle = mobj.group('cntitle')
2179 dlNewest = not mobj.group('episode')
2181 epTitle = mobj.group('showname')
2183 epTitle = mobj.group('episode')
2185 self.report_extraction(epTitle)
# The "newest episode" page redirects; re-match on the final URL.
2186 webpage,htmlHandle = self._download_webpage_handle(url, epTitle)
2188 url = htmlHandle.geturl()
2189 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2191 raise ExtractorError(u'Invalid redirected URL: ' + url)
2192 if mobj.group('episode') == '':
2193 raise ExtractorError(u'Redirected URL is still not specific: ' + url)
2194 epTitle = mobj.group('episode')
2196 mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)
2198 if len(mMovieParams) == 0:
2199 # The Colbert Report embeds the information in a without
2200 # a URL prefix; so extract the alternate reference
2201 # and then add the URL prefix manually.
2203 altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
2204 if len(altMovieParams) == 0:
2205 raise ExtractorError(u'unable to find Flash URL in webpage ' + url)
2207 mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]
2209 uri = mMovieParams[0][1]
2210 indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
2211 indexXml = self._download_webpage(indexUrl, epTitle,
2212 u'Downloading show index',
2213 u'unable to download episode index')
# One <item> per episode part; each carries its own mediaGen config.
2217 idoc = xml.etree.ElementTree.fromstring(indexXml)
2218 itemEls = idoc.findall('.//item')
2219 for partNum,itemEl in enumerate(itemEls):
2220 mediaId = itemEl.findall('./guid')[0].text
2221 shortMediaId = mediaId.split(':')[-1]
2222 showId = mediaId.split(':')[-2].replace('.com', '')
2223 officialTitle = itemEl.findall('./title')[0].text
2224 officialDate = unified_strdate(itemEl.findall('./pubDate')[0].text)
2226 configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
2227 compat_urllib_parse.urlencode({'uri': mediaId}))
2228 configXml = self._download_webpage(configUrl, epTitle,
2229 u'Downloading configuration for %s' % shortMediaId)
2231 cdoc = xml.etree.ElementTree.fromstring(configXml)
2233 for rendition in cdoc.findall('.//rendition'):
2234 finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
2238 self._downloader.report_error(u'unable to download ' + mediaId + ': No videos found')
2241 if self._downloader.params.get('listformats', None):
2242 self._print_formats([i[0] for i in turls])
2245 # For now, just pick the highest bitrate
2246 format,rtmp_video_url = turls[-1]
2248 # Get the format arg from the arg stream
2249 req_format = self._downloader.params.get('format', None)
2251 # Select format if we can find one
2254 format, rtmp_video_url = f, v
# Rewrite the rtmp(e) URL onto the llnwd.net HTTP mirror.
2257 m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
2259 raise ExtractorError(u'Cannot transform RTMP url')
2260 base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
2261 video_url = base + m.group('finalid')
2263 effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
# Per-part info dict fragment (surrounding literal lines elided).
2268 'upload_date': officialDate,
2273 'description': officialTitle,
2275 results.append(info)
# NOTE(review): fused line numbers + elided lines; code kept byte-identical.
2280 class EscapistIE(InfoExtractor):
2281 """Information extractor for The Escapist """
2283 _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
2284 IE_NAME = u'escapist'
# Pull description/thumbnail/player from <meta> tags, then fetch the player's
# config (JS object coerced to JSON) to get the actual media URL.
2286 def _real_extract(self, url):
2287 mobj = re.match(self._VALID_URL, url)
2289 raise ExtractorError(u'Invalid URL: %s' % url)
2290 showName = mobj.group('showname')
2291 videoId = mobj.group('episode')
2293 self.report_extraction(showName)
2294 webPage = self._download_webpage(url, showName)
2296 descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
2297 description = unescapeHTML(descMatch.group(1))
2298 imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
2299 imgUrl = unescapeHTML(imgMatch.group(1))
2300 playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
2301 playerUrl = unescapeHTML(playerUrlMatch.group(1))
# The config URL rides in the player URL's query string, percent-encoded.
2302 configUrlMatch = re.search('config=(.*)$', playerUrl)
2303 configUrl = compat_urllib_parse.unquote(configUrlMatch.group(1))
2305 configJSON = self._download_webpage(configUrl, showName,
2306 u'Downloading configuration',
2307 u'unable to download configuration')
2309 # Technically, it's JavaScript, not JSON
2310 configJSON = configJSON.replace("'", '"')
2313 config = json.loads(configJSON)
2314 except (ValueError,) as err:
2315 raise ExtractorError(u'Invalid JSON in configuration file: ' + compat_str(err))
# playlist[1] holds the video entry (playlist[0] is presumably an ad/intro — TODO confirm).
2317 playlist = config['playlist']
2318 videoUrl = playlist[1]['url']
# Info dict fragment (surrounding literal lines elided).
2323 'uploader': showName,
2324 'upload_date': None,
2327 'thumbnail': imgUrl,
2328 'description': description,
2329 'player_url': playerUrl,
# NOTE(review): fused line numbers + elided lines; code kept byte-identical.
2334 class CollegeHumorIE(InfoExtractor):
2335 """Information extractor for collegehumor.com"""
2338 _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
2339 IE_NAME = u'collegehumor'
2341 def report_manifest(self, video_id):
2342 """Report information extraction."""
2343 self.to_screen(u'%s: Downloading XML manifest' % video_id)
# Fetch the moogaloop metadata XML, then the Adobe f4m manifest it points at,
# and assemble the final segment URL from the manifest's media node.
2345 def _real_extract(self, url):
2346 mobj = re.match(self._VALID_URL, url)
2348 raise ExtractorError(u'Invalid URL: %s' % url)
2349 video_id = mobj.group('videoid')
# Info dict fragment being populated below (surrounding lines elided).
2354 'upload_date': None,
2357 self.report_extraction(video_id)
2358 xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
2360 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
2361 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2362 raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
2364 mdoc = xml.etree.ElementTree.fromstring(metaXml)
2366 videoNode = mdoc.findall('./video')[0]
2367 info['description'] = videoNode.findall('./description')[0].text
2368 info['title'] = videoNode.findall('./caption')[0].text
2369 info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
2370 manifest_url = videoNode.findall('./file')[0].text
2372 raise ExtractorError(u'Invalid metadata XML file')
# hdcore query parameter is required by the HDS endpoint (kept verbatim).
2374 manifest_url += '?hdcore=2.10.3'
2375 self.report_manifest(video_id)
2377 manifestXml = compat_urllib_request.urlopen(manifest_url).read()
2378 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2379 raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
# f4m namespace-qualified lookups: first <media> node + document <id>.
2381 adoc = xml.etree.ElementTree.fromstring(manifestXml)
2383 media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
2384 node_id = media_node.attrib['url']
2385 video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
2386 except IndexError as err:
2387 raise ExtractorError(u'Invalid manifest file')
2389 url_pr = compat_urllib_parse_urlparse(manifest_url)
2390 url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'
# NOTE(review): fused line numbers + elided lines; code kept byte-identical.
2397 class XVideosIE(InfoExtractor):
2398 """Information extractor for xvideos.com"""
2400 _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
2401 IE_NAME = u'xvideos'
# Scrape flv_url, <title>, and thumbnail URL directly out of the watch page.
2403 def _real_extract(self, url):
2404 mobj = re.match(self._VALID_URL, url)
2406 raise ExtractorError(u'Invalid URL: %s' % url)
2407 video_id = mobj.group(1)
2409 webpage = self._download_webpage(url, video_id)
2411 self.report_extraction(video_id)
# The flash variables carry a percent-encoded media URL.
2415 mobj = re.search(r'flv_url=(.+?)&', webpage)
2417 raise ExtractorError(u'Unable to extract video url')
2418 video_url = compat_urllib_parse.unquote(mobj.group(1))
2422 mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
2424 raise ExtractorError(u'Unable to extract video title')
2425 video_title = mobj.group(1)
2428 # Extract video thumbnail
2429 mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
2431 raise ExtractorError(u'Unable to extract video thumbnail')
# group(0) = the whole matched thumbnail URL.
2432 video_thumbnail = mobj.group(0)
# Info dict fragment (surrounding literal lines elided).
2438 'upload_date': None,
2439 'title': video_title,
2441 'thumbnail': video_thumbnail,
2442 'description': None,
# NOTE(review): fused line numbers + elided lines; code kept byte-identical.
2448 class SoundcloudIE(InfoExtractor):
2449 """Information extractor for soundcloud.com
2450 To access the media, the uid of the song and a stream token
2451 must be extracted from the page source and the script must make
2452 a request to media.soundcloud.com/crossdomain.xml. Then
2453 the media can be grabbed by requesting from an url composed
2454 of the stream token and uid
2457 _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
2458 IE_NAME = u'soundcloud'
2460 def report_resolve(self, video_id):
2461 """Report information extraction."""
2462 self.to_screen(u'%s: Resolving id' % video_id)
# Resolve the track via the public API, then fetch its stream definitions and
# return the 128kbps MP3 stream.
2464 def _real_extract(self, url):
2465 mobj = re.match(self._VALID_URL, url)
2467 raise ExtractorError(u'Invalid URL: %s' % url)
2469 # extract uploader (which is in the url)
2470 uploader = mobj.group(1)
2471 # extract simple title (uploader + slug of song title)
2472 slug_title = mobj.group(2)
2473 simple_title = uploader + u'-' + slug_title
2474 full_title = '%s/%s' % (uploader, slug_title)
2476 self.report_resolve(full_title)
# resolve.json maps the public page URL to the API track object; the
# client_id is the app key hard-coded throughout this file.
2478 url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
2479 resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2480 info_json = self._download_webpage(resolv_url, full_title, u'Downloading info JSON')
2482 info = json.loads(info_json)
2483 video_id = info['id']
2484 self.report_extraction(full_title)
2486 streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2487 stream_json = self._download_webpage(streams_url, full_title,
2488 u'Downloading stream definitions',
2489 u'unable to download stream definitions')
2491 streams = json.loads(stream_json)
2492 mediaURL = streams['http_mp3_128_url']
2493 upload_date = unified_strdate(info['created_at'])
# Info dict fragment (surrounding literal lines elided).
2498 'uploader': info['user']['username'],
2499 'upload_date': upload_date,
2500 'title': info['title'],
2502 'description': info['description'],
# NOTE(review): fused line numbers + elided lines; code kept byte-identical.
2505 class SoundcloudSetIE(InfoExtractor):
2506 """Information extractor for soundcloud.com sets
2507 To access the media, the uid of the song and a stream token
2508 must be extracted from the page source and the script must make
2509 a request to media.soundcloud.com/crossdomain.xml. Then
2510 the media can be grabbed by requesting from an url composed
2511 of the stream token and uid
2514 _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/sets/([\w\d-]+)'
2515 IE_NAME = u'soundcloud:set'
2517 def report_resolve(self, video_id):
2518 """Report information extraction."""
2519 self.to_screen(u'%s: Resolving id' % video_id)
# Same flow as SoundcloudIE but resolves a /sets/ playlist and iterates its
# 'tracks' list, emitting one info dict per track.
2521 def _real_extract(self, url):
2522 mobj = re.match(self._VALID_URL, url)
2524 raise ExtractorError(u'Invalid URL: %s' % url)
2526 # extract uploader (which is in the url)
2527 uploader = mobj.group(1)
2528 # extract simple title (uploader + slug of song title)
2529 slug_title = mobj.group(2)
2530 simple_title = uploader + u'-' + slug_title
2531 full_title = '%s/sets/%s' % (uploader, slug_title)
2533 self.report_resolve(full_title)
2535 url = 'http://soundcloud.com/%s/sets/%s' % (uploader, slug_title)
2536 resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2537 info_json = self._download_webpage(resolv_url, full_title)
# The API signals problems via an 'errors' list rather than an HTTP error.
2540 info = json.loads(info_json)
2541 if 'errors' in info:
2542 for err in info['errors']:
2543 self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err['error_message']))
2546 self.report_extraction(full_title)
2547 for track in info['tracks']:
2548 video_id = track['id']
2550 streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2551 stream_json = self._download_webpage(streams_url, video_id, u'Downloading track info JSON')
2553 self.report_extraction(video_id)
2554 streams = json.loads(stream_json)
2555 mediaURL = streams['http_mp3_128_url']
# Per-track info dict fragment (surrounding literal lines elided).
2560 'uploader': track['user']['username'],
2561 'upload_date': unified_strdate(track['created_at']),
2562 'title': track['title'],
2564 'description': track['description'],
# NOTE(review): fused line numbers + elided lines; code kept byte-identical.
2569 class InfoQIE(InfoExtractor):
2570 """Information extractor for infoq.com"""
2571 _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
# The real media id is base64-encoded in the page's jsclassref variable; the
# decoded value is appended to an rtmpe base URL.
2573 def _real_extract(self, url):
2574 mobj = re.match(self._VALID_URL, url)
2576 raise ExtractorError(u'Invalid URL: %s' % url)
2578 webpage = self._download_webpage(url, video_id=url)
2579 self.report_extraction(url)
2582 mobj = re.search(r"jsclassref ?= ?'([^']*)'", webpage)
2584 raise ExtractorError(u'Unable to extract video url')
2585 real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
2586 video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id
2589 mobj = re.search(r'contentTitle = "(.*?)";', webpage)
2591 raise ExtractorError(u'Unable to extract video title')
2592 video_title = mobj.group(1)
2594 # Extract description
2595 video_description = u'No description available.'
2596 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
2597 if mobj is not None:
2598 video_description = mobj.group(1)
# id/extension come from the media filename embedded in the rtmpe URL.
2600 video_filename = video_url.split('/')[-1]
2601 video_id, extension = video_filename.split('.')
# Info dict fragment (surrounding literal lines elided).
2607 'upload_date': None,
2608 'title': video_title,
2609 'ext': extension, # Extension is always(?) mp4, but seems to be flv
2611 'description': video_description,
# NOTE(review): fused line numbers + elided lines; code kept byte-identical.
2616 class MixcloudIE(InfoExtractor):
2617 """Information extractor for www.mixcloud.com"""
2619 _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
2620 _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
2621 IE_NAME = u'mixcloud'
2623 def report_download_json(self, file_id):
2624 """Report JSON download."""
2625 self.to_screen(u'Downloading json')
# Return the URL list for a format from the JSON's audio_formats section;
# formats may or may not carry per-bitrate sub-dicts (hence the TypeError path).
2627 def get_urls(self, jsonData, fmt, bitrate='best'):
2628 """Get urls from 'audio_formats' section in json"""
2631 bitrate_list = jsonData[fmt]
2632 if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
2633 bitrate = max(bitrate_list) # select highest
2635 url_list = jsonData[fmt][bitrate]
2636 except TypeError: # we have no bitrate info.
2637 url_list = jsonData[fmt]
# Probe each candidate URL; keep the first one that answers.
2640 def check_urls(self, url_list):
2641 """Returns 1st active url from list"""
2642 for url in url_list:
2644 compat_urllib_request.urlopen(url)
2646 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2651 def _print_formats(self, formats):
2652 print('Available formats:')
2653 for fmt in formats.keys():
2654 for b in formats[fmt]:
2656 ext = formats[fmt][b][0]
2657 print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
2658 except TypeError: # we have no bitrate info
2659 ext = formats[fmt][0]
2660 print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
# Fetch the cloudcast JSON from the v1 API and pick a working stream URL,
# honoring --format / --list-formats.
2663 def _real_extract(self, url):
2664 mobj = re.match(self._VALID_URL, url)
2666 raise ExtractorError(u'Invalid URL: %s' % url)
2667 # extract uploader & filename from url
# NOTE(review): .decode() on str here suggests this path targeted Python 2.
2668 uploader = mobj.group(1).decode('utf-8')
2669 file_id = uploader + "-" + mobj.group(2).decode('utf-8')
2671 # construct API request
2672 file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
2673 # retrieve .json file with links to files
2674 request = compat_urllib_request.Request(file_url)
2676 self.report_download_json(file_url)
2677 jsonData = compat_urllib_request.urlopen(request).read()
2678 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2679 raise ExtractorError(u'Unable to retrieve file: %s' % compat_str(err))
2682 json_data = json.loads(jsonData)
2683 player_url = json_data['player_swf_url']
2684 formats = dict(json_data['audio_formats'])
2686 req_format = self._downloader.params.get('format', None)
2689 if self._downloader.params.get('listformats', None):
2690 self._print_formats(formats)
# 'best' = first format whose URL list yields a live URL.
2693 if req_format is None or req_format == 'best':
2694 for format_param in formats.keys():
2695 url_list = self.get_urls(formats, format_param)
2697 file_url = self.check_urls(url_list)
2698 if file_url is not None:
2701 if req_format not in formats:
2702 raise ExtractorError(u'Format is not available')
2704 url_list = self.get_urls(formats, req_format)
2705 file_url = self.check_urls(url_list)
2706 format_param = req_format
# Returned info dict (dict-literal delimiters elided in this dump).
2709 'id': file_id.decode('utf-8'),
2710 'url': file_url.decode('utf-8'),
2711 'uploader': uploader.decode('utf-8'),
2712 'upload_date': None,
2713 'title': json_data['name'],
2714 'ext': file_url.split('.')[-1].decode('utf-8'),
2715 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
2716 'thumbnail': json_data['thumbnail_url'],
2717 'description': json_data['description'],
2718 'player_url': player_url.decode('utf-8'),
# NOTE(review): fused line numbers + elided lines; code kept byte-identical.
2721 class StanfordOpenClassroomIE(InfoExtractor):
2722 """Information extractor for Stanford's Open ClassRoom"""
2724 _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
2725 IE_NAME = u'stanfordoc'
# Three URL shapes: a single video (course+video), a course page (course
# only), or the root page; the latter two recurse via self.extract on each
# discovered reference.
2727 def _real_extract(self, url):
2728 mobj = re.match(self._VALID_URL, url)
2730 raise ExtractorError(u'Invalid URL: %s' % url)
2732 if mobj.group('course') and mobj.group('video'): # A specific video
2733 course = mobj.group('course')
2734 video = mobj.group('video')
2736 'id': course + '_' + video,
2738 'upload_date': None,
2741 self.report_extraction(info['id'])
2742 baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
2743 xmlUrl = baseUrl + video + '.xml'
2745 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
2746 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2747 raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
2748 mdoc = xml.etree.ElementTree.fromstring(metaXml)
2750 info['title'] = mdoc.findall('./title')[0].text
2751 info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
2753 raise ExtractorError(u'Invalid metadata XML file')
2754 info['ext'] = info['url'].rpartition('.')[2]
2756 elif mobj.group('course'): # A course page
2757 course = mobj.group('course')
2762 'upload_date': None,
2765 coursepage = self._download_webpage(url, info['id'],
2766 note='Downloading course info page',
2767 errnote='Unable to download course info page')
2769 m = re.search('<h1>([^<]+)</h1>', coursepage)
2771 info['title'] = unescapeHTML(m.group(1))
2773 info['title'] = info['id']
2775 m = re.search('<description>([^<]+)</description>', coursepage)
2777 info['description'] = unescapeHTML(m.group(1))
# Collect unique VideoPage links and recurse into each.
2779 links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
2782 'type': 'reference',
2783 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
2787 for entry in info['list']:
2788 assert entry['type'] == 'reference'
2789 results += self.extract(entry['url'])
# Root page: enumerate all course pages and recurse into each.
2793 'id': 'Stanford OpenClassroom',
2796 'upload_date': None,
2799 self.report_download_webpage(info['id'])
2800 rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
2802 rootpage = compat_urllib_request.urlopen(rootURL).read()
2803 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2804 raise ExtractorError(u'Unable to download course info page: ' + compat_str(err))
2806 info['title'] = info['id']
2808 links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
2811 'type': 'reference',
2812 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
2817 for entry in info['list']:
2818 assert entry['type'] == 'reference'
2819 results += self.extract(entry['url'])
# NOTE(review): fused line numbers + elided lines; code kept byte-identical.
2822 class MTVIE(InfoExtractor):
2823 """Information extractor for MTV.com"""
2825 _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
# Read song/performer/uri/content-id out of page <meta> tags, then fetch the
# mediaGen XML and take the last (highest-quality) rendition.
2828 def _real_extract(self, url):
2829 mobj = re.match(self._VALID_URL, url)
2831 raise ExtractorError(u'Invalid URL: %s' % url)
2832 if not mobj.group('proto'):
2833 url = 'http://' + url
2834 video_id = mobj.group('videoid')
2836 webpage = self._download_webpage(url, video_id)
# NOTE(review): .decode('iso-8859-1') on str suggests a Python 2 code path.
2838 mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
2840 raise ExtractorError(u'Unable to extract song name')
2841 song_name = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
2842 mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
2844 raise ExtractorError(u'Unable to extract performer')
2845 performer = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
2846 video_title = performer + ' - ' + song_name
2848 mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
2850 raise ExtractorError(u'Unable to mtvn_uri')
2851 mtvn_uri = mobj.group(1)
2853 mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
2855 raise ExtractorError(u'Unable to extract content id')
2856 content_id = mobj.group(1)
2858 videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
2859 self.report_extraction(video_id)
2860 request = compat_urllib_request.Request(videogen_url)
2862 metadataXml = compat_urllib_request.urlopen(request).read()
2863 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2864 raise ExtractorError(u'Unable to download video metadata: %s' % compat_str(err))
2866 mdoc = xml.etree.ElementTree.fromstring(metadataXml)
2867 renditions = mdoc.findall('.//rendition')
2869 # For now, always pick the highest quality.
2870 rendition = renditions[-1]
# Format label = ext-WIDTHxHEIGHT_BITRATE derived from rendition attributes.
2873 _,_,ext = rendition.attrib['type'].partition('/')
2874 format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
2875 video_url = rendition.find('./src').text
2877 raise ExtractorError('Invalid rendition field.')
# Info dict fragment (surrounding literal lines elided).
2882 'uploader': performer,
2883 'upload_date': None,
2884 'title': video_title,
# NOTE(review): fused line numbers + elided lines; code kept byte-identical.
2892 class YoukuIE(InfoExtractor):
2893 _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'
# Session id: current ms timestamp + two random components (def line elided).
2896 nowTime = int(time.time() * 1000)
2897 random1 = random.randint(1000,1998)
2898 random2 = random.randint(1000,9999)
2900 return "%d%d%d" %(nowTime,random1,random2)
# Deterministically shuffle the alphabet with Youku's seeded LCG; the result
# is the lookup table used by _get_file_id.
2902 def _get_file_ID_mix_string(self, seed):
2904 source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
2906 for i in range(len(source)):
2907 seed = (seed * 211 + 30031 ) % 65536
2908 index = math.floor(seed / 65536 * len(source) )
2909 mixed.append(source[int(index)])
2910 source.remove(source[int(index)])
2911 #return ''.join(mixed)
# Decode the '*'-separated fileId indices through the mixed table.
2914 def _get_file_id(self, fileId, seed):
2915 mixed = self._get_file_ID_mix_string(seed)
2916 ids = fileId.split('*')
2920 realId.append(mixed[int(ch)])
2921 return ''.join(realId)
# Fetch the getPlayList JSON, choose a format, decode the file id, and build
# one getFlvPath download URL per segment key.
2923 def _real_extract(self, url):
2924 mobj = re.match(self._VALID_URL, url)
2926 raise ExtractorError(u'Invalid URL: %s' % url)
2927 video_id = mobj.group('ID')
2929 info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id
2931 jsondata = self._download_webpage(info_url, video_id)
2933 self.report_extraction(video_id)
2935 config = json.loads(jsondata)
2937 video_title = config['data'][0]['title']
2938 seed = config['data'][0]['seed']
2940 format = self._downloader.params.get('format', None)
2941 supported_format = list(config['data'][0]['streamfileids'].keys())
# 'best' prefers hd2 when available; 'worst' branch elided in this dump.
2943 if format is None or format == 'best':
2944 if 'hd2' in supported_format:
2949 elif format == 'worst':
2957 fileid = config['data'][0]['streamfileids'][format]
2958 keys = [s['k'] for s in config['data'][0]['segs'][format]]
2959 except (UnicodeDecodeError, ValueError, KeyError):
2960 raise ExtractorError(u'Unable to extract info section')
2963 sid = self._gen_sid()
2964 fileid = self._get_file_id(fileid, seed)
2966 #column 8,9 of fileid represent the segment number
2967 #fileid[7:9] should be changed
2968 for index, key in enumerate(keys):
2970 temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
2971 download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)
# Per-segment info dict fragment (surrounding literal lines elided).
2974 'id': '%s_part%02d' % (video_id, index),
2975 'url': download_url,
2977 'upload_date': None,
2978 'title': video_title,
2981 files_info.append(info)
# NOTE(review): fused line numbers + elided lines; code kept byte-identical.
2986 class XNXXIE(InfoExtractor):
2987 """Information extractor for xnxx.com"""
2989 _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
# Page-scraping patterns: media URL, <title> prefix, and big thumbnail.
2991 VIDEO_URL_RE = r'flv_url=(.*?)&'
2992 VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
2993 VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&'
2995 def _real_extract(self, url):
2996 mobj = re.match(self._VALID_URL, url)
2998 raise ExtractorError(u'Invalid URL: %s' % url)
2999 video_id = mobj.group(1)
3001 # Get webpage content
3002 webpage = self._download_webpage(url, video_id)
3004 result = re.search(self.VIDEO_URL_RE, webpage)
3006 raise ExtractorError(u'Unable to extract video url')
3007 video_url = compat_urllib_parse.unquote(result.group(1))
3009 result = re.search(self.VIDEO_TITLE_RE, webpage)
3011 raise ExtractorError(u'Unable to extract video title')
3012 video_title = result.group(1)
3014 result = re.search(self.VIDEO_THUMB_RE, webpage)
3016 raise ExtractorError(u'Unable to extract video thumbnail')
3017 video_thumbnail = result.group(1)
# Info dict fragment (surrounding literal lines elided).
3023 'upload_date': None,
3024 'title': video_title,
3026 'thumbnail': video_thumbnail,
3027 'description': None,
# GooglePlusIE: extracts a video embedded in a Google+ post. Step 1 scrapes
# the post page for date/uploader/title and the photo-viewer URL; step 2
# scrapes the viewer page for redirector.googlevideo.com links and picks
# the highest resolution.
# NOTE(review): elided chunk — "if mobj is None:" guards and the final
# return of the info dict are not visible here.
3031 class GooglePlusIE(InfoExtractor):
3032 """Information extractor for plus.google.com."""
3034 _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
3035 IE_NAME = u'plus.google'
3037 def report_extract_entry(self, url):
3038 """Report that the post entry page is being downloaded."""
3039 self.to_screen(u'Downloading entry: %s' % url)
3041 def report_date(self, upload_date):
3042 """Report the extracted entry date."""
3043 self.to_screen(u'Entry date: %s' % upload_date)
3045 def report_uploader(self, uploader):
3046 """Report the extracted uploader name."""
3047 self.to_screen(u'Uploader: %s' % uploader)
3049 def report_title(self, video_title):
3050 """Report the extracted video title."""
3051 self.to_screen(u'Title: %s' % video_title)
3053 def report_extract_vid_page(self, video_page):
3054 """Report information extraction."""
3055 self.to_screen(u'Extracting video page: %s' % video_page)
3057 def _real_extract(self, url):
3058 # Extract id from URL
3059 mobj = re.match(self._VALID_URL, url)
3061 raise ExtractorError(u'Invalid URL: %s' % url)
3063 post_url = mobj.group(0)
3064 video_id = mobj.group(1)
3066 video_extension = 'flv'
3068 # Step 1, Retrieve post webpage to extract further information
3069 self.report_extract_entry(post_url)
3070 webpage = self._download_webpage(post_url, video_id, u'Downloading entry webpage')
3072 # Extract update date
3074 pattern = 'title="Timestamp">(.*?)</a>'
3075 mobj = re.search(pattern, webpage)
3077 upload_date = mobj.group(1)
3078 # Convert timestring to a format suitable for filename
# Normalizes "YYYY-MM-DD" into the YYYYMMDD form documented for upload_date.
3079 upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
3080 upload_date = upload_date.strftime('%Y%m%d')
3081 self.report_date(upload_date)
3085 pattern = r'rel\="author".*?>(.*?)</a>'
3086 mobj = re.search(pattern, webpage)
3088 uploader = mobj.group(1)
3089 self.report_uploader(uploader)
3092 # Get the first line for title
3094 pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
3095 mobj = re.search(pattern, webpage)
3097 video_title = mobj.group(1)
3098 self.report_title(video_title)
3100 # Step 2, Stimulate clicking the image box to launch video
3101 pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
3102 mobj = re.search(pattern, webpage)
3104 raise ExtractorError(u'Unable to extract video page URL')
3106 video_page = mobj.group(1)
3107 webpage = self._download_webpage(video_page, video_id, u'Downloading video page')
3108 self.report_extract_vid_page(video_page)
3111 # Extract video links on video page
3112 """Extract video links of all sizes"""
3113 pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
3114 mobj = re.findall(pattern, webpage)
3116 raise ExtractorError(u'Unable to extract video links')
3118 # Sort in resolution
# Tuples are (resolution, url); lexicographic sort puts the largest last.
3119 links = sorted(mobj)
3121 # Choose the lowest of the sort, i.e. highest resolution
3122 video_url = links[-1]
3123 # Only get the url. The resolution part in the tuple has no use anymore
3124 video_url = video_url[-1]
3125 # Treat escaped \u0026 style hex
# Python 2 path decodes in place; the AttributeError fallback handles
# Python 3, where str has no decode().
3127 video_url = video_url.decode("unicode_escape")
3128 except AttributeError: # Python 3
3129 video_url = bytes(video_url, 'ascii').decode('unicode-escape')
3135 'uploader': uploader,
3136 'upload_date': upload_date,
3137 'title': video_title,
3138 'ext': video_extension,
# NBAIE: builds the CDN mp4 URL directly from the nba.com path and scrapes
# the page only for metadata (title, date, description).
# NOTE(review): elided chunk — guard lines and parts of the returned dict
# are not visible here.
3141 class NBAIE(InfoExtractor):
3142 _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*)(\?.*)?$'
3145 def _real_extract(self, url):
3146 mobj = re.match(self._VALID_URL, url)
3148 raise ExtractorError(u'Invalid URL: %s' % url)
3150 video_id = mobj.group(1)
3151 if video_id.endswith('/index.html'):
3152 video_id = video_id[:-len('/index.html')]
3154 webpage = self._download_webpage(url, video_id)
3156 video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'
# Small helper: return the first group of `rexp` in the page (HTML-unescaped),
# or `default` when the pattern does not match.
3157 def _findProp(rexp, default=None):
3158 m = re.search(rexp, webpage)
3160 return unescapeHTML(m.group(1))
3164 shortened_video_id = video_id.rpartition('/')[2]
3165 title = _findProp(r'<meta property="og:title" content="(.*?)"', shortened_video_id).replace('NBA.com: ', '')
3167 'id': shortened_video_id,
# NOTE(review): 'uploader_date' looks like a typo for the documented
# 'upload_date' field (see InfoExtractor docstring) — confirm and fix upstream.
3171 'uploader_date': _findProp(r'<b>Date:</b> (.*?)</div>'),
3172 'description': _findProp(r'<div class="description">(.*?)</h1>'),
# JustinTVIE: handles three URL shapes — a channel (paged archive listing),
# a broadcast (/b/<id>), and a chapter (/c/<id>). Channel/broadcast info
# comes from the justin.tv JSON API; chapters combine a justin.tv XML
# lookup with the twitch.tv kraken JSON API.
# NOTE(review): elided chunk — several guard/return lines and the paging
# loop header are not visible here.
3176 class JustinTVIE(InfoExtractor):
3177 """Information extractor for justin.tv and twitch.tv"""
3178 # TODO: One broadcast may be split into multiple videos. The key
3179 # 'broadcast_id' is the same for all parts, and 'broadcast_part'
3180 # starts at 1 and increases. Can we treat all parts as one video?
3182 _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
3184 (?P<channelid>[^/]+)|
3185 (?:(?:[^/]+)/b/(?P<videoid>[^/]+))|
3186 (?:(?:[^/]+)/c/(?P<chapterid>[^/]+))
# API pages are fetched in batches of this size.
3190 _JUSTIN_PAGE_LIMIT = 100
3191 IE_NAME = u'justin.tv'
3193 def report_download_page(self, channel, offset):
3194 """Report attempt to download a single page of videos."""
3195 self.to_screen(u'%s: Downloading video information from %d to %d' %
3196 (channel, offset, offset + self._JUSTIN_PAGE_LIMIT))
3198 # Return count of items, list of *valid* items
3199 def _parse_page(self, url, video_id):
3200 webpage = self._download_webpage(url, video_id,
3201 u'Downloading video info JSON',
3202 u'unable to download video info JSON')
3204 response = json.loads(webpage)
# NOTE(review): isinstance(response, list) would be the idiomatic check;
# a non-list response is the API's error-object shape.
3205 if type(response) != list:
3206 error_text = response.get('error', 'unknown error')
3207 raise ExtractorError(u'Justin.tv API: %s' % error_text)
3209 for clip in response:
3210 video_url = clip['video_file_url']
3212 video_extension = os.path.splitext(video_url)[1][1:]
# start_time begins with an ISO date; strip the dashes to get YYYYMMDD.
3213 video_date = re.sub('-', '', clip['start_time'][:10])
3214 video_uploader_id = clip.get('user_id', clip.get('channel_id'))
3215 video_id = clip['id']
3216 video_title = clip.get('title', video_id)
3220 'title': video_title,
3221 'uploader': clip.get('channel_name', video_uploader_id),
3222 'uploader_id': video_uploader_id,
3223 'upload_date': video_date,
3224 'ext': video_extension,
3226 return (len(response), info)
3228 def _real_extract(self, url):
3229 mobj = re.match(self._VALID_URL, url)
3231 raise ExtractorError(u'invalid URL: %s' % url)
3233 api_base = 'http://api.justin.tv'
3235 if mobj.group('channelid'):
3237 video_id = mobj.group('channelid')
3238 api = api_base + '/channel/archives/%s.json' % video_id
3239 elif mobj.group('chapterid'):
3240 chapter_id = mobj.group('chapterid')
3242 webpage = self._download_webpage(url, chapter_id)
# The chapter page embeds its parent broadcast (archive) id in JS.
3243 m = re.search(r'PP\.archive_id = "([0-9]+)";', webpage)
3245 raise ExtractorError(u'Cannot find archive of a chapter')
3246 archive_id = m.group(1)
3248 api = api_base + '/broadcast/by_chapter/%s.xml' % chapter_id
3249 chapter_info_xml = self._download_webpage(api, chapter_id,
3250 note=u'Downloading chapter information',
3251 errnote=u'Chapter information download failed')
3252 doc = xml.etree.ElementTree.fromstring(chapter_info_xml)
# Locate the <archive> element matching the broadcast id found above;
# the loop variable `a` is reused after the loop.
3253 for a in doc.findall('.//archive'):
3254 if archive_id == a.find('./id').text:
3257 raise ExtractorError(u'Could not find chapter in chapter information')
3259 video_url = a.find('./video_file_url').text
3260 video_ext = video_url.rpartition('.')[2] or u'flv'
3262 chapter_api_url = u'https://api.twitch.tv/kraken/videos/c' + chapter_id
3263 chapter_info_json = self._download_webpage(chapter_api_url, u'c' + chapter_id,
3264 note='Downloading chapter metadata',
3265 errnote='Download of chapter metadata failed')
3266 chapter_info = json.loads(chapter_info_json)
3268 bracket_start = int(doc.find('.//bracket_start').text)
3269 bracket_end = int(doc.find('.//bracket_end').text)
3271 # TODO determine start (and probably fix up file)
3272 # youtube-dl -v http://www.twitch.tv/firmbelief/c/1757457
3273 #video_url += u'?start=' + TODO:start_timestamp
3274 # bracket_start is 13290, but we want 51670615
3275 self._downloader.report_warning(u'Chapter detected, but we can just download the whole file. '
3276 u'Chapter starts at %s and ends at %s' % (formatSeconds(bracket_start), formatSeconds(bracket_end)))
3279 'id': u'c' + chapter_id,
3282 'title': chapter_info['title'],
3283 'thumbnail': chapter_info['preview'],
3284 'description': chapter_info['description'],
3285 'uploader': chapter_info['channel']['display_name'],
3286 'uploader_id': chapter_info['channel']['name'],
3290 video_id = mobj.group('videoid')
3291 api = api_base + '/broadcast/by_archive/%s.json' % video_id
3293 self.report_extraction(video_id)
# Page through the API; a short page (count != limit) ends the loop.
3297 limit = self._JUSTIN_PAGE_LIMIT
3300 self.report_download_page(video_id, offset)
3301 page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
3302 page_count, page_info = self._parse_page(page_url, video_id)
3303 info.extend(page_info)
3304 if not paged or page_count != limit:
# FunnyOrDieIE: pulls the second <source> URL out of the page's <video>
# tag; title comes from the player h1 with a <title>-tag fallback.
# NOTE(review): elided chunk — guard lines and the final return are not
# visible here.
3309 class FunnyOrDieIE(InfoExtractor):
3310 _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'
3312 def _real_extract(self, url):
3313 mobj = re.match(self._VALID_URL, url)
3315 raise ExtractorError(u'invalid URL: %s' % url)
3317 video_id = mobj.group('id')
3318 webpage = self._download_webpage(url, video_id)
3320 m = re.search(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"', webpage, re.DOTALL)
3322 raise ExtractorError(u'Unable to find video information')
3323 video_url = unescapeHTML(m.group('url'))
# Prefer the player page heading; fall back to the document title.
3325 m = re.search(r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>", webpage, flags=re.DOTALL)
3327 m = re.search(r'<title>(?P<title>[^<]+?)</title>', webpage)
3329 raise ExtractorError(u'Cannot find video title')
3330 title = clean_html(m.group('title'))
3332 m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
3334 desc = unescapeHTML(m.group('desc'))
3343 'description': desc,
# SteamIE: scrapes all movie entries from a Steam store page (via the
# age-gate bypass URL) and returns them as a playlist result.
# NOTE(review): elided chunk — parts of the per-video dict and a guard are
# not visible here.
3347 class SteamIE(InfoExtractor):
3348 _VALID_URL = r"""http://store\.steampowered\.com/
3350 (?P<urltype>video|app)/ #If the page is only for videos or for a game
3352 (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
# Overridden because _VALID_URL is a verbose (re.VERBOSE) pattern.
3356 def suitable(cls, url):
3357 """Receives a URL and returns True if suitable for this IE."""
3358 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
3360 def _real_extract(self, url):
3361 m = re.match(self._VALID_URL, url, re.VERBOSE)
3362 gameID = m.group('gameID')
# Fixed birthday parameters skip the Steam age check.
3363 videourl = 'http://store.steampowered.com/agecheck/video/%s/?snr=1_agecheck_agecheck__age-gate&ageDay=1&ageMonth=January&ageYear=1970' % gameID
3364 self.report_age_confirmation()
3365 webpage = self._download_webpage(videourl, gameID)
3366 game_title = re.search(r'<h2 class="pageheader">(?P<game_title>.*?)</h2>', webpage).group('game_title')
3368 urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
3369 mweb = re.finditer(urlRE, webpage)
3370 namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
3371 titles = re.finditer(namesRE, webpage)
3372 thumbsRE = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'
3373 thumbs = re.finditer(thumbsRE, webpage)
# zip relies on the three finditer streams appearing in the same order
# on the page — TODO confirm this holds for all store layouts.
3375 for vid,vtitle,thumb in zip(mweb,titles,thumbs):
3376 video_id = vid.group('videoID')
3377 title = vtitle.group('videoName')
3378 video_url = vid.group('videoURL')
3379 video_thumb = thumb.group('thumbnail')
3381 raise ExtractorError(u'Cannot find video url for %s' % video_id)
3386 'title': unescapeHTML(title),
3387 'thumbnail': video_thumb
3390 return [self.playlist_result(videos, gameID, game_title)]
# UstreamIE: builds the CDN URL from the recorded-video id and scrapes
# title/uploader from the page.
# NOTE(review): the re.search(...).group(...) calls here have no None
# guards visible — a page change would raise AttributeError rather than
# ExtractorError; also parts of the returned dict are not visible.
3392 class UstreamIE(InfoExtractor):
3393 _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
3394 IE_NAME = u'ustream'
3396 def _real_extract(self, url):
3397 m = re.match(self._VALID_URL, url)
3398 video_id = m.group('videoID')
3399 video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
3400 webpage = self._download_webpage(url, video_id)
3401 m = re.search(r'data-title="(?P<title>.+)"',webpage)
3402 title = m.group('title')
3403 m = re.search(r'<a class="state" data-content-type="channel" data-content-id="(?P<uploader>\d+)"',webpage)
3404 uploader = m.group('uploader')
3410 'uploader': uploader
# WorldStarHipHopIE: reads the flash player's file variable for the media
# URL; for WSHH "candy" pages a second title regex overrides the <title>.
# NOTE(review): elided chunk — branch bodies after the 'mp4' check and the
# final return are not visible here.
3414 class WorldStarHipHopIE(InfoExtractor):
3415 _VALID_URL = r'https?://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
3416 IE_NAME = u'WorldStarHipHop'
3418 def _real_extract(self, url):
# JS pattern carrying the media URL: so.addVariable("file","...").
3419 _src_url = r'so\.addVariable\("file","(.*?)"\)'
3421 m = re.match(self._VALID_URL, url)
3422 video_id = m.group('id')
3424 webpage_src = self._download_webpage(url, video_id)
3426 mobj = re.search(_src_url, webpage_src)
3428 if mobj is not None:
3429 video_url = mobj.group(1)
# The extension branch keys off whether the URL mentions mp4.
3430 if 'mp4' in video_url:
3435 raise ExtractorError(u'Cannot find video url for %s' % video_id)
3437 mobj = re.search(r"<title>(.*)</title>", webpage_src)
3440 raise ExtractorError(u'Cannot determine title')
3441 title = mobj.group(1)
3443 mobj = re.search(r'rel="image_src" href="(.*)" />', webpage_src)
3444 # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video.
3445 if mobj is not None:
3446 thumbnail = mobj.group(1)
3448 _title = r"""candytitles.*>(.*)</span>"""
3449 mobj = re.search(_title, webpage_src)
3450 if mobj is not None:
3451 title = mobj.group(1)
3458 'thumbnail' : thumbnail,
# RBMARadioIE: parses the JSON blob assigned to gon.show in an inline
# <script>, then derives the stream URL and metadata from it.
# NOTE(review): elided chunk — guard lines and the final return are not
# visible here.
3463 class RBMARadioIE(InfoExtractor):
3464 _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'
3466 def _real_extract(self, url):
3467 m = re.match(self._VALID_URL, url)
3468 video_id = m.group('videoID')
3470 webpage = self._download_webpage(url, video_id)
3471 m = re.search(r'<script>window.gon = {.*?};gon\.show=(.+?);</script>', webpage)
3473 raise ExtractorError(u'Cannot find metadata')
3474 json_data = m.group(1)
3477 data = json.loads(json_data)
3478 except ValueError as e:
3479 raise ExtractorError(u'Invalid JSON: ' + str(e))
# Force the 256 kbps variant; extension is taken from the URL path.
3481 video_url = data['akamai_url'] + '&cbr=256'
3482 url_parts = compat_urllib_parse_urlparse(video_url)
3483 video_ext = url_parts.path.rpartition('.')[2]
3488 'title': data['title'],
3489 'description': data.get('teaser_text'),
3490 'location': data.get('country_of_origin'),
3491 'uploader': data.get('host', {}).get('name'),
3492 'uploader_id': data.get('host', {}).get('slug'),
3493 'thumbnail': data.get('image', {}).get('large_url_2x'),
3494 'duration': data.get('duration'),
# YouPornIE: scrapes the download list on a youporn.com watch page, builds
# one format entry per link, and applies --get-format/-f style selection
# (best/worst/all/specific) over the list.
# NOTE(review): elided chunk — guard lines, the per-link loop header and
# several dict/return lines are not visible; the local name `format`
# shadows the builtin.
3499 class YouPornIE(InfoExtractor):
3500 """Information extractor for youporn.com."""
3501 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'
3503 def _print_formats(self, formats):
3504 """Print all available formats"""
3505 print(u'Available formats:')
3506 print(u'ext\t\tformat')
3507 print(u'---------------------------------')
3508 for format in formats:
3509 print(u'%s\t\t%s' % (format['ext'], format['format']))
# Select the single entry whose 'format' matches the requested format.
3511 def _specific(self, req_format, formats):
3513 if(x["format"]==req_format):
3517 def _real_extract(self, url):
3518 mobj = re.match(self._VALID_URL, url)
3520 raise ExtractorError(u'Invalid URL: %s' % url)
3522 video_id = mobj.group('videoid')
# The age_verified cookie bypasses the age gate.
3524 req = compat_urllib_request.Request(url)
3525 req.add_header('Cookie', 'age_verified=1')
3526 webpage = self._download_webpage(req, video_id)
3528 # Get the video title
3529 result = re.search(r'<h1.*?>(?P<title>.*)</h1>', webpage)
3531 raise ExtractorError(u'Unable to extract video title')
3532 video_title = result.group('title').strip()
3534 # Get the video date
3535 result = re.search(r'Date:</label>(?P<date>.*) </li>', webpage)
3537 self._downloader.report_warning(u'unable to extract video date')
3540 upload_date = unified_strdate(result.group('date').strip())
3542 # Get the video uploader
3543 result = re.search(r'Submitted:</label>(?P<uploader>.*)</li>', webpage)
3545 self._downloader.report_warning(u'unable to extract uploader')
3546 video_uploader = None
3548 video_uploader = result.group('uploader').strip()
3549 video_uploader = clean_html( video_uploader )
3551 # Get all of the formats available
3552 DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
3553 result = re.search(DOWNLOAD_LIST_RE, webpage)
3555 raise ExtractorError(u'Unable to extract download list')
3556 download_list_html = result.group('download_list').strip()
3558 # Get all of the links from the page
3559 LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
3560 links = re.findall(LINK_RE, download_list_html)
3561 if(len(links) == 0):
3562 raise ExtractorError(u'ERROR: no known formats available for video')
3564 self.to_screen(u'Links found: %d' % len(links))
3569 # A link looks like this:
3570 # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
3571 # A path looks like this:
3572 # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
3573 video_url = unescapeHTML( link )
3574 path = compat_urllib_parse_urlparse( video_url ).path
3575 extension = os.path.splitext( path )[1][1:]
# Path segment 4 encodes "<size>_<bitrate>_<id>"; keep the first two parts.
3576 format = path.split('/')[4].split('_')[:2]
3579 format = "-".join( format )
3580 title = u'%s-%s-%s' % (video_title, size, bitrate)
3585 'uploader': video_uploader,
3586 'upload_date': upload_date,
3591 'description': None,
# Honor --list-formats before applying format selection.
3595 if self._downloader.params.get('listformats', None):
3596 self._print_formats(formats)
3599 req_format = self._downloader.params.get('format', None)
3600 self.to_screen(u'Format: %s' % req_format)
3602 if req_format is None or req_format == 'best':
3604 elif req_format == 'worst':
3605 return [formats[-1]]
3606 elif req_format in ('-1', 'all'):
3609 format = self._specific( req_format, formats )
3611 raise ExtractorError(u'Requested format not available')
# PornotubeIE: the title comes from the URL itself; the page is scraped for
# the FLV URL and the "Added <date> by" marker.
# NOTE(review): elided chunk — guard lines and part of the returned info
# dict are not visible. Also note the error message on the date-extraction
# failure path says "Unable to extract video title" — looks copy-pasted;
# confirm and correct upstream.
3617 class PornotubeIE(InfoExtractor):
3618 """Information extractor for pornotube.com."""
3620 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'
3621 def _real_extract(self, url):
3622 mobj = re.match(self._VALID_URL, url)
3623 raise ExtractorError(u'Invalid URL: %s' % url)
3625 video_id = mobj.group('videoid')
3626 video_title = mobj.group('title')
3628 # Get webpage content
3629 webpage = self._download_webpage(url, video_id)
3632 VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
3633 result = re.search(VIDEO_URL_RE, webpage)
3635 raise ExtractorError(u'Unable to extract video url')
3636 video_url = compat_urllib_parse.unquote(result.group('url'))
3638 #Get the uploaded date
3639 VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
3640 result = re.search(VIDEO_UPLOADED_RE, webpage)
3642 raise ExtractorError(u'Unable to extract video title')
3643 upload_date = unified_strdate(result.group('date'))
3645 info = {'id': video_id,
3648 'upload_date': upload_date,
3649 'title': video_title,
# YouJizzIE: two-step scrape — title from the watch page, then the embed
# page is fetched and its encodeURIComponent("...") file argument is the
# media URL.
# NOTE(review): elided chunk — guard lines are not visible here.
3655 class YouJizzIE(InfoExtractor):
3656 """Information extractor for youjizz.com."""
3657 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'
3659 def _real_extract(self, url):
3660 mobj = re.match(self._VALID_URL, url)
3662 raise ExtractorError(u'Invalid URL: %s' % url)
3664 video_id = mobj.group('videoid')
3666 # Get webpage content
3667 webpage = self._download_webpage(url, video_id)
3669 # Get the video title
3670 result = re.search(r'<title>(?P<title>.*)</title>', webpage)
3672 raise ExtractorError(u'ERROR: unable to extract video title')
3673 video_title = result.group('title').strip()
3675 # Get the embed page
3676 result = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
3678 raise ExtractorError(u'ERROR: unable to extract embed page')
# Note: video_id is re-bound here to the numeric id from the embed URL.
3680 embed_page_url = result.group(0).strip()
3681 video_id = result.group('videoid')
3683 webpage = self._download_webpage(embed_page_url, video_id)
3686 result = re.search(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);', webpage)
3688 raise ExtractorError(u'ERROR: unable to extract video url')
3689 video_url = result.group('source')
3691 info = {'id': video_id,
3693 'title': video_title,
3696 'player_url': embed_page_url}
# EightTracksIE: reads the PAGE.mix JSON from the playlist page, then walks
# the 8tracks play/next API one track at a time until at_last_track.
# NOTE(review): elided chunk — the result-list accumulation and loop exit
# are not visible, and `mix_id` is used at 3721/3741 without a visible
# assignment (presumably derived from `data` on an elided line) — verify.
3700 class EightTracksIE(InfoExtractor):
3702 _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'
3704 def _real_extract(self, url):
3705 mobj = re.match(self._VALID_URL, url)
3707 raise ExtractorError(u'Invalid URL: %s' % url)
3708 playlist_id = mobj.group('id')
3710 webpage = self._download_webpage(url, playlist_id)
3712 m = re.search(r"PAGE.mix = (.*?);\n", webpage, flags=re.DOTALL)
3714 raise ExtractorError(u'Cannot find trax information')
3715 json_like = m.group(1)
3716 data = json.loads(json_like)
# A random session token is required by the play API.
3718 session = str(random.randint(0, 1000000000))
3720 track_count = data['tracks_count']
3721 first_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
3722 next_url = first_url
3724 for i in itertools.count():
3725 api_json = self._download_webpage(next_url, playlist_id,
3726 note=u'Downloading song information %s/%s' % (str(i+1), track_count),
3727 errnote=u'Failed to download song information')
3728 api_data = json.loads(api_json)
3729 track_data = api_data[u'set']['track']
3731 'id': track_data['id'],
3732 'url': track_data['track_file_stream_url'],
3733 'title': track_data['performer'] + u' - ' + track_data['name'],
3734 'raw_title': track_data['name'],
3735 'uploader_id': data['user']['login'],
3739 if api_data['set']['at_last_track']:
3741 next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
# KeekIE: media and thumbnail URLs are derived directly from the video id
# on cdn.keek.com; title/uploader are scraped from the page.
# NOTE(review): elided chunk — the final return and part of the info dict
# are not visible; the re.search(...).group calls have no visible guards.
3744 class KeekIE(InfoExtractor):
3745 _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
3748 def _real_extract(self, url):
3749 m = re.match(self._VALID_URL, url)
3750 video_id = m.group('videoID')
3751 video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
3752 thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id
3753 webpage = self._download_webpage(url, video_id)
3754 m = re.search(r'<meta property="og:title" content="(?P<title>.*?)"', webpage)
3755 title = unescapeHTML(m.group('title'))
3756 m = re.search(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>', webpage)
3757 uploader = clean_html(m.group('uploader'))
3763 'thumbnail': thumbnail,
3764 'uploader': uploader
# TEDIE: handles both single talks and playlists on ted.com. Playlists are
# resolved into url_result entries pointing back at this extractor; talks
# are scraped for the talkDetails JS blob and turned into a
# download.ted.com mp4 link.
# NOTE(review): elided chunk — several regex alternation lines in
# _VALID_URL and the returned info dict are not visible here.
3769 class TEDIE(InfoExtractor):
3771 ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
3773 ((?P<type_talk>talks)) # We have a simple talk
3775 (/lang/(.*?))? # The url may contain the language
3776 /(?P<name>\w+) # Here goes the name and then ".html"
# Overridden because _VALID_URL is a verbose (re.VERBOSE) pattern.
3780 def suitable(cls, url):
3781 """Receives a URL and returns True if suitable for this IE."""
3782 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
3784 def _real_extract(self, url):
3785 m=re.match(self._VALID_URL, url, re.VERBOSE)
3786 if m.group('type_talk'):
3787 return [self._talk_info(url)]
3789 playlist_id=m.group('playlist_id')
3790 name=m.group('name')
3791 self.to_screen(u'Getting info of playlist %s: "%s"' % (playlist_id,name))
3792 return [self._playlist_videos_info(url,name,playlist_id)]
3794 def _talk_video_link(self,mediaSlug):
3795 '''Returns the video link for that mediaSlug'''
3796 return 'http://download.ted.com/talks/%s.mp4' % mediaSlug
3798 def _playlist_videos_info(self,url,name,playlist_id=0):
3799 '''Returns the videos of the playlist'''
3801 <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
3802 ([.\s]*?)data-playlist_item_id="(\d+)"
3803 ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
3805 video_name_RE=r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
3806 webpage=self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
3807 m_videos=re.finditer(video_RE,webpage,re.VERBOSE)
3808 m_names=re.finditer(video_name_RE,webpage)
3810 playlist_RE = r'div class="headline">(\s*?)<h1>(\s*?)<span>(?P<playlist_title>.*?)</span>'
3811 m_playlist = re.search(playlist_RE, webpage)
3812 playlist_title = m_playlist.group('playlist_title')
# Each playlist entry is delegated back to the TED extractor via url_result.
3814 playlist_entries = []
3815 for m_video, m_name in zip(m_videos,m_names):
3816 video_id=m_video.group('video_id')
3817 talk_url='http://www.ted.com%s' % m_name.group('talk_url')
3818 playlist_entries.append(self.url_result(talk_url, 'TED'))
3819 return self.playlist_result(playlist_entries, playlist_id = playlist_id, playlist_title = playlist_title)
3821 def _talk_info(self, url, video_id=0):
3822 """Return the video for the talk in the url"""
3823 m=re.match(self._VALID_URL, url,re.VERBOSE)
3824 videoName=m.group('name')
3825 webpage=self._download_webpage(url, video_id, 'Downloading \"%s\" page' % videoName)
3826 # If the url includes the language we get the title translated
3827 title_RE=r'<span id="altHeadline" >(?P<title>.*)</span>'
3828 title=re.search(title_RE, webpage).group('title')
3829 info_RE=r'''<script\ type="text/javascript">var\ talkDetails\ =(.*?)
3830 "id":(?P<videoID>[\d]+).*?
3831 "mediaSlug":"(?P<mediaSlug>[\w\d]+?)"'''
3832 thumb_RE=r'</span>[\s.]*</div>[\s.]*<img src="(?P<thumbnail>.*?)"'
3833 thumb_match=re.search(thumb_RE,webpage)
3834 info_match=re.search(info_RE,webpage,re.VERBOSE)
3835 video_id=info_match.group('videoID')
3836 mediaSlug=info_match.group('mediaSlug')
3837 video_url=self._talk_video_link(mediaSlug)
3843 'thumbnail': thumb_match.group('thumbnail')
# MySpassIE: derives the video id from the URL path (tolerating a trailing
# slash), fetches the site's XML metadata endpoint, and reads url_flv,
# title, format_id, description and imagePreview from it.
# NOTE(review): elided chunk — the condition that retries the parent path
# component and parts of the returned dict are not visible here.
3847 class MySpassIE(InfoExtractor):
3848 _VALID_URL = r'http://www.myspass.de/.*'
3850 def _real_extract(self, url):
3851 META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'
3853 # video id is the last path element of the URL
3854 # usually there is a trailing slash, so also try the second but last
3855 url_path = compat_urllib_parse_urlparse(url).path
3856 url_parent_path, video_id = os.path.split(url_path)
3858 _, video_id = os.path.split(url_parent_path)
3861 metadata_url = META_DATA_URL_TEMPLATE % video_id
3862 metadata_text = self._download_webpage(metadata_url, video_id)
3863 metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))
3865 # extract values from metadata
# url_flv and title are mandatory; the rest are optional fields.
3866 url_flv_el = metadata.find('url_flv')
3867 if url_flv_el is None:
3868 raise ExtractorError(u'Unable to extract download url')
3869 video_url = url_flv_el.text
3870 extension = os.path.splitext(video_url)[1][1:]
3871 title_el = metadata.find('title')
3872 if title_el is None:
3873 raise ExtractorError(u'Unable to extract title')
3874 title = title_el.text
3875 format_id_el = metadata.find('format_id')
3876 if format_id_el is None:
3879 format = format_id_el.text
3880 description_el = metadata.find('description')
3881 if description_el is not None:
3882 description = description_el.text
3885 imagePreview_el = metadata.find('imagePreview')
3886 if imagePreview_el is not None:
3887 thumbnail = imagePreview_el.text
3896 'thumbnail': thumbnail,
3897 'description': description
# SpiegelIE: title is scraped from the page; the stream list comes from a
# per-video XML document, of which the last entry (highest variant in the
# document order) is used.
# NOTE(review): elided chunk — a guard line and parts of the returned dict
# are not visible here.
3901 class SpiegelIE(InfoExtractor):
3902 _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'
3904 def _real_extract(self, url):
3905 m = re.match(self._VALID_URL, url)
3906 video_id = m.group('videoID')
3908 webpage = self._download_webpage(url, video_id)
3909 m = re.search(r'<div class="spVideoTitle">(.*?)</div>', webpage)
3911 raise ExtractorError(u'Cannot find title')
3912 video_title = unescapeHTML(m.group(1))
3914 xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
3915 xml_code = self._download_webpage(xml_url, video_id,
3916 note=u'Downloading XML', errnote=u'Failed to download XML')
3918 idoc = xml.etree.ElementTree.fromstring(xml_code)
# The last <type> element in the document is taken as the best variant —
# TODO confirm the feed really orders variants ascending.
3919 last_type = idoc[-1]
3920 filename = last_type.findall('./filename')[0].text
3921 duration = float(last_type.findall('./duration')[0].text)
3923 video_url = 'http://video2.spiegel.de/flash/' + filename
3924 video_ext = filename.rpartition('.')[2]
3929 'title': video_title,
3930 'duration': duration,
# LiveLeakIE: reads the player's file: "..." variable for the media URL;
# title/description come from og: meta tags, uploader from the "By:" link.
# NOTE(review): elided chunk — guard lines, fallback assignments and the
# final return are not visible here.
3934 class LiveLeakIE(InfoExtractor):
3936 _VALID_URL = r'^(?:http?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
3937 IE_NAME = u'liveleak'
3939 def _real_extract(self, url):
3940 mobj = re.match(self._VALID_URL, url)
3942 raise ExtractorError(u'Invalid URL: %s' % url)
3944 video_id = mobj.group('video_id')
3946 webpage = self._download_webpage(url, video_id)
3948 m = re.search(r'file: "(.*?)",', webpage)
3950 raise ExtractorError(u'Unable to find video url')
3951 video_url = m.group(1)
3953 m = re.search(r'<meta property="og:title" content="(?P<title>.*?)"', webpage)
3955 raise ExtractorError(u'Cannot find video title')
# Strip the site prefix LiveLeak puts in its og:title.
3956 title = unescapeHTML(m.group('title')).replace('LiveLeak.com -', '').strip()
3958 m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
3960 desc = unescapeHTML(m.group('desc'))
3964 m = re.search(r'By:.*?(\w+)</a>', webpage)
3966 uploader = clean_html(m.group(1))
3975 'description': desc,
3976 'uploader': uploader
# ARDIE: extracts from the ARD Mediathek. The id comes from a documentId=
# query parameter when present, else from the URL path. Streams are listed
# by mediaCollection.addMediaStream(...) JS calls; the default media type
# (0) at the highest quality is chosen, with RTMP and plain-HTTP variants
# handled separately.
# NOTE(review): elided chunk — guards, an else branch, and the final
# return are not visible; the asserts at 4001/4012/4016 would be stripped
# under python -O — consider ExtractorError upstream.
3981 class ARDIE(InfoExtractor):
3982 _VALID_URL = r'^(?:https?://)?(?:(?:www\.)?ardmediathek\.de|mediathek\.daserste\.de)/(?:.*/)(?P<video_id>[^/\?]+)(?:\?.*)?'
3983 _TITLE = r'<h1(?: class="boxTopHeadline")?>(?P<title>.*)</h1>'
3984 _MEDIA_STREAM = r'mediaCollection\.addMediaStream\((?P<media_type>\d+), (?P<quality>\d+), "(?P<rtmp_url>[^"]*)", "(?P<video_url>[^"]*)", "[^"]*"\)'
3986 def _real_extract(self, url):
3987 # determine video id from url
3988 m = re.match(self._VALID_URL, url)
3990 numid = re.search(r'documentId=([0-9]+)', url)
3992 video_id = numid.group(1)
3994 video_id = m.group('video_id')
3996 # determine title and media streams from webpage
3997 html = self._download_webpage(url, video_id)
3998 title = re.search(self._TITLE, html).group('title')
3999 streams = [m.groupdict() for m in re.finditer(self._MEDIA_STREAM, html)]
# No streams found: age-restricted ("fsk") pages only offer video at night.
4001 assert '"fsk"' in html
4002 raise ExtractorError(u'This video is only available after 8:00 pm')
4004 # choose default media type and highest quality for now
4005 stream = max([s for s in streams if int(s["media_type"]) == 0],
4006 key=lambda s: int(s["quality"]))
4008 # there's two possibilities: RTMP stream or HTTP download
4009 info = {'id': video_id, 'title': title, 'ext': 'mp4'}
4010 if stream['rtmp_url']:
4011 self.to_screen(u'RTMP download detected')
4012 assert stream['video_url'].startswith('mp4:')
4013 info["url"] = stream["rtmp_url"]
4014 info["play_path"] = stream['video_url']
4016 assert stream["video_url"].endswith('.mp4')
4017 info["url"] = stream["video_url"]
# TumblrIE: rebuilds the canonical post URL from blog name + id, then
# scrapes the escaped (\x22-quoted) embedded player markup for the
# video_file URL, extension, first poster thumbnail, and <title>.
# NOTE(review): elided chunk — the early-return after "No video founded"
# and the tail of the returned dict are not visible; the runtime message
# "No video founded" has a typo ("found") but changing it is a behavior
# change — fix upstream.
4020 class TumblrIE(InfoExtractor):
4021 _VALID_URL = r'http://(?P<blog_name>.*?)\.tumblr\.com/((post)|(video))/(?P<id>\d*)/(.*?)'
4023 def _real_extract(self, url):
4024 m_url = re.match(self._VALID_URL, url)
4025 video_id = m_url.group('id')
4026 blog = m_url.group('blog_name')
4028 url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id)
4029 webpage = self._download_webpage(url, video_id)
# \x22 is the escaped double quote inside Tumblr's inlined JS markup.
4031 re_video = r'src=\\x22(?P<video_url>http://%s\.tumblr\.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P<ext>.*?)\\x22' % (blog, video_id)
4032 video = re.search(re_video, webpage)
4034 self.to_screen("No video founded")
4036 video_url = video.group('video_url')
4037 ext = video.group('ext')
4039 re_thumb = r'posters(.*?)\[\\x22(?P<thumb>.*?)\\x22' # We pick the first poster
4040 thumb = re.search(re_thumb, webpage).group('thumb').replace('\\', '')
4042 # The only place where you can get a title, it's not complete,
4043 # but searching in other places doesn't work for all videos
4044 re_title = r'<title>(?P<title>.*?)</title>'
4045 title = unescapeHTML(re.search(re_title, webpage, re.DOTALL).group('title'))
4047 return [{'id': video_id,
# BandcampIE: only works for tracks with a free-download page. It follows
# freeDownloadPage, reads the track's download JSON, picks the mp3-320
# variant, and rebuilds the statdownload URL (with a fixed .rand value)
# because the url field in the JSON is already expired; the real link is
# then read from the "retry_url" field of the response.
# NOTE(review): elided chunk — the local `id` shadows the builtin, and the
# runtime message "No free songs founded" has a typo ("found"); both are
# behavior-adjacent and should be fixed upstream, not here.
4054 class BandcampIE(InfoExtractor):
4055 _VALID_URL = r'http://.*?\.bandcamp\.com/track/(?P<title>.*)'
4057 def _real_extract(self, url):
4058 mobj = re.match(self._VALID_URL, url)
4059 title = mobj.group('title')
4060 webpage = self._download_webpage(url, title)
4061 # We get the link to the free download page
4062 m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage)
4063 if m_download is None:
4064 raise ExtractorError(u'No free songs founded')
4066 download_link = m_download.group(1)
4067 id = re.search(r'var TralbumData = {(.*?)id: (?P<id>\d*?)$',
4068 webpage, re.MULTILINE|re.DOTALL).group('id')
4070 download_webpage = self._download_webpage(download_link, id,
4071 'Downloading free downloads page')
4072 # We get the dictionary of the track from some javascrip code
4073 info = re.search(r'items: (.*?),$',
4074 download_webpage, re.MULTILINE).group(1)
4075 info = json.loads(info)[0]
4076 # We pick mp3-320 for now, until format selection can be easily implemented.
4077 mp3_info = info[u'downloads'][u'mp3-320']
4078 # If we try to use this url it says the link has expired
4079 initial_url = mp3_info[u'url']
4080 re_url = r'(?P<server>http://(.*?)\.bandcamp\.com)/download/track\?enc=mp3-320&fsig=(?P<fsig>.*?)&id=(?P<id>.*?)&ts=(?P<ts>.*)$'
4081 m_url = re.match(re_url, initial_url)
4082 #We build the url we will use to get the final track url
4083 # This url is build in Bandcamp in the script download_bunde_*.js
4084 request_url = '%s/statdownload/track?enc=mp3-320&fsig=%s&id=%s&ts=%s&.rand=665028774616&.vrs=1' % (m_url.group('server'), m_url.group('fsig'), id, m_url.group('ts'))
4085 final_url_webpage = self._download_webpage(request_url, id, 'Requesting download url')
4086 # If we could correctly generate the .rand field the url would be
4087 #in the "download_url" key
4088 final_url = re.search(r'"retry_url":"(.*?)"', final_url_webpage).group(1)
4090 track_info = {'id':id,
4091 'title' : info[u'title'],
4094 'thumbnail' : info[u'thumb_url'],
4095 'uploader' : info[u'artist']
class RedTubeIE(InfoExtractor):
    """Information Extractor for redtube"""
    _VALID_URL = r'(?:http://)?(?:www\.)?redtube\.com/(?P<id>[0-9]+)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')
        video_extension = 'mp4'
        webpage = self._download_webpage(url, video_id)
        self.report_extraction(video_id)

        # Direct media URL from the HTML5 <source> tag.
        # (Was a pointless three-part string concatenation of raw literals.)
        mobj = re.search(r'<source src="(.+)" type="video/mp4">', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract media URL')
        video_url = mobj.group(1)

        mobj = re.search('<h1 class="videoTitle slidePanelMovable">(.+)</h1>', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract title')
        video_title = mobj.group(1)

        return [{
            'id': video_id,
            'url': video_url,
            'ext': video_extension,
            'title': video_title,
        }]
class InaIE(InfoExtractor):
    """Information Extractor for Ina.fr"""
    # Fixed: the "www." dot was unescaped and matched any character.
    _VALID_URL = r'(?:http://)?(?:www\.)?ina\.fr/video/(?P<id>I[0-9]+)/.*'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        video_id = mobj.group('id')

        # The MRSS notice document carries the direct mp4 url and the title.
        mrss_url = 'http://player.ina.fr/notices/%s.mrss' % video_id
        video_extension = 'mp4'
        webpage = self._download_webpage(mrss_url, video_id)

        # Escaped the host dots so the pattern only matches mp4.ina.fr.
        mobj = re.search(r'<media:player url="(?P<mp4url>http://mp4\.ina\.fr/[^"]+\.mp4)', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract media URL')
        video_url = mobj.group(1)

        mobj = re.search(r'<title><!\[CDATA\[(?P<titre>.*?)]]></title>', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract title')
        video_title = mobj.group(1)

        return [{
            'id': video_id,
            'url': video_url,
            'ext': video_extension,
            'title': video_title,
        }]
4160 def gen_extractors():
4161 """ Return a list of an instance of every supported extractor.
4162 The order does matter; the first extractor matched is the one handling the URL.
# NOTE(review): the body is heavily elided in this excerpt (the baked-in
# numbering jumps 4162 -> 4165 -> 4190 -> 4200): the docstring close, the
# `return [` line, and all but three of the registered extractor instances
# are not visible here — do not treat the three entries below as the full
# registration list.
4165 YoutubePlaylistIE(),
4190 StanfordOpenClassroomIE(),
4200 WorldStarHipHopIE(),
def get_info_extractor(ie_name):
    """Look up an extractor class by its short name.

    The class is resolved from this module's globals by appending the
    conventional 'IE' suffix to *ie_name* (e.g. 'Youtube' -> YoutubeIE).
    Raises KeyError if no such class is defined in this module.
    """
    class_name = ie_name + 'IE'
    return globals()[class_name]