2 # -*- coding: utf-8 -*-
4 from __future__ import absolute_import
15 import xml.etree.ElementTree
23 class InfoExtractor(object):
24 """Information Extractor class.
26 Information extractors are the classes that, given a URL, extract
27 information about the video (or videos) the URL refers to. This
28 information includes the real video URL, the video title, author and
29 others. The information is stored in a dictionary which is then
30 passed to the FileDownloader. The FileDownloader processes this
31 information possibly downloading the video to the file system, among
32 other possible outcomes.
34 The dictionaries must include the following fields:
38 title: Video title, unescaped.
39 ext: Video filename extension.
41 The following fields are optional:
43 format: The video format, defaults to ext (used for --get-format)
44 thumbnail: Full URL to a video thumbnail image.
45 description: One-line video description.
46 uploader: Full name of the video uploader.
47 upload_date: Video upload date (YYYYMMDD).
48 uploader_id: Nickname or id of the video uploader.
49 location: Physical location of the video.
50 player_url: SWF Player URL (used for rtmpdump).
51 subtitles: The subtitle file contents.
52 urlhandle: [internal] The urlHandle to be used to download the file,
53 like returned by urllib.request.urlopen
55 The fields should all be Unicode strings.
57 Subclasses of this one should re-define the _real_initialize() and
58 _real_extract() methods and define a _VALID_URL regexp.
59 Probably, they should also be added to the list of extractors.
61 _real_extract() must return a *list* of information dictionaries as
64 Finally, the _WORKING attribute should be set to False for broken IEs
65 in order to warn the users and skip the tests.
72 def __init__(self, downloader=None):
73 """Constructor. Receives an optional downloader."""
75 self.set_downloader(downloader)
def suitable(cls, url):
    """Receives a URL and returns True if suitable for this IE.

    The URL is matched, anchored at its start, against the _VALID_URL
    regular expression defined on the class.
    """
    return re.match(cls._VALID_URL, url) is not None
84 """Getter method for _WORKING."""
88 """Initializes an instance (authentication, etc)."""
90 self._real_initialize()
93 def extract(self, url):
94 """Extracts URL information and returns it in list of dicts."""
96 return self._real_extract(url)
def set_downloader(self, downloader):
    """Sets the downloader for this IE.

    The object is stored as self._downloader, which the other methods of
    the extractor use for screen output and for reading options.
    """
    self._downloader = downloader
102 def _real_initialize(self):
103 """Real initialization process. Redefine in subclasses."""
106 def _real_extract(self, url):
107 """Real extraction process. Redefine in subclasses."""
112 return type(self).__name__[:-2]
114 def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
115 """ Returns the response handle """
117 self.report_download_webpage(video_id)
118 elif note is not False:
119 self.to_screen(u'%s: %s' % (video_id, note))
121 return compat_urllib_request.urlopen(url_or_request)
122 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
124 errnote = u'Unable to download webpage'
125 raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2])
127 def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None):
128 """ Returns a tuple (page content as string, URL handle) """
129 urlh = self._request_webpage(url_or_request, video_id, note, errnote)
130 content_type = urlh.headers.get('Content-Type', '')
131 m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
133 encoding = m.group(1)
136 webpage_bytes = urlh.read()
137 if self._downloader.params.get('dump_intermediate_pages', False):
139 url = url_or_request.get_full_url()
140 except AttributeError:
142 self.to_screen(u'Dumping request to ' + url)
143 dump = base64.b64encode(webpage_bytes).decode('ascii')
144 self._downloader.to_screen(dump)
145 content = webpage_bytes.decode(encoding, 'replace')
146 return (content, urlh)
148 def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
149 """ Returns the data of the page as a string """
150 return self._download_webpage_handle(url_or_request, video_id, note, errnote)[0]
def to_screen(self, msg):
    """Print msg to screen, prefixing it with '[ie_name]'.

    The prefix is taken from self.IE_NAME and the resulting line is
    handed to the downloader's own to_screen().
    """
    self._downloader.to_screen(u'[%s] %s' % (self.IE_NAME, msg))
def report_extraction(self, id_or_name):
    """Report information extraction.

    id_or_name is interpolated into the message to identify the item
    being processed.
    """
    self.to_screen(u'%s: Extracting information' % id_or_name)
def report_download_webpage(self, video_id):
    """Report webpage download.

    video_id is interpolated into the message to identify the video.
    """
    self.to_screen(u'%s: Downloading webpage' % video_id)
def report_age_confirmation(self):
    """Report attempt to confirm age."""
    self.to_screen(u'Confirming age')
168 #Methods for following #608
169 #They set the correct value of the '_type' key
170 def video_result(self, video_info):
171 """Returns a video"""
172 video_info['_type'] = 'video'
174 def url_result(self, url, ie=None):
175 """Returns a url that points to a page that should be processed"""
176 #TODO: ie should be the class used for getting the info
177 video_info = {'_type': 'url',
181 def playlist_result(self, entries, playlist_id=None, playlist_title=None):
182 """Returns a playlist"""
183 video_info = {'_type': 'playlist',
186 video_info['id'] = playlist_id
188 video_info['title'] = playlist_title
192 class YoutubeIE(InfoExtractor):
193 """Information extractor for youtube.com."""
197 (?:https?://)? # http(s):// (optional)
198 (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
199 tube\.majestyc\.net/) # the various hostnames, with wildcard subdomains
200 (?:.*?\#/)? # handle anchor (#/) redirect urls
201 (?: # the various things that can precede the ID:
202 (?:(?:v|embed|e)/) # v/ or embed/ or e/
203 |(?: # or the v= param in all its forms
204 (?:watch(?:_popup)?(?:\.php)?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
205 (?:\?|\#!?) # the params delimiter ? or # or #!
206 (?:.*?&)? # any other preceding param (like /?s=tuff&v=xxxx)
209 )? # optional -> youtube.com/xxxx is OK
210 )? # all until now is optional -> you can pass the naked ID
211 ([0-9A-Za-z_-]+) # here is it! the YouTube video ID
212 (?(1).+)? # if we found the ID, everything can follow
214 _LANG_URL = r'https://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
215 _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
216 _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
217 _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
218 _NETRC_MACHINE = 'youtube'
219 # Listed in order of quality
220 _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
221 _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
222 _video_extensions = {
228 '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
234 _video_dimensions = {
def suitable(cls, url):
    """Receives a URL and returns True if suitable for this IE.

    URLs accepted by YoutubePlaylistIE are rejected first, so this
    extractor never claims them. Everything else is matched against
    _VALID_URL, which is written in verbose regex syntax and therefore
    needs the re.VERBOSE flag.
    """
    if YoutubePlaylistIE.suitable(url):
        return False
    return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
def report_lang(self):
    """Report attempt to set language."""
    self.to_screen(u'Setting language')
def report_login(self):
    """Report attempt to log in."""
    self.to_screen(u'Logging in')
def report_video_webpage_download(self, video_id):
    """Report attempt to download video webpage.

    video_id is interpolated into the message to identify the video.
    """
    self.to_screen(u'%s: Downloading video webpage' % video_id)
def report_video_info_webpage_download(self, video_id):
    """Report attempt to download video info webpage.

    video_id is interpolated into the message to identify the video.
    """
    self.to_screen(u'%s: Downloading video info webpage' % video_id)
def report_video_subtitles_download(self, video_id):
    """Report attempt to check which subtitles are available.

    video_id is interpolated into the message to identify the video.
    """
    self.to_screen(u'%s: Checking available subtitles' % video_id)
def report_video_subtitles_request(self, video_id, sub_lang, format):
    """Report attempt to download the subtitles of a video.

    The message names the video, the subtitle language and the subtitle
    format, the latter two joined as '<sub_lang>.<format>'.
    """
    self.to_screen(u'%s: Downloading video subtitles for %s.%s' % (video_id, sub_lang, format))
def report_video_subtitles_available(self, video_id, sub_lang_list):
    """Report available subtitles.

    sub_lang_list is a dictionary whose keys are the language codes;
    they are printed as a comma-separated list in iteration order.
    """
    # Iterating a dict yields its keys, so no intermediate list is needed.
    sub_lang = ",".join(sub_lang_list)
    self.to_screen(u'%s: Available subtitles for video: %s' % (video_id, sub_lang))
def report_information_extraction(self, video_id):
    """Report attempt to extract video information.

    video_id is interpolated into the message to identify the video.
    """
    self.to_screen(u'%s: Extracting video information' % video_id)
def report_unavailable_format(self, video_id, format):
    """Report that a format is not available for the video.

    Both the video id and the format are interpolated into the message.
    """
    self.to_screen(u'%s: Format %s not available' % (video_id, format))
def report_rtmp_download(self):
    """Indicate the download will use the RTMP protocol."""
    self.to_screen(u'RTMP download detected')
299 def _get_available_subtitles(self, video_id):
300 self.report_video_subtitles_download(video_id)
301 request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
303 sub_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
304 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
305 return (u'unable to download video subtitles: %s' % compat_str(err), None)
306 sub_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
307 sub_lang_list = dict((l[1], l[0]) for l in sub_lang_list)
308 if not sub_lang_list:
309 return (u'video doesn\'t have subtitles', None)
312 def _list_available_subtitles(self, video_id):
313 sub_lang_list = self._get_available_subtitles(video_id)
314 self.report_video_subtitles_available(video_id, sub_lang_list)
316 def _request_subtitle(self, sub_lang, sub_name, video_id, format):
319 (error_message, sub_lang, sub)
321 self.report_video_subtitles_request(video_id, sub_lang, format)
322 params = compat_urllib_parse.urlencode({
328 url = 'http://www.youtube.com/api/timedtext?' + params
330 sub = compat_urllib_request.urlopen(url).read().decode('utf-8')
331 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
332 return (u'unable to download video subtitles: %s' % compat_str(err), None, None)
334 return (u'Did not fetch video subtitles', None, None)
335 return (None, sub_lang, sub)
337 def _extract_subtitle(self, video_id):
339 Return a list with a tuple:
340 [(error_message, sub_lang, sub)]
342 sub_lang_list = self._get_available_subtitles(video_id)
343 sub_format = self._downloader.params.get('subtitlesformat')
344 if isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles
345 return [(sub_lang_list[0], None, None)]
346 if self._downloader.params.get('subtitleslang', False):
347 sub_lang = self._downloader.params.get('subtitleslang')
348 elif 'en' in sub_lang_list:
351 sub_lang = list(sub_lang_list.keys())[0]
352 if not sub_lang in sub_lang_list:
353 return [(u'no closed captions found in the specified language "%s"' % sub_lang, None, None)]
355 subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
358 def _extract_all_subtitles(self, video_id):
359 sub_lang_list = self._get_available_subtitles(video_id)
360 sub_format = self._downloader.params.get('subtitlesformat')
361 if isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles
362 return [(sub_lang_list[0], None, None)]
364 for sub_lang in sub_lang_list:
365 subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
366 subtitles.append(subtitle)
369 def _print_formats(self, formats):
370 print('Available formats:')
372 print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))
374 def _real_initialize(self):
375 if self._downloader is None:
380 downloader_params = self._downloader.params
382 # Attempt to use provided username and password or .netrc data
383 if downloader_params.get('username', None) is not None:
384 username = downloader_params['username']
385 password = downloader_params['password']
386 elif downloader_params.get('usenetrc', False):
388 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
393 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
394 except (IOError, netrc.NetrcParseError) as err:
395 self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
399 request = compat_urllib_request.Request(self._LANG_URL)
402 compat_urllib_request.urlopen(request).read()
403 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
404 self._downloader.report_warning(u'unable to set language: %s' % compat_str(err))
407 # No authentication to be performed
411 request = compat_urllib_request.Request(self._LOGIN_URL)
413 login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
414 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
415 self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err))
420 match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page)
422 galx = match.group(1)
424 match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page)
430 u'continue': u'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
434 u'PersistentCookie': u'yes',
436 u'bgresponse': u'js_disabled',
437 u'checkConnection': u'',
438 u'checkedDomains': u'youtube',
444 u'signIn': u'Sign in',
446 u'service': u'youtube',
450 # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
452 login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
453 login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
454 request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
457 login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
458 if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
459 self._downloader.report_warning(u'unable to log in: bad username or password')
461 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
462 self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
468 'action_confirm': 'Confirm',
470 request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
472 self.report_age_confirmation()
473 age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
474 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
475 raise ExtractorError(u'Unable to confirm age: %s' % compat_str(err))
477 def _extract_id(self, url):
478 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
480 raise ExtractorError(u'Invalid URL: %s' % url)
481 video_id = mobj.group(2)
484 def _real_extract(self, url):
485 # Extract original video URL from URL with redirection, like age verification, using next_url parameter
486 mobj = re.search(self._NEXT_URL_RE, url)
488 url = 'https://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
489 video_id = self._extract_id(url)
492 self.report_video_webpage_download(video_id)
493 url = 'https://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
494 request = compat_urllib_request.Request(url)
496 video_webpage_bytes = compat_urllib_request.urlopen(request).read()
497 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
498 raise ExtractorError(u'Unable to download video webpage: %s' % compat_str(err))
500 video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')
502 # Attempt to extract SWF player URL
503 mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
505 player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
510 self.report_video_info_webpage_download(video_id)
511 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
512 video_info_url = ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
513 % (video_id, el_type))
514 video_info_webpage = self._download_webpage(video_info_url, video_id,
516 errnote='unable to download video info webpage')
517 video_info = compat_parse_qs(video_info_webpage)
518 if 'token' in video_info:
520 if 'token' not in video_info:
521 if 'reason' in video_info:
522 raise ExtractorError(u'YouTube said: %s' % video_info['reason'][0])
524 raise ExtractorError(u'"token" parameter not in video info for unknown reason')
526 # Check for "rental" videos
527 if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
528 raise ExtractorError(u'"rental" videos not supported')
530 # Start extracting information
531 self.report_information_extraction(video_id)
534 if 'author' not in video_info:
535 raise ExtractorError(u'Unable to extract uploader name')
536 video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])
539 video_uploader_id = None
540 mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
542 video_uploader_id = mobj.group(1)
544 self._downloader.report_warning(u'unable to extract uploader nickname')
547 if 'title' not in video_info:
548 raise ExtractorError(u'Unable to extract video title')
549 video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])
552 if 'thumbnail_url' not in video_info:
553 self._downloader.report_warning(u'unable to extract video thumbnail')
555 else: # don't panic if we can't find it
556 video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])
560 mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
562 upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
563 upload_date = unified_strdate(upload_date)
566 video_description = get_element_by_id("eow-description", video_webpage)
567 if video_description:
568 video_description = clean_html(video_description)
570 fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
572 video_description = unescapeHTML(fd_mobj.group(1))
574 video_description = u''
577 video_subtitles = None
579 if self._downloader.params.get('writesubtitles', False):
580 video_subtitles = self._extract_subtitle(video_id)
582 (sub_error, sub_lang, sub) = video_subtitles[0]
584 self._downloader.report_error(sub_error)
586 if self._downloader.params.get('allsubtitles', False):
587 video_subtitles = self._extract_all_subtitles(video_id)
588 for video_subtitle in video_subtitles:
589 (sub_error, sub_lang, sub) = video_subtitle
591 self._downloader.report_error(sub_error)
593 if self._downloader.params.get('listsubtitles', False):
594 sub_lang_list = self._list_available_subtitles(video_id)
597 if 'length_seconds' not in video_info:
598 self._downloader.report_warning(u'unable to extract video duration')
601 video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])
604 video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])
606 # Decide which formats to download
607 req_format = self._downloader.params.get('format', None)
609 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
610 self.report_rtmp_download()
611 video_url_list = [(None, video_info['conn'][0])]
612 elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
614 for url_data_str in video_info['url_encoded_fmt_stream_map'][0].split(','):
615 url_data = compat_parse_qs(url_data_str)
616 if 'itag' in url_data and 'url' in url_data:
617 url = url_data['url'][0] + '&signature=' + url_data['sig'][0]
618 if not 'ratebypass' in url: url += '&ratebypass=yes'
619 url_map[url_data['itag'][0]] = url
621 format_limit = self._downloader.params.get('format_limit', None)
622 available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
623 if format_limit is not None and format_limit in available_formats:
624 format_list = available_formats[available_formats.index(format_limit):]
626 format_list = available_formats
627 existing_formats = [x for x in format_list if x in url_map]
628 if len(existing_formats) == 0:
629 raise ExtractorError(u'no known formats available for video')
630 if self._downloader.params.get('listformats', None):
631 self._print_formats(existing_formats)
633 if req_format is None or req_format == 'best':
634 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
635 elif req_format == 'worst':
636 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
637 elif req_format in ('-1', 'all'):
638 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
640 # Specific formats. We pick the first in a slash-delimited sequence.
641 # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
642 req_formats = req_format.split('/')
643 video_url_list = None
644 for rf in req_formats:
646 video_url_list = [(rf, url_map[rf])]
648 if video_url_list is None:
649 raise ExtractorError(u'requested format not available')
651 raise ExtractorError(u'no conn or url_encoded_fmt_stream_map information found in video info')
654 for format_param, video_real_url in video_url_list:
656 video_extension = self._video_extensions.get(format_param, 'flv')
658 video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
659 self._video_dimensions.get(format_param, '???'))
663 'url': video_real_url,
664 'uploader': video_uploader,
665 'uploader_id': video_uploader_id,
666 'upload_date': upload_date,
667 'title': video_title,
668 'ext': video_extension,
669 'format': video_format,
670 'thumbnail': video_thumbnail,
671 'description': video_description,
672 'player_url': player_url,
673 'subtitles': video_subtitles,
674 'duration': video_duration
679 class MetacafeIE(InfoExtractor):
680 """Information Extractor for metacafe.com."""
682 _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
683 _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
684 _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
685 IE_NAME = u'metacafe'
def report_disclaimer(self):
    """Report disclaimer retrieval."""
    self.to_screen(u'Retrieving disclaimer')
691 def _real_initialize(self):
692 # Retrieve disclaimer
693 request = compat_urllib_request.Request(self._DISCLAIMER)
695 self.report_disclaimer()
696 disclaimer = compat_urllib_request.urlopen(request).read()
697 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
698 raise ExtractorError(u'Unable to retrieve disclaimer: %s' % compat_str(err))
703 'submit': "Continue - I'm over 18",
705 request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
707 self.report_age_confirmation()
708 disclaimer = compat_urllib_request.urlopen(request).read()
709 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
710 raise ExtractorError(u'Unable to confirm age: %s' % compat_str(err))
712 def _real_extract(self, url):
713 # Extract id and simplified title from URL
714 mobj = re.match(self._VALID_URL, url)
716 raise ExtractorError(u'Invalid URL: %s' % url)
718 video_id = mobj.group(1)
720 # Check if video comes from YouTube
721 mobj2 = re.match(r'^yt-(.*)$', video_id)
722 if mobj2 is not None:
723 return [self.url_result('http://www.youtube.com/watch?v=%s' % mobj2.group(1), 'Youtube')]
725 # Retrieve video webpage to extract further information
726 webpage = self._download_webpage('http://www.metacafe.com/watch/%s/' % video_id, video_id)
728 # Extract URL, uploader and title from webpage
729 self.report_extraction(video_id)
730 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
732 mediaURL = compat_urllib_parse.unquote(mobj.group(1))
733 video_extension = mediaURL[-3:]
735 # Extract gdaKey if available
736 mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
740 gdaKey = mobj.group(1)
741 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
743 mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
745 raise ExtractorError(u'Unable to extract media URL')
746 vardict = compat_parse_qs(mobj.group(1))
747 if 'mediaData' not in vardict:
748 raise ExtractorError(u'Unable to extract media URL')
749 mobj = re.search(r'"mediaURL":"(?P<mediaURL>http.*?)",(.*?)"key":"(?P<key>.*?)"', vardict['mediaData'][0])
751 raise ExtractorError(u'Unable to extract media URL')
752 mediaURL = mobj.group('mediaURL').replace('\\/', '/')
753 video_extension = mediaURL[-3:]
754 video_url = '%s?__gda__=%s' % (mediaURL, mobj.group('key'))
756 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
758 raise ExtractorError(u'Unable to extract title')
759 video_title = mobj.group(1).decode('utf-8')
761 mobj = re.search(r'submitter=(.*?);', webpage)
763 raise ExtractorError(u'Unable to extract uploader nickname')
764 video_uploader = mobj.group(1)
767 'id': video_id.decode('utf-8'),
768 'url': video_url.decode('utf-8'),
769 'uploader': video_uploader.decode('utf-8'),
771 'title': video_title,
772 'ext': video_extension.decode('utf-8'),
775 class DailymotionIE(InfoExtractor):
776 """Information Extractor for Dailymotion"""
778 _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
779 IE_NAME = u'dailymotion'
781 def _real_extract(self, url):
782 # Extract id and simplified title from URL
783 mobj = re.match(self._VALID_URL, url)
785 raise ExtractorError(u'Invalid URL: %s' % url)
787 video_id = mobj.group(1).split('_')[0].split('?')[0]
789 video_extension = 'mp4'
791 # Retrieve video webpage to extract further information
792 request = compat_urllib_request.Request(url)
793 request.add_header('Cookie', 'family_filter=off')
794 webpage = self._download_webpage(request, video_id)
796 # Extract URL, uploader and title from webpage
797 self.report_extraction(video_id)
798 mobj = re.search(r'\s*var flashvars = (.*)', webpage)
800 raise ExtractorError(u'Unable to extract media URL')
801 flashvars = compat_urllib_parse.unquote(mobj.group(1))
803 for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
806 self.to_screen(u'Using %s' % key)
809 raise ExtractorError(u'Unable to extract video URL')
811 mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
813 raise ExtractorError(u'Unable to extract video URL')
815 video_url = compat_urllib_parse.unquote(mobj.group(1)).replace('\\/', '/')
817 # TODO: support choosing qualities
819 mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
821 raise ExtractorError(u'Unable to extract title')
822 video_title = unescapeHTML(mobj.group('title'))
824 video_uploader = None
825 mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
827 # looking for the official user
828 mobj_official = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
829 if mobj_official is None:
830 self._downloader.report_warning(u'unable to extract uploader nickname')
832 video_uploader = mobj_official.group(1)
834 video_uploader = mobj.group(1)
836 video_upload_date = None
837 mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
839 video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)
844 'uploader': video_uploader,
845 'upload_date': video_upload_date,
846 'title': video_title,
847 'ext': video_extension,
851 class PhotobucketIE(InfoExtractor):
852 """Information extractor for photobucket.com."""
854 # TODO: the original _VALID_URL was:
855 # r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
856 # Check if it's necessary to keep the old extraction process
857 _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*(([\?\&]current=)|_)(?P<id>.*)\.(?P<ext>(flv)|(mp4))'
858 IE_NAME = u'photobucket'
860 def _real_extract(self, url):
861 # Extract id from URL
862 mobj = re.match(self._VALID_URL, url)
864 raise ExtractorError(u'Invalid URL: %s' % url)
866 video_id = mobj.group('id')
868 video_extension = mobj.group('ext')
870 # Retrieve video webpage to extract further information
871 webpage = self._download_webpage(url, video_id)
873 # Extract URL, uploader, and title from webpage
874 self.report_extraction(video_id)
875 # We try first by looking the javascript code:
876 mobj = re.search(r'Pb\.Data\.Shared\.put\(Pb\.Data\.Shared\.MEDIA, (?P<json>.*?)\);', webpage)
878 info = json.loads(mobj.group('json'))
881 'url': info[u'downloadUrl'],
882 'uploader': info[u'username'],
883 'upload_date': datetime.date.fromtimestamp(info[u'creationDate']).strftime('%Y%m%d'),
884 'title': info[u'title'],
885 'ext': video_extension,
886 'thumbnail': info[u'thumbUrl'],
889 # We try looking in other parts of the webpage
890 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
892 raise ExtractorError(u'Unable to extract media URL')
893 mediaURL = compat_urllib_parse.unquote(mobj.group(1))
897 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
899 raise ExtractorError(u'Unable to extract title')
900 video_title = mobj.group(1).decode('utf-8')
902 video_uploader = mobj.group(2).decode('utf-8')
905 'id': video_id.decode('utf-8'),
906 'url': video_url.decode('utf-8'),
907 'uploader': video_uploader,
909 'title': video_title,
910 'ext': video_extension.decode('utf-8'),
914 class YahooIE(InfoExtractor):
915 """Information extractor for screen.yahoo.com."""
916 _VALID_URL = r'http://screen\.yahoo\.com/.*?-(?P<id>\d*?)\.html'
918 def _real_extract(self, url):
919 mobj = re.match(self._VALID_URL, url)
921 raise ExtractorError(u'Invalid URL: %s' % url)
922 video_id = mobj.group('id')
923 webpage = self._download_webpage(url, video_id)
924 m_id = re.search(r'YUI\.namespace\("Media"\)\.CONTENT_ID = "(?P<new_id>.+?)";', webpage)
927 # TODO: Check which url parameters are required
928 info_url = 'http://cosmos.bcst.yahoo.com/rest/v2/pops;lmsoverride=1;outputformat=mrss;cb=974419660;id=%s;rd=news.yahoo.com;datacontext=mdb;lg=KCa2IihxG3qE60vQ7HtyUy' % video_id
929 webpage = self._download_webpage(info_url, video_id, u'Downloading info webpage')
930 info_re = r'''<title><!\[CDATA\[(?P<title>.*?)\]\]></title>.*
931 <description><!\[CDATA\[(?P<description>.*?)\]\]></description>.*
932 <media:pubStart><!\[CDATA\[(?P<date>.*?)\ .*\]\]></media:pubStart>.*
933 <media:content\ medium="image"\ url="(?P<thumb>.*?)"\ name="LARGETHUMB"
935 self.report_extraction(video_id)
936 m_info = re.search(info_re, webpage, re.VERBOSE|re.DOTALL)
938 raise ExtractorError(u'Unable to extract video info')
939 video_title = m_info.group('title')
940 video_description = m_info.group('description')
941 video_thumb = m_info.group('thumb')
942 video_date = m_info.group('date')
943 video_date = datetime.datetime.strptime(video_date, '%m/%d/%Y').strftime('%Y%m%d')
945 # TODO: Find a way to get mp4 videos
946 rest_url = 'http://cosmos.bcst.yahoo.com/rest/v2/pops;element=stream;outputformat=mrss;id=%s;lmsoverride=1;bw=375;dynamicstream=1;cb=83521105;tech=flv,mp4;rd=news.yahoo.com;datacontext=mdb;lg=KCa2IihxG3qE60vQ7HtyUy' % video_id
947 webpage = self._download_webpage(rest_url, video_id, u'Downloading video url webpage')
948 m_rest = re.search(r'<media:content url="(?P<url>.*?)" path="(?P<path>.*?)"', webpage)
949 video_url = m_rest.group('url')
950 video_path = m_rest.group('path')
952 raise ExtractorError(u'Unable to extract video url')
954 else: # We have to use a different method if another id is defined
955 long_id = m_id.group('new_id')
956 info_url = 'http://video.query.yahoo.com/v1/public/yql?q=SELECT%20*%20FROM%20yahoo.media.video.streams%20WHERE%20id%3D%22' + long_id + '%22%20AND%20format%3D%22mp4%2Cflv%22%20AND%20protocol%3D%22rtmp%2Chttp%22%20AND%20plrs%3D%2286Gj0vCaSzV_Iuf6hNylf2%22%20AND%20acctid%3D%22389%22%20AND%20plidl%3D%22%22%20AND%20pspid%3D%22792700001%22%20AND%20offnetwork%3D%22false%22%20AND%20site%3D%22ivy%22%20AND%20lang%3D%22en-US%22%20AND%20region%3D%22US%22%20AND%20override%3D%22none%22%3B&env=prod&format=json&callback=YUI.Env.JSONP.yui_3_8_1_1_1368368376830_335'
957 webpage = self._download_webpage(info_url, video_id, u'Downloading info json')
958 json_str = re.search(r'YUI.Env.JSONP.yui.*?\((.*?)\);', webpage).group(1)
959 info = json.loads(json_str)
960 res = info[u'query'][u'results'][u'mediaObj'][0]
961 stream = res[u'streams'][0]
962 video_path = stream[u'path']
963 video_url = stream[u'host']
965 video_title = meta[u'title']
966 video_description = meta[u'description']
967 video_thumb = meta[u'thumbnail']
968 video_date = None # I can't find it
973 'play_path': video_path,
975 'description': video_description,
976 'thumbnail': video_thumb,
977 'upload_date': video_date,
class VimeoIE(InfoExtractor):
    """Information extractor for vimeo.com."""

    # _VALID_URL matches Vimeo URLs
    _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|player)\.)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)'
    IE_NAME = u'vimeo'

    def _real_extract(self, url, new_video=True):
        """Extract the information dictionary for one Vimeo video.

        ``new_video`` is not read by this method; it is kept in the
        signature so existing callers keep working.

        Returns a one-element list holding the information dictionary.
        Raises ExtractorError when the URL does not match, when the player
        configuration cannot be located or parsed, or when none of the
        known codecs is offered.
        """
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')
        if not mobj.group('proto'):
            url = 'https://' + url
        if mobj.group('direct_link'):
            # Direct HLS redirect links are replaced by the canonical page
            url = 'https://vimeo.com/' + video_id

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url, None, std_headers)
        webpage = self._download_webpage(request, video_id)

        # First the information common to all extractors is gathered, then
        # the Vimeo specific bits.
        self.report_extraction(video_id)

        # Extract the config JSON embedded in the page.  Only the failures
        # this snippet can produce are caught: a missing marker (IndexError
        # from the split) or malformed JSON (ValueError).
        try:
            config = webpage.split(' = {config:')[1].split(',assets:')[0]
            config = json.loads(config)
        except (IndexError, ValueError):
            if re.search('The creator of this video has not given you permission to embed it on this domain.', webpage):
                raise ExtractorError(u'The author has restricted the access to this video, try with the "--referer" option')
            raise ExtractorError(u'Unable to extract info section')

        video_config = config["video"]

        # Extract title
        video_title = video_config["title"]

        # Extract uploader and uploader_id (last component of the owner URL)
        video_uploader = video_config["owner"]["name"]
        video_uploader_id = video_config["owner"]["url"].split('/')[-1]

        # Extract video thumbnail
        video_thumbnail = video_config["thumbnail"]

        # Extract video description
        video_description = get_element_by_attribute("itemprop", "description", webpage)
        if video_description:
            video_description = clean_html(video_description)
        else:
            video_description = u''

        # Extract upload date (YYYYMMDD) from the dateCreated meta tag
        video_upload_date = None
        mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
        if mobj is not None:
            video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)

        # Vimeo specific: extract request signature and timestamp
        sig = config['request']['signature']
        timestamp = config['request']['timestamp']

        # Vimeo specific: extract video codec and quality information
        # First consider quality, then codecs, then take everything
        # TODO bind to format param
        codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
        files = {'hd': [], 'sd': [], 'other': []}
        available = video_config["files"]
        for codec_name, codec_extension in codecs:
            if codec_name not in available:
                continue
            qualities = available[codec_name]
            if 'hd' in qualities:
                files['hd'].append((codec_name, codec_extension, 'hd'))
            elif 'sd' in qualities:
                files['sd'].append((codec_name, codec_extension, 'sd'))
            else:
                files['other'].append((codec_name, codec_extension, qualities[0]))

        # Pick the first candidate of the best quality bucket that has one
        for quality in ('hd', 'sd', 'other'):
            if files[quality]:
                video_codec, video_extension, video_quality = files[quality][0]
                self.to_screen(u'%s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
                break
        else:
            raise ExtractorError(u'No known codec found')

        video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
                    % (video_id, sig, timestamp, video_quality, video_codec.upper())

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': video_uploader,
            'uploader_id': video_uploader_id,
            'upload_date': video_upload_date,
            'title': video_title,
            'ext': video_extension,
            'thumbnail': video_thumbnail,
            'description': video_description,
        }]
class ArteTvIE(InfoExtractor):
    """arte.tv information extractor."""

    _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
    _LIVE_URL = r'index-[0-9]+\.html$'

    IE_NAME = u'arte.tv'

    def fetch_webpage(self, url):
        """Download ``url`` and return the raw response body.

        Raises ExtractorError on network failures or when the URL is
        rejected with a ValueError.
        """
        req = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(url)
            body = compat_urllib_request.urlopen(req).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to retrieve video webpage: %s' % compat_str(err))
        except ValueError as err:
            raise ExtractorError(u'Invalid URL: %s' % url)
        return body

    def grep_webpage(self, url, regex, regexFlags, matchTuples):
        """Download ``url``, search it with ``regex`` and collect groups.

        ``matchTuples`` is an iterable of ``(group_index, key, error)``
        triples.  Each matched group is stored under ``key`` in the
        returned dictionary; a group that did not participate in the match
        raises ExtractorError(error).  No match at all raises
        ExtractorError as well.
        """
        content = self.fetch_webpage(url)
        found = re.search(regex, content, regexFlags)
        collected = {}

        if found is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        for (index, key, message) in matchTuples:
            value = found.group(index)
            if value is None:
                raise ExtractorError(message)
            collected[key] = value

        return collected

    def extractLiveStream(self, url):
        """Walk the live-stream pages for ``url``.

        NOTE(review): the assembled stream URL is only kept in a local
        variable and nothing is returned, so callers get ``None``.
        """
        # The language is the fourth path component counted from the end
        video_lang = url.split('/')[-4]
        info = self.grep_webpage(
            url,
            r'src="(.*?/videothek_js.*?\.js)',
            0,
            [
                (1, 'url', u'Invalid URL: %s' % url)
            ]
        )
        http_host = url.split('/')[2]
        next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
        info = self.grep_webpage(
            next_url,
            r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
                '(http://.*?\.swf).*?' +
                '(rtmp://.*?)\'',
            re.DOTALL,
            [
                (1, 'path', u'could not extract video path: %s' % url),
                (2, 'player', u'could not extract video player: %s' % url),
                (3, 'url', u'could not extract video url: %s' % url)
            ]
        )
        video_url = u'%s/%s' % (info.get('url'), info.get('path'))

    def extractPlus7Stream(self, url):
        """Follow the three-step reference chain and return the info dict.

        The page names a reference file, which names a per-language video
        descriptor, which finally carries id, title, date and the HD URL.
        """
        # The language is the third path component counted from the end
        video_lang = url.split('/')[-3]

        # Step 1: locate the reference file in the player parameters
        info = self.grep_webpage(
            url,
            r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
            0,
            [
                (1, 'url', u'Invalid URL: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))

        # Step 2: pick the descriptor for the requested language
        info = self.grep_webpage(
            next_url,
            r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
            0,
            [
                (1, 'url', u'Could not find <video> tag: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))

        # Step 3: read the actual metadata
        info = self.grep_webpage(
            next_url,
            r'<video id="(.*?)".*?>.*?' +
                '<name>(.*?)</name>.*?' +
                '<dateVideo>(.*?)</dateVideo>.*?' +
                '<url quality="hd">(.*?)</url>',
            re.DOTALL,
            [
                (1, 'id', u'could not extract video id: %s' % url),
                (2, 'title', u'could not extract video title: %s' % url),
                (3, 'date', u'could not extract video date: %s' % url),
                (4, 'url', u'could not extract video url: %s' % url)
            ]
        )

        return {
            'id': info.get('id'),
            'url': compat_urllib_parse.unquote(info.get('url')),
            'uploader': u'arte.tv',
            'upload_date': unified_strdate(info.get('date')),
            'title': info.get('title').decode('utf-8'),
            'ext': u'mp4',
            'format': u'NA',
            'player_url': None,
        }

    def _real_extract(self, url):
        """Dispatch to the live or the Plus7 extraction path.

        Returns a one-element list for Plus7 videos; the live branch
        returns ``None`` (see extractLiveStream).
        """
        video_id = url.split('/')[-1]
        self.report_extraction(video_id)

        is_live = re.search(self._LIVE_URL, video_id) is not None
        if is_live:
            self.extractLiveStream(url)
            return

        return [self.extractPlus7Stream(url)]
class GenericIE(InfoExtractor):
    """Generic last-resort information extractor."""

    _VALID_URL = r'.*'
    IE_NAME = u'generic'

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        if not self._downloader.params.get('test', False):
            self._downloader.report_warning(u'Falling back on generic information extractor.')
        super(GenericIE, self).report_download_webpage(video_id)

    def report_following_redirect(self, new_url):
        """Report that a redirect to ``new_url`` is being followed."""
        self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)

    def _test_redirect(self, url):
        """Check if it is a redirect, like url shorteners, in case return the new url.

        Issues a HEAD request for ``url`` and returns the final URL when it
        differs from the requested one, ``False`` otherwise.
        """
        class HeadRequest(compat_urllib_request.Request):
            def get_method(self):
                return "HEAD"

        class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
            """
            Subclass the HTTPRedirectHandler to make it use our
            HeadRequest also on the redirected URL
            """
            def redirect_request(self, req, fp, code, msg, headers, newurl):
                if code in (301, 302, 303, 307):
                    newurl = newurl.replace(' ', '%20')
                    # Body-related headers make no sense on the new request
                    newheaders = dict((k, v) for k, v in req.headers.items()
                                      if k.lower() not in ("content-length", "content-type"))
                    return HeadRequest(newurl,
                                       headers=newheaders,
                                       origin_req_host=req.get_origin_req_host(),
                                       unverifiable=True)
                else:
                    raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)

        class HTTPMethodFallback(compat_urllib_request.BaseHandler):
            """
            Fallback to GET if HEAD is not allowed (405 HTTP error)
            """
            def http_error_405(self, req, fp, code, msg, headers):
                fp.read()
                fp.close()

                newheaders = dict((k, v) for k, v in req.headers.items()
                                  if k.lower() not in ("content-length", "content-type"))
                return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
                                                                      headers=newheaders,
                                                                      origin_req_host=req.get_origin_req_host(),
                                                                      unverifiable=True))

        # Build our opener
        opener = compat_urllib_request.OpenerDirector()
        for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
                        HTTPMethodFallback, HEADRedirectHandler,
                        compat_urllib_request.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
            opener.add_handler(handler())

        response = opener.open(HeadRequest(url))
        new_url = response.geturl()

        if url == new_url:
            return False

        self.report_following_redirect(new_url)
        return new_url

    def _real_extract(self, url):
        """Guess a media URL, title and uploader from an arbitrary page.

        Returns a one-element list: either a URL result for a redirect
        target or an information dictionary.  Raises ExtractorError when no
        media URL, title or uploader (domain name) can be found.
        """
        new_url = self._test_redirect(url)
        if new_url:
            return [self.url_result(new_url)]

        video_id = url.split('/')[-1]
        try:
            webpage = self._download_webpage(url, video_id)
        except ValueError:
            # since this is the last-resort InfoExtractor, if
            # this error is thrown, it'll be thrown here
            raise ExtractorError(u'Invalid URL: %s' % url)

        self.report_extraction(video_id)
        # Start with something easy: JW Player in SWFObject
        mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
        if mobj is None:
            # Broaden the search a little bit
            mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
        if mobj is None:
            # Broaden the search a little bit: JWPlayer JS loader
            mobj = re.search(r'[^A-Za-z0-9]?file:\s*["\'](http[^\'"&]*)', webpage)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # It's possible that one of the regexes
        # matched, but returned an empty group:
        if mobj.group(1) is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_url = compat_urllib_parse.unquote(mobj.group(1))
        video_id = os.path.basename(video_url)

        # Split the basename into id and extension (without the dot)
        video_extension = os.path.splitext(video_id)[1][1:]
        video_id = os.path.splitext(video_id)[0]

        # it's tempting to parse this further, but you would
        # have to take into account all the variations like
        #   Video Title - Site Name
        #   Site Name | Video Title
        #   Video Title - Tagline | Site Name
        # and so on and so forth; it's just not practical
        mobj = re.search(r'<title>(.*)</title>', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract title')
        video_title = mobj.group(1)

        # video uploader is domain name
        mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
        if mobj is None:
            raise ExtractorError(u'Unable to extract uploader')
        video_uploader = mobj.group(1)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': video_uploader,
            'upload_date': None,
            'title': video_title,
            'ext': video_extension,
        }]
class YoutubeSearchIE(InfoExtractor):
    """Information Extractor for YouTube search queries."""
    _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _max_youtube_results = 1000
    IE_NAME = u'youtube:search'

    def report_download_page(self, query, pagenum):
        """Report attempt to download search page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse a ``ytsearch[N|all]:terms`` query and return its results.

        An empty prefix asks for one result, ``all`` for
        ``_max_youtube_results`` and a number for that many (capped, with a
        warning).  Raises ExtractorError for a malformed query or a
        non-positive count.
        """
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            raise ExtractorError(u'Invalid search query "%s"' % query)

        # Split on the first colon only: the search terms may contain colons
        prefix, query = query.split(':', 1)
        prefix = prefix[8:]
        query = query.encode('utf-8')
        if prefix == '':
            return self._get_n_results(query, 1)
        if prefix == 'all':
            return self._get_n_results(query, self._max_youtube_results)

        # Only the integer conversion is guarded, so that a ValueError
        # raised while downloading results is not mistaken for a bad prefix.
        try:
            n = int(prefix)
        except ValueError:  # parsing prefix as integer fails
            return self._get_n_results(query, 1)

        if n <= 0:
            raise ExtractorError(u'Invalid download number %s for query "%s"' % (n, query))
        elif n > self._max_youtube_results:
            self._downloader.report_warning(u'ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
            n = self._max_youtube_results
        return self._get_n_results(query, n)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""

        video_ids = []
        pagenum = 0
        limit = n

        # The API is queried in pages of 50 entries
        while (50 * pagenum) < limit:
            self.report_download_page(query, pagenum + 1)
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50 * pagenum) + 1)
            request = compat_urllib_request.Request(result_url)
            try:
                data = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to download API page: %s' % compat_str(err))
            api_response = json.loads(data)['data']

            if 'items' not in api_response:
                raise ExtractorError(u'[youtube] No video results')

            video_ids.extend(video['id'] for video in api_response['items'])

            # Never ask for more than the API says exists
            limit = min(n, api_response['totalItems'])
            pagenum += 1

        # The last page may have pushed the list past the requested count
        video_ids = video_ids[:n]
        return [self.url_result('http://www.youtube.com/watch?v=%s' % video_id, 'Youtube')
                for video_id in video_ids]
class GoogleSearchIE(InfoExtractor):
    """Information Extractor for Google Video search queries."""
    _VALID_URL = r'gvsearch(?P<prefix>|\d+|all):(?P<query>[\s\S]+)'
    _MORE_PAGES_INDICATOR = r'id="pnnext" class="pn"'
    _max_google_results = 1000
    IE_NAME = u'video.google:search'

    def _real_extract(self, query):
        """Parse a ``gvsearch[N|all]:terms`` query and return its results.

        An empty prefix asks for one result, ``all`` for
        ``_max_google_results`` and a number for that many (capped, with a
        warning).  Raises ExtractorError for a malformed query or a
        non-positive count.
        """
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            raise ExtractorError(u'Invalid search query "%s"' % query)

        prefix = mobj.group('prefix')
        query = mobj.group('query')
        if prefix == '':
            return self._get_n_results(query, 1)
        elif prefix == 'all':
            return self._get_n_results(query, self._max_google_results)
        else:
            # The pattern only admits digits here, so int() cannot fail
            n = int(prefix)
            if n <= 0:
                raise ExtractorError(u'invalid download number %s for query "%s"' % (n, query))
            elif n > self._max_google_results:
                self._downloader.report_warning(u'gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
                n = self._max_google_results
            return self._get_n_results(query, n)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query

        Returns a playlist dictionary whose ``entries`` hold at most ``n``
        URL results.
        """

        res = {
            '_type': 'playlist',
            'id': query,
            'entries': []
        }

        # Start at offset 0 so the first page of results is not skipped
        for pagenum in itertools.count(0):
            result_url = u'http://www.google.com/search?tbm=vid&q=%s&start=%s&hl=en' % (compat_urllib_parse.quote_plus(query), pagenum * 10)

            webpage = self._download_webpage(result_url, u'gvsearch:' + query,
                                             note='Downloading result page ' + str(pagenum + 1))

            for mobj in re.finditer(r'<h3 class="r"><a href="([^"]+)"', webpage):
                e = {
                    '_type': 'url',
                    'url': mobj.group(1)
                }
                res['entries'].append(e)

            if (len(res['entries']) >= n) or not re.search(self._MORE_PAGES_INDICATOR, webpage):
                # A page can carry more hits than were still wanted
                res['entries'] = res['entries'][:n]
                return res
class YahooSearchIE(InfoExtractor):
    """Information Extractor for Yahoo! Video search queries."""

    _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'

    _max_yahoo_results = 1000
    IE_NAME = u'screen.yahoo:search'

    def _real_extract(self, query):
        """Parse a ``yvsearch[N|all]:terms`` query and return its results.

        An empty prefix asks for one result, ``all`` for
        ``_max_yahoo_results`` and a number for that many (capped, with a
        warning).  Raises ExtractorError for a malformed query or a
        non-positive count.
        """
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            raise ExtractorError(u'Invalid search query "%s"' % query)

        # Split on the first colon only: the search terms may contain colons
        prefix, query = query.split(':', 1)
        prefix = prefix[8:]
        query = query.encode('utf-8')
        if prefix == '':
            return self._get_n_results(query, 1)
        if prefix == 'all':
            return self._get_n_results(query, self._max_yahoo_results)

        # Only the integer conversion is guarded, so that a ValueError
        # raised while downloading results is not mistaken for a bad prefix.
        try:
            n = int(prefix)
        except ValueError:  # parsing prefix as integer fails
            return self._get_n_results(query, 1)

        if n <= 0:
            raise ExtractorError(u'Invalid download number %s for query "%s"' % (n, query))
        elif n > self._max_yahoo_results:
            self._downloader.report_warning(u'yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
            n = self._max_yahoo_results
        return self._get_n_results(query, n)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query

        Returns a playlist dictionary whose ``entries`` hold at most ``n``
        URL results.
        """

        res = {
            '_type': 'playlist',
            'id': query,
            'entries': []
        }
        entries = res['entries']

        # Results are requested in steps of 30 via the ``b`` offset
        for pagenum in itertools.count(0):
            result_url = u'http://video.search.yahoo.com/search/?p=%s&fr=screen&o=js&gs=0&b=%d' % (compat_urllib_parse.quote_plus(query), pagenum * 30)
            webpage = self._download_webpage(result_url, query,
                                             note='Downloading results page ' + str(pagenum + 1))
            info = json.loads(webpage)
            m = info[u'm']
            results = info[u'results']

            for r in results:
                if len(entries) >= n:
                    break
                mobj = re.search(r'(?P<url>screen\.yahoo\.com/.*?-\d*?\.html)"', r)
                if mobj is None:
                    # Result without a recognisable video link
                    continue
                entries.append(self.url_result('http://' + mobj.group('url'), 'Yahoo'))

            # Stop once enough entries were collected or the service
            # reports that the last result has been reached.  Counting
            # entries (rather than reusing the loop index) also copes with
            # an empty result page.
            if (len(entries) >= n) or (m[u'last'] >= (m[u'total'] - 1)):
                break

        return res
class YoutubePlaylistIE(InfoExtractor):
    """Information Extractor for YouTube playlists."""

    # Verbose pattern: group 1 is the id taken from a playlist URL,
    # group 2 a bare playlist id.
    _VALID_URL = r"""(?:
                        (?:https?://)?
                        (?:\w+\.)?
                        youtube\.com/
                        (?:
                           (?:course|view_play_list|my_playlists|artist|playlist|watch)
                           \? (?:.*?&)*? (?:p|a|list)=
                        |  p/
                        )
                        ((?:PL|EC|UU)?[0-9A-Za-z-_]{10,})
                        .*
                     |
                        ((?:PL|EC|UU)[0-9A-Za-z-_]{10,})
                     )"""
    _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json'
    _MAX_RESULTS = 50
    IE_NAME = u'youtube:playlist'

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        matched = re.match(cls._VALID_URL, url, re.VERBOSE)
        return matched is not None

    def _real_extract(self, url):
        """Page through the playlist feed and return a playlist result.

        Returns a one-element list holding the playlist built from every
        feed entry that has a ``content`` field, ordered by position.
        Raises ExtractorError for an unmatched URL or an unusable API
        response.
        """
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # Download playlist videos from API
        playlist_id = mobj.group(1) or mobj.group(2)
        page_num = 1
        videos = []

        while True:
            start_index = self._MAX_RESULTS * (page_num - 1) + 1
            url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, start_index)
            page = self._download_webpage(url, playlist_id, u'Downloading page #%s' % page_num)

            try:
                response = json.loads(page)
            except ValueError as err:
                raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))

            if 'feed' not in response:
                raise ExtractorError(u'Got a malformed response from YouTube API')
            feed = response['feed']
            playlist_title = feed['title']['$t']
            if 'entry' not in feed:
                # Number of videos is a multiple of self._MAX_RESULTS
                break

            page_entries = feed['entry']
            # Keep (position, url) pairs so the list can be ordered later
            for entry in page_entries:
                if 'content' in entry:
                    videos.append((entry['yt$position']['$t'], entry['content']['src']))

            if len(page_entries) < self._MAX_RESULTS:
                break
            page_num += 1

        videos = [pair[1] for pair in sorted(videos)]

        url_results = [self.url_result(url, 'Youtube') for url in videos]
        return [self.playlist_result(url_results, playlist_id, playlist_title)]
class YoutubeChannelIE(InfoExtractor):
    """Information Extractor for YouTube channels."""

    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
    _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
    _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
    _MORE_PAGES_URL = 'http://www.youtube.com/channel_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
    IE_NAME = u'youtube:channel'

    def extract_videos_from_page(self, page):
        """Return the video ids linked from ``page``, first occurrence order, no repeats."""
        found = []
        for match in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
            candidate = match.group(1)
            if candidate in found:
                continue
            found.append(candidate)
        return found

    def _real_extract(self, url):
        """Collect every video of a channel and return it as a playlist.

        Returns a one-element list with the playlist result; raises
        ExtractorError when the URL does not match.
        """
        # Extract channel id
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # Download channel page
        channel_id = mobj.group(1)
        video_ids = []
        pagenum = 1

        url = self._TEMPLATE_URL % (channel_id, pagenum)
        page = self._download_webpage(url, channel_id,
                                      u'Downloading page #%s' % pagenum)

        # Extract video identifiers
        video_ids.extend(self.extract_videos_from_page(page))

        # Download any subsequent channel pages using the json-based channel_ajax query
        more_pages = self._MORE_PAGES_INDICATOR in page
        while more_pages:
            pagenum = pagenum + 1

            url = self._MORE_PAGES_URL % (pagenum, channel_id)
            page = self._download_webpage(url, channel_id,
                                          u'Downloading page #%s' % pagenum)

            page = json.loads(page)

            video_ids.extend(self.extract_videos_from_page(page['content_html']))

            # The widget markup tells whether yet another page exists
            more_pages = self._MORE_PAGES_INDICATOR in page['load_more_widget_html']

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        url_entries = [self.url_result('http://www.youtube.com/watch?v=%s' % video_id, 'Youtube')
                       for video_id in video_ids]
        return [self.playlist_result(url_entries, channel_id)]
class YoutubeUserIE(InfoExtractor):
    """Information Extractor for YouTube users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
    _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
    IE_NAME = u'youtube:user'

    def _real_extract(self, url):
        """Collect the uploads of a user and return them as a playlist.

        Returns a one-element list with a playlist titled after the user;
        raises ExtractorError when the URL does not match.
        """
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        username = mobj.group(1)

        # The Data API returns a bounded number of videos per query
        # (_GDATA_PAGE_SIZE), so pages are requested one after another
        # until one of them is not full.
        page_size = self._GDATA_PAGE_SIZE
        video_ids = []

        for pagenum in itertools.count(0):
            start_index = pagenum * page_size + 1

            gdata_url = self._GDATA_URL % (username, page_size, start_index)
            page = self._download_webpage(gdata_url, username,
                                          u'Downloading video ids from %d to %d' % (start_index, start_index + page_size))

            # Extract video identifiers, dropping repeats within the page
            ids_in_page = []
            for match in re.finditer(self._VIDEO_INDICATOR, page):
                candidate = match.group(1)
                if candidate not in ids_in_page:
                    ids_in_page.append(candidate)

            video_ids.extend(ids_in_page)

            # A page that is not full is taken to be the last one, which
            # saves one request.
            if len(ids_in_page) < page_size:
                break

        url_results = [self.url_result('http://www.youtube.com/watch?v=%s' % video_id, 'Youtube')
                       for video_id in video_ids]
        return [self.playlist_result(url_results, playlist_title = username)]
class BlipTVUserIE(InfoExtractor):
    """Information Extractor for blip.tv users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
    _PAGE_SIZE = 12
    IE_NAME = u'blip.tv:user'

    def _real_extract(self, url):
        """Collect the episodes of a blip.tv user and return a playlist.

        Returns a one-element list with a playlist titled after the user.
        Raises ExtractorError when the URL does not match or the user page
        carries no ``data-users-id`` attribute.
        """
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        username = mobj.group(1)

        page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'

        # The numeric user id needed by the Ajax endpoint is on the user page
        page = self._download_webpage(url, username, u'Downloading user page')
        mobj = re.search(r'data-users-id="([^"]+)"', page)
        if mobj is None:
            raise ExtractorError(u'Unable to extract user id')
        page_base = page_base % mobj.group(1)

        # Download video ids using BlipTV Ajax calls. Result size per
        # query is limited (_PAGE_SIZE videos) so we need to query
        # page by page until a page is not full.

        video_ids = []
        pagenum = 1

        while True:
            url = page_base + "&page=" + str(pagenum)
            page = self._download_webpage(url, username,
                                          u'Downloading video ids from page %d' % pagenum)

            # Extract video identifiers
            ids_in_page = []

            for mobj in re.finditer(r'href="/([^"]+)"', page):
                # Compare the unescaped form, which is what gets stored;
                # otherwise ids containing HTML entities are never
                # recognised as repeats.
                video_id = unescapeHTML(mobj.group(1))
                if video_id not in ids_in_page:
                    ids_in_page.append(video_id)

            video_ids.extend(ids_in_page)

            # A page that is not full is taken to be the last one, which
            # saves one request.
            if len(ids_in_page) < self._PAGE_SIZE:
                break

            pagenum += 1

        urls = [u'http://blip.tv/%s' % video_id for video_id in video_ids]
        url_entries = [self.url_result(url, 'BlipTV') for url in urls]
        return [self.playlist_result(url_entries, playlist_title = username)]
class DepositFilesIE(InfoExtractor):
    """Information extractor for depositfiles.com"""

    _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'

    def _real_extract(self, url):
        """Resolve the download URL and title of a depositfiles.com file.

        Returns a one-element list holding the information dictionary.
        Raises ExtractorError when the page cannot be retrieved, when the
        site reports a restriction, or when URL or title are missing.
        """
        file_id = url.split('/')[-1]
        # Rebuild url in english locale
        url = 'http://depositfiles.com/en/files/' + file_id

        # Retrieve file webpage with 'Free download' button pressed
        post_data = compat_urllib_parse.urlencode({'gateway_result': '1'})
        request = compat_urllib_request.Request(url, post_data)
        try:
            self.report_download_webpage(file_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to retrieve file webpage: %s' % compat_str(err))

        # Search for the real file URL
        mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
        if (mobj is None) or (mobj.group(1) is None):
            # Try to figure out reason of the error.
            mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
            if (mobj is None) or (mobj.group(1) is None):
                raise ExtractorError(u'Unable to extract download URL from: %s' % url)
            # Collapse the whitespace of the site's own message
            restriction_message = re.sub(r'\s+', ' ', mobj.group(1)).strip()
            raise ExtractorError(u'%s' % restriction_message)

        file_url = mobj.group(1)
        # Extension without the leading dot
        file_extension = os.path.splitext(file_url)[1][1:]

        # Search for file title
        mobj = re.search(r'<b title="(.*?)">', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract title')
        file_title = mobj.group(1).decode('utf-8')

        info = {
            'id': file_id.decode('utf-8'),
            'url': file_url.decode('utf-8'),
            'uploader': None,
            'upload_date': None,
            'title': file_title,
            'ext': file_extension.decode('utf-8'),
        }
        return [info]
class FacebookIE(InfoExtractor):
    """Information Extractor for Facebook"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
    _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
    _NETRC_MACHINE = 'facebook'
    IE_NAME = u'facebook'

    def report_login(self):
        """Report attempt to log in."""
        self.to_screen(u'Logging in')

    def _real_initialize(self):
        """Log in when credentials are available.

        Credentials come from the downloader parameters (``username`` /
        ``password``) or, with ``usenetrc``, from the .netrc entry for
        ``_NETRC_MACHINE``.  Every failure is reported as a warning and the
        method returns without raising.
        """
        if self._downloader is None:
            return

        useremail = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            useremail = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    useremail = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
                return

        if useremail is None:
            return

        # Log in
        login_form = {
            'email': useremail,
            'pass': password,
            'login': 'Log+In'
        }
        request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read()
            # Being served the login form again means the login failed
            if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
                self._downloader.report_warning(u'unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        """Extract the information dictionary for one Facebook video.

        Returns a one-element list holding the information dictionary.
        Raises ExtractorError when the URL does not match, when the player
        parameters cannot be located, or when neither an HD nor an SD
        source URL or the title is present.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('ID')

        url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
        webpage = self._download_webpage(url, video_id)

        # The player variables sit as a JSON array between these two
        # script fragments.
        BEFORE = '{swf.addParam(param[0], param[1]);});\n'
        AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
        m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
        if not m:
            raise ExtractorError(u'Cannot parse data')
        data = dict(json.loads(m.group(1)))
        params_raw = compat_urllib_parse.unquote(data['params'])
        params = json.loads(params_raw)
        video_data = params['video_data'][0]

        # Prefer HD and fall back to SD; .get() keeps a missing key on the
        # explicit error path below instead of raising KeyError.
        video_url = video_data.get('hd_src') or video_data.get('sd_src')
        if not video_url:
            raise ExtractorError(u'Cannot find video URL')
        video_duration = int(video_data['video_duration'])
        thumbnail = video_data['thumbnail_src']

        m = re.search('<h2 class="uiHeaderTitle">([^<]+)</h2>', webpage)
        if not m:
            raise ExtractorError(u'Cannot find title in webpage')
        video_title = unescapeHTML(m.group(1))

        info = {
            'id': video_id,
            'title': video_title,
            'url': video_url,
            'ext': 'mp4',
            'duration': video_duration,
            'thumbnail': thumbnail,
        }
        return [info]
class BlipTVIE(InfoExtractor):
    """Information extractor for blip.tv"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
    _URL_EXT = r'^.*\.([a-z0-9]+)$'
    IE_NAME = u'blip.tv'

    def report_direct_download(self, title):
        """Report that the URL points straight at a media file."""
        self.to_screen(u'%s: Direct download detected' % title)

    def _real_extract(self, url):
        """Extract the information dictionary for one blip.tv video.

        ``/play/`` URLs are resolved through their redirect first.  When
        the server answers with a ``video/*`` content type the open handle
        is passed on under ``urlhandle``; otherwise the JSON description is
        parsed.  Returns a one-element list; raises ExtractorError on
        download or parsing problems.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        urlp = compat_urllib_parse_urlparse(url)
        if urlp.path.startswith('/play/'):
            # The redirect target names the file in its fragment
            response = compat_urllib_request.urlopen(compat_urllib_request.Request(url))
            rurlp = compat_urllib_parse_urlparse(response.geturl())
            file_id = compat_parse_qs(rurlp.fragment)['file'][0].rpartition('/')[2]
            return self._real_extract('http://blip.tv/a/a-' + file_id)

        # Ask for the JSON skin, extending an existing query string if any
        cchar = '&' if '?' in url else '?'
        json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
        request = compat_urllib_request.Request(json_url)
        request.add_header('User-Agent', 'iTunes/10.6.1')
        self.report_extraction(mobj.group(1))

        info = None
        try:
            urlh = compat_urllib_request.urlopen(request)
            content_type = urlh.headers.get('Content-Type', '')
            if content_type.startswith('video/'): # Direct download
                basename = url.split('/')[-1]
                title, ext = os.path.splitext(basename)
                title = title.decode('UTF-8')
                ext = ext.replace('.', '')
                self.report_direct_download(title)
                info = {
                    'id': title,
                    'url': url,
                    'uploader': None,
                    'upload_date': None,
                    'title': title,
                    'ext': ext,
                    'urlhandle': urlh
                }
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err))

        if info is not None:
            return [info]

        # Regular URL: the response body is the JSON description
        try:
            json_code_bytes = urlh.read()
            json_code = json_code_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to read video info webpage: %s' % compat_str(err))

        try:
            json_data = json.loads(json_code)
            # The payload may be wrapped in a 'Post' object
            data = json_data['Post'] if 'Post' in json_data else json_data

            upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
            video_url = data['media']['url']
            umobj = re.match(self._URL_EXT, video_url)
            if umobj is None:
                raise ValueError('Can not determine filename extension')
            ext = umobj.group(1)

            info = {
                'id': data['item_id'],
                'url': video_url,
                'uploader': data['display_name'],
                'upload_date': upload_date,
                'title': data['title'],
                'ext': ext,
                'format': data['media']['mimeType'],
                'thumbnail': data['thumbnailUrl'],
                'description': data['description'],
                'player_url': data['embedUrl'],
                'user_agent': 'iTunes/10.6.1',
            }
        except (ValueError,KeyError) as err:
            raise ExtractorError(u'Unable to parse video information: %s' % repr(err))

        return [info]
class MyVideoIE(InfoExtractor):
    """Information Extractor for myvideo.de."""

    _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
    IE_NAME = u'myvideo'

    def _real_extract(self, url):
        """Return a one-element list describing the myvideo.de video.

        The media URL is derived from the thumbnail location advertised
        in the page's ``image_src`` link; the title comes from ``<title>``.
        Raises ExtractorError when the URL is invalid or either piece of
        information cannot be found.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group(1)

        # Get video webpage
        webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
        webpage = self._download_webpage(webpage_url, video_id)

        self.report_extraction(video_id)
        # The dot after the host number is escaped so that it only matches
        # a literal '.', not any character.
        mobj = re.search(
            r'<link rel=\'image_src\' href=\'(http://is[0-9]\.myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/.*?\.jpg\'',
            webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract media URL')
        video_url = mobj.group(1) + ('/%s.flv' % video_id)

        mobj = re.search('<title>([^<]+)</title>', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract title')

        video_title = mobj.group(1)

        # NOTE(review): the fields other than 'upload_date' and 'title'
        # were restored by inference from the values computed above.
        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': u'flv',
        }]
class ComedyCentralIE(InfoExtractor):
    """Information extractor for The Daily Show and Colbert Report """

    # urls can be abbreviations like :thedailyshow or :colbert
    # urls for episodes like:
    # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
    #                     or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
    #                     or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
    # NOTE(review): the '(?P<clip>' opener and the closing '$' were restored
    # by inference; the 'clip' group is read in _real_extract().
    _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
                      |(https?://)?(www\.)?
                          (?P<showname>thedailyshow|colbertnation)\.com/
                         (full-episodes/(?P<episode>.*)|
                          (?P<clip>
                              (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
                              |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
                     $"""

    _available_formats = ['3500', '2200', '1700', '1200', '750', '400']

    # NOTE(review): the contents of the two tables below are not visible in
    # the reviewed listing; they were filled in per entry of
    # _available_formats and must be confirmed.
    _video_extensions = {
        '3500': 'mp4',
        '2200': 'mp4',
        '1700': 'mp4',
        '1200': 'mp4',
        '750': 'mp4',
        '400': 'mp4',
    }
    _video_dimensions = {
        '3500': '1280x720',
        '2200': '960x540',
        '1700': '768x432',
        '1200': '640x360',
        '750': '512x288',
        '400': '384x216',
    }

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is written in verbose syntax, hence re.VERBOSE.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _print_formats(self, formats):
        """Print one line per bitrate: bitrate, extension and dimensions."""
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' % (x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))

    def _real_extract(self, url):
        """Return one information dictionary per part of the episode/clip.

        Short names (``:tds``, ``:colbert`` ...) are expanded to the show's
        full-episodes page, whose redirect target names the newest episode.
        Returns None after printing the formats when 'listformats' is set.
        Raises ExtractorError on invalid URLs or when required data cannot
        be located.
        """
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        if mobj.group('shortname'):
            if mobj.group('shortname') in ('tds', 'thedailyshow'):
                url = u'http://www.thedailyshow.com/full-episodes/'
            else:
                url = u'http://www.colbertnation.com/full-episodes/'
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            # An explicit error instead of `assert`, which -O strips.
            if mobj is None:
                raise ExtractorError(u'Invalid URL: %s' % url)

        if mobj.group('clip'):
            if mobj.group('showname') == 'thedailyshow':
                epTitle = mobj.group('tdstitle')
            else:
                epTitle = mobj.group('cntitle')
            dlNewest = False
        else:
            dlNewest = not mobj.group('episode')
            if dlNewest:
                epTitle = mobj.group('showname')
            else:
                epTitle = mobj.group('episode')

        self.report_extraction(epTitle)
        webpage, htmlHandle = self._download_webpage_handle(url, epTitle)
        if dlNewest:
            # The generic page redirects to the newest episode.
            url = htmlHandle.geturl()
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            if mobj is None:
                raise ExtractorError(u'Invalid redirected URL: ' + url)
            # group('episode') is None (not '') when the redirect target is
            # not a full-episodes URL, so test for falsiness.
            if not mobj.group('episode'):
                raise ExtractorError(u'Redirected URL is still not specific: ' + url)
            epTitle = mobj.group('episode')

        mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)

        if len(mMovieParams) == 0:
            # The Colbert Report embeds the information in a data-mgid
            # attribute without a URL prefix; so extract the alternate
            # reference and then add the URL prefix manually.
            altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
            if len(altMovieParams) == 0:
                raise ExtractorError(u'unable to find Flash URL in webpage ' + url)
            mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]

        uri = mMovieParams[0][1]
        indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
        indexXml = self._download_webpage(indexUrl, epTitle,
                                          u'Downloading show index',
                                          u'unable to download episode index')

        results = []

        idoc = xml.etree.ElementTree.fromstring(indexXml)
        itemEls = idoc.findall('.//item')
        for partNum, itemEl in enumerate(itemEls):
            mediaId = itemEl.findall('./guid')[0].text
            shortMediaId = mediaId.split(':')[-1]
            showId = mediaId.split(':')[-2].replace('.com', '')
            officialTitle = itemEl.findall('./title')[0].text
            officialDate = unified_strdate(itemEl.findall('./pubDate')[0].text)

            configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
                         compat_urllib_parse.urlencode({'uri': mediaId}))
            configXml = self._download_webpage(configUrl, epTitle,
                                               u'Downloading configuration for %s' % shortMediaId)

            cdoc = xml.etree.ElementTree.fromstring(configXml)
            turls = [
                (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
                for rendition in cdoc.findall('.//rendition')
            ]

            if len(turls) == 0:
                self._downloader.report_error(u'unable to download ' + mediaId + ': No videos found')
                continue

            if self._downloader.params.get('listformats', None):
                self._print_formats([i[0] for i in turls])
                return

            # For now, just pick the highest bitrate
            fmt, rtmp_video_url = turls[-1]

            # Get the format arg from the arg stream
            req_format = self._downloader.params.get('format', None)

            # Select format if we can find one
            for f, v in turls:
                if f == req_format:
                    fmt, rtmp_video_url = f, v
                    break

            m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
            if not m:
                raise ExtractorError(u'Cannot transform RTMP url')
            base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
            video_url = base + m.group('finalid')

            effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum + 1)
            # NOTE(review): fields other than 'upload_date' and
            # 'description' were restored by inference.
            info = {
                'id': shortMediaId,
                'url': video_url,
                'uploader': showId,
                'upload_date': officialDate,
                'title': effTitle,
                'ext': 'mp4',
                'format': fmt,
                'thumbnail': None,
                'description': officialTitle,
            }
            results.append(info)

        return results
class EscapistIE(InfoExtractor):
    """Information extractor for The Escapist """

    _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
    IE_NAME = u'escapist'

    def _real_extract(self, url):
        """Return a one-element list describing the Escapist video.

        Description, thumbnail and player URL are read from the page's
        meta tags; the media URL comes from the player configuration that
        the player URL's ``config=`` parameter points to.
        Raises ExtractorError when any of these cannot be found or the
        configuration cannot be parsed.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        showName = mobj.group('showname')
        videoId = mobj.group('episode')

        self.report_extraction(showName)
        webPage = self._download_webpage(url, showName)

        def _meta_content(pattern, what):
            # A missing tag becomes a clear extractor error instead of an
            # AttributeError on None.
            match = re.search(pattern, webPage)
            if match is None:
                raise ExtractorError(u'Unable to extract %s' % what)
            return unescapeHTML(match.group(1))

        description = _meta_content('<meta name="description" content="([^"]*)"', u'description')
        imgUrl = _meta_content('<meta property="og:image" content="([^"]*)"', u'thumbnail')
        playerUrl = _meta_content('<meta property="og:video" content="([^"]*)"', u'player URL')

        configUrlMatch = re.search('config=(.*)$', playerUrl)
        if configUrlMatch is None:
            raise ExtractorError(u'Unable to extract configuration URL')
        configUrl = compat_urllib_parse.unquote(configUrlMatch.group(1))

        configJSON = self._download_webpage(configUrl, showName,
                                            u'Downloading configuration',
                                            u'unable to download configuration')

        # Technically, it's JavaScript, not JSON
        configJSON = configJSON.replace("'", '"')

        try:
            config = json.loads(configJSON)
        except (ValueError,) as err:
            raise ExtractorError(u'Invalid JSON in configuration file: ' + compat_str(err))

        try:
            playlist = config['playlist']
            # The media URL is read from the second playlist entry.
            videoUrl = playlist[1]['url']
        except (KeyError, IndexError, TypeError) as err:
            raise ExtractorError(u'Unable to find video URL in configuration: ' + repr(err))

        # NOTE(review): 'id', 'url', 'title' and 'ext' were restored by
        # inference; confirm the title and extension values.
        info = {
            'id': videoId,
            'url': videoUrl,
            'uploader': showName,
            'upload_date': None,
            'title': showName,
            'ext': 'mp4',
            'thumbnail': imgUrl,
            'description': description,
            'player_url': playerUrl,
        }

        return [info]
class CollegeHumorIE(InfoExtractor):
    """Information extractor for collegehumor.com"""

    # NOTE(review): a class attribute between the docstring and _VALID_URL
    # is missing from the reviewed listing; `_WORKING = False` is assumed
    # here -- confirm against the full file.
    _WORKING = False
    _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
    IE_NAME = u'collegehumor'

    def report_manifest(self, video_id):
        """Report that the XML manifest is being downloaded."""
        self.to_screen(u'%s: Downloading XML manifest' % video_id)

    def _real_extract(self, url):
        """Return a one-element list describing the CollegeHumor video.

        Metadata is read from the moogaloop XML document; the media URL is
        assembled from the f4m manifest referenced by that document.
        Raises ExtractorError on an invalid URL, a failed download or
        malformed XML.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('videoid')

        info = {
            'id': video_id,
            'uploader': None,
            'upload_date': None,
        }

        self.report_extraction(video_id)
        xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
        try:
            metaXml = compat_urllib_request.urlopen(xmlUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))

        mdoc = xml.etree.ElementTree.fromstring(metaXml)
        try:
            videoNode = mdoc.findall('./video')[0]
            info['description'] = videoNode.findall('./description')[0].text
            info['title'] = videoNode.findall('./caption')[0].text
            info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
            manifest_url = videoNode.findall('./file')[0].text
        except IndexError:
            raise ExtractorError(u'Invalid metadata XML file')

        manifest_url += '?hdcore=2.10.3'
        self.report_manifest(video_id)
        try:
            manifestXml = compat_urllib_request.urlopen(manifest_url).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))

        adoc = xml.etree.ElementTree.fromstring(manifestXml)
        try:
            media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
            node_id = media_node.attrib['url']
            video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
        except (IndexError, KeyError):
            # KeyError: the <media> element lacks its 'url' attribute.
            raise ExtractorError(u'Invalid manifest file')

        url_pr = compat_urllib_parse_urlparse(manifest_url)
        url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'

        # NOTE(review): the tail of this method is missing from the
        # reviewed listing; storing the URL with an 'f4f' extension is an
        # inference to be confirmed.
        info['url'] = url
        info['ext'] = 'f4f'
        return [info]
class XVideosIE(InfoExtractor):
    """Information extractor for xvideos.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
    IE_NAME = u'xvideos'

    def _real_extract(self, url):
        """Return a one-element list describing the xvideos.com video.

        Raises ExtractorError when the URL is invalid or when the media
        URL, title or thumbnail cannot be found in the page.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group(1)

        webpage = self._download_webpage(url, video_id)

        self.report_extraction(video_id)

        # Extract video URL
        mobj = re.search(r'flv_url=(.+?)&', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract video url')
        video_url = compat_urllib_parse.unquote(mobj.group(1))

        # Extract title
        mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract video title')
        video_title = mobj.group(1)

        # Extract video thumbnail (the host name dots are escaped so they
        # only match a literal '.')
        mobj = re.search(r'http://(?:img.*?\.)xvideos\.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract video thumbnail')
        # group(0): the whole matched thumbnail URL.
        video_thumbnail = mobj.group(0)

        # NOTE(review): 'id', 'url', 'uploader' and 'ext' were restored by
        # inference; confirm the 'flv' extension.
        info = {
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }

        return [info]
class SoundcloudIE(InfoExtractor):
    """Information extractor for soundcloud.com
       To access the media, the uid of the song and a stream token
       must be extracted from the page source and the script must make
       a request to media.soundcloud.com/crossdomain.xml. Then
       the media can be grabbed by requesting from an url composed
       of the stream token and uid
     """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'soundcloud'

    # API client id sent with both the resolve and the streams request.
    _CLIENT_ID = 'b45b1aa10f1ac2941910a7f0d10f8e28'

    def report_resolve(self, video_id):
        """Report that the track id is being resolved."""
        self.to_screen(u'%s: Resolving id' % video_id)

    def _real_extract(self, url):
        """Return a one-element list describing the SoundCloud track.

        The track is resolved through the resolve.json API, then its
        stream definitions are fetched and the 128 kbit/s MP3 URL is used.
        Raises ExtractorError on an invalid URL.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # extract uploader (which is in the url)
        uploader = mobj.group(1)
        # slug of the song title (also in the url)
        slug_title = mobj.group(2)
        full_title = '%s/%s' % (uploader, slug_title)

        self.report_resolve(full_title)

        url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=' + self._CLIENT_ID
        info_json = self._download_webpage(resolv_url, full_title, u'Downloading info JSON')

        info = json.loads(info_json)
        video_id = info['id']
        self.report_extraction(full_title)

        streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=' + self._CLIENT_ID
        stream_json = self._download_webpage(streams_url, full_title,
                                             u'Downloading stream definitions',
                                             u'unable to download stream definitions')

        streams = json.loads(stream_json)
        mediaURL = streams['http_mp3_128_url']
        upload_date = unified_strdate(info['created_at'])

        # NOTE(review): 'id', 'url' and 'ext' were restored by inference.
        return [{
            'id': info['id'],
            'url': mediaURL,
            'uploader': info['user']['username'],
            'upload_date': upload_date,
            'title': info['title'],
            'ext': u'mp3',
            'description': info['description'],
        }]
class SoundcloudSetIE(InfoExtractor):
    """Information extractor for soundcloud.com sets
       To access the media, the uid of the song and a stream token
       must be extracted from the page source and the script must make
       a request to media.soundcloud.com/crossdomain.xml. Then
       the media can be grabbed by requesting from an url composed
       of the stream token and uid
     """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/sets/([\w\d-]+)'
    IE_NAME = u'soundcloud:set'

    # API client id sent with both the resolve and the streams request.
    _CLIENT_ID = 'b45b1aa10f1ac2941910a7f0d10f8e28'

    def report_resolve(self, video_id):
        """Report that the set id is being resolved."""
        self.to_screen(u'%s: Resolving id' % video_id)

    def _real_extract(self, url):
        """Return one information dictionary per track of the set.

        Returns None after reporting the errors when the resolve API
        answers with an 'errors' entry.
        Raises ExtractorError on an invalid URL.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # extract uploader (which is in the url)
        uploader = mobj.group(1)
        # slug of the set title (also in the url)
        slug_title = mobj.group(2)
        full_title = '%s/sets/%s' % (uploader, slug_title)

        self.report_resolve(full_title)

        url = 'http://soundcloud.com/%s/sets/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=' + self._CLIENT_ID
        info_json = self._download_webpage(resolv_url, full_title)

        videos = []
        info = json.loads(info_json)
        if 'errors' in info:
            for err in info['errors']:
                self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err['error_message']))
            return

        self.report_extraction(full_title)
        for track in info['tracks']:
            video_id = track['id']

            streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=' + self._CLIENT_ID
            stream_json = self._download_webpage(streams_url, video_id, u'Downloading track info JSON')

            self.report_extraction(video_id)
            streams = json.loads(stream_json)
            mediaURL = streams['http_mp3_128_url']

            # NOTE(review): 'id', 'url' and 'ext' were restored by
            # inference.
            videos.append({
                'id': video_id,
                'url': mediaURL,
                'uploader': track['user']['username'],
                'upload_date': unified_strdate(track['created_at']),
                'title': track['title'],
                'ext': u'mp3',
                'description': track['description'],
            })
        return videos
class InfoQIE(InfoExtractor):
    """Information extractor for infoq.com"""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'

    def _real_extract(self, url):
        """Return a one-element list describing the InfoQ presentation.

        The RTMP path is stored base64-encoded in the page's
        ``jsclassref`` variable.
        Raises ExtractorError when the URL is invalid or when the media
        path, the title or the filename extension cannot be determined.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        webpage = self._download_webpage(url, video_id=url)
        self.report_extraction(url)

        # Extract video URL
        mobj = re.search(r"jsclassref ?= ?'([^']*)'", webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract video url')
        real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
        video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id

        # Extract title
        mobj = re.search(r'contentTitle = "(.*?)";', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract video title')
        video_title = mobj.group(1)

        # Extract description
        video_description = u'No description available.'
        mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
        if mobj is not None:
            video_description = mobj.group(1)

        video_filename = video_url.split('/')[-1]
        # Split on the LAST dot only, so names containing several dots do
        # not blow up the two-name unpacking.
        video_id, dot, extension = video_filename.rpartition('.')
        if not dot:
            raise ExtractorError(u'Unable to determine video file extension')

        # NOTE(review): 'id', 'url', 'uploader' and 'thumbnail' were
        # restored by inference.
        info = {
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': extension,  # Extension is always(?) mp4, but seems to be flv
            'thumbnail': None,
            'description': video_description,
        }

        return [info]
class MixcloudIE(InfoExtractor):
    """Information extractor for www.mixcloud.com"""

    _WORKING = False  # New API, but it seems good http://www.mixcloud.com/developers/documentation/
    _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'mixcloud'

    @staticmethod
    def _to_text(value):
        """Decode byte strings as UTF-8; return text values unchanged."""
        if isinstance(value, bytes):
            return value.decode('utf-8')
        return value

    def report_download_json(self, file_id):
        """Report JSON download."""
        self.to_screen(u'Downloading json')

    def get_urls(self, jsonData, fmt, bitrate='best'):
        """Get urls from 'audio_formats' section in json"""
        try:
            bitrate_list = jsonData[fmt]
            if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
                bitrate = max(bitrate_list)  # select highest

            url_list = jsonData[fmt][bitrate]
        except TypeError:  # we have no bitrate info.
            url_list = jsonData[fmt]
        return url_list

    def check_urls(self, url_list):
        """Returns 1st active url from list"""
        for url in url_list:
            try:
                compat_urllib_request.urlopen(url)
                return url
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error):
                # Dead link: try the next candidate.
                continue

        return None

    def _print_formats(self, formats):
        """Print every format with its bitrates and file extension."""
        print('Available formats:')
        for fmt in formats.keys():
            for b in formats[fmt]:
                try:
                    ext = formats[fmt][b][0]
                    print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
                except TypeError:  # we have no bitrate info
                    ext = formats[fmt][0]
                    print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
                    break

    def _real_extract(self, url):
        """Return a one-element list describing the Mixcloud cloudcast.

        Returns None after printing the formats when 'listformats' is set.
        Raises ExtractorError on an invalid URL, a failed API download, an
        unavailable requested format or when no candidate URL responds.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        # extract uploader & filename from url
        uploader = self._to_text(mobj.group(1))
        cloudcast = self._to_text(mobj.group(2))
        file_id = uploader + "-" + cloudcast

        # construct API request from the matched groups, so the result does
        # not depend on whether the URL ends with a slash
        file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + uploader + '/' + cloudcast + '.json'
        # retrieve .json file with links to files
        request = compat_urllib_request.Request(file_url)
        try:
            self.report_download_json(file_url)
            jsonData = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to retrieve file: %s' % compat_str(err))

        # parse JSON (read() returns bytes, so decode first)
        json_data = json.loads(self._to_text(jsonData))
        player_url = json_data['player_swf_url']
        formats = dict(json_data['audio_formats'])

        req_format = self._downloader.params.get('format', None)

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        file_url = None
        format_param = None
        if req_format is None or req_format == 'best':
            for format_param in formats.keys():
                url_list = self.get_urls(formats, format_param)
                # check urls
                file_url = self.check_urls(url_list)
                if file_url is not None:
                    break  # got it!
        else:
            if req_format not in formats:
                raise ExtractorError(u'Format is not available')

            url_list = self.get_urls(formats, req_format)
            file_url = self.check_urls(url_list)
            format_param = req_format

        if file_url is None:
            raise ExtractorError(u'Unable to find a working media URL')
        file_url = self._to_text(file_url)

        return [{
            'id': file_id,
            'url': file_url,
            'uploader': uploader,
            'upload_date': None,
            'title': json_data['name'],
            'ext': file_url.split('.')[-1],
            'format': u'NA' if format_param is None else self._to_text(format_param),
            'thumbnail': json_data['thumbnail_url'],
            'description': json_data['description'],
            'player_url': self._to_text(player_url),
        }]
class StanfordOpenClassroomIE(InfoExtractor):
    """Information extractor for Stanford's Open ClassRoom"""

    # Host name dots are escaped so they only match a literal '.'.
    _VALID_URL = r'^(?:https?://)?openclassroom\.stanford\.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
    IE_NAME = u'stanfordoc'

    _MAIN_FOLDER = 'http://openclassroom.stanford.edu/MainFolder/'

    def _extract_references(self, pages):
        """Run self.extract() on every linked page and chain the results."""
        results = []
        for page in pages:
            results += self.extract(self._MAIN_FOLDER + unescapeHTML(page))
        return results

    def _real_extract(self, url):
        """Return the information dictionaries for a video, course or root URL.

        A video URL yields one dictionary; a course page yields the results
        of all linked video pages; any other matching URL yields the
        results of all course pages linked from the home page.
        Raises ExtractorError on an invalid URL, a failed download or
        malformed metadata.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        if mobj.group('course') and mobj.group('video'):  # A specific video
            course = mobj.group('course')
            video = mobj.group('video')
            info = {
                'id': course + '_' + video,
                'uploader': None,
                'upload_date': None,
            }

            self.report_extraction(info['id'])
            baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
            xmlUrl = baseUrl + video + '.xml'
            try:
                metaXml = compat_urllib_request.urlopen(xmlUrl).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
            mdoc = xml.etree.ElementTree.fromstring(metaXml)
            try:
                info['title'] = mdoc.findall('./title')[0].text
                info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
            except IndexError:
                raise ExtractorError(u'Invalid metadata XML file')
            info['ext'] = info['url'].rpartition('.')[2]
            return [info]

        elif mobj.group('course'):  # A course page
            course = mobj.group('course')
            coursepage = self._download_webpage(url, course,
                                                note='Downloading course info page',
                                                errnote='Unable to download course info page')

            links = orderedSet(re.findall(r'<a href="(VideoPage.php\?[^"]+)">', coursepage))
            return self._extract_references(links)

        else:  # Root page
            # Fetched through _download_webpage (like the course page) so
            # the page is text and can be searched with a str pattern.
            rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
            rootpage = self._download_webpage(rootURL, 'Stanford OpenClassroom',
                                              errnote='Unable to download course info page')

            links = orderedSet(re.findall(r'<a href="(CoursePage.php\?[^"]+)">', rootpage))
            return self._extract_references(links)
class MTVIE(InfoExtractor):
    """Information extractor for MTV.com"""

    _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
    # NOTE(review): one class attribute after _VALID_URL is missing from
    # the reviewed listing; IE_NAME is assumed -- confirm.
    IE_NAME = u'mtv'

    def _real_extract(self, url):
        """Return a one-element list describing the MTV video.

        Song name, performer, mtvn_uri and content id are read from the
        page; the media URL comes from the last rendition of the mediaGen
        XML document.
        Raises ExtractorError when the URL is invalid, a download fails or
        any required value is missing.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        if not mobj.group('proto'):
            url = 'http://' + url
        video_id = mobj.group('videoid')

        webpage = self._download_webpage(url, video_id)

        def _required(pattern, error_message):
            match = re.search(pattern, webpage)
            if match is None:
                raise ExtractorError(error_message)
            return match.group(1)

        # The matches are used as-is: re-decoding them as iso-8859-1 fails
        # on text strings (no .decode() on Python 3, implicit ASCII
        # round-trip on Python 2).
        song_name = unescapeHTML(_required(r'<meta name="mtv_vt" content="([^"]+)"/>', u'Unable to extract song name'))
        performer = unescapeHTML(_required(r'<meta name="mtv_an" content="([^"]+)"/>', u'Unable to extract performer'))
        video_title = performer + ' - ' + song_name

        mtvn_uri = _required(r'<meta name="mtvn_uri" content="([^"]+)"/>', u'Unable to mtvn_uri')
        content_id = _required(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', u'Unable to extract content id')

        videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
        self.report_extraction(video_id)
        request = compat_urllib_request.Request(videogen_url)
        try:
            metadataXml = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to download video metadata: %s' % compat_str(err))

        mdoc = xml.etree.ElementTree.fromstring(metadataXml)
        renditions = mdoc.findall('.//rendition')
        if not renditions:
            raise ExtractorError(u'Unable to find any rendition')

        # For now, always pick the highest quality.
        rendition = renditions[-1]

        try:
            _, _, ext = rendition.attrib['type'].partition('/')
            fmt = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
            video_url = rendition.find('./src').text
        except (KeyError, AttributeError):
            # AttributeError: the rendition has no <src> child.
            raise ExtractorError('Invalid rendition field.')

        # NOTE(review): 'id', 'url', 'ext' and 'format' were restored by
        # inference.
        info = {
            'id': video_id,
            'url': video_url,
            'uploader': performer,
            'upload_date': None,
            'title': video_title,
            'ext': ext,
            'format': fmt,
        }

        return [info]
class YoukuIE(InfoExtractor):
    """Information extractor for v.youku.com"""

    _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'

    def _gen_sid(self):
        """Return a session id: millisecond timestamp plus two random numbers."""
        nowTime = int(time.time() * 1000)
        random1 = random.randint(1000, 1998)
        random2 = random.randint(1000, 9999)

        return "%d%d%d" % (nowTime, random1, random2)

    def _get_file_ID_mix_string(self, seed):
        """Return the alphabet shuffled deterministically by *seed* (a list)."""
        mixed = []
        # Raw string: the alphabet contains a literal backslash.
        source = list(r"abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
        # Float arithmetic keeps the division below a true division on
        # Python 2 as well.
        seed = float(seed)
        for _ in range(len(source)):
            seed = (seed * 211 + 30031) % 65536
            index = int(math.floor(seed / 65536 * len(source)))
            # pop() removes exactly the chosen position.
            mixed.append(source.pop(index))
        return mixed

    def _get_file_id(self, fileId, seed):
        """Translate the '*'-separated indices in *fileId* via the mixed alphabet."""
        mixed = self._get_file_ID_mix_string(seed)
        ids = fileId.split('*')
        # NOTE(review): skipping empty fragments (e.g. from a trailing '*')
        # is restored by inference.
        return ''.join(mixed[int(ch)] for ch in ids if ch)

    def _real_extract(self, url):
        """Return one information dictionary per segment of the video.

        Raises ExtractorError on an invalid URL or when the playlist JSON
        lacks the expected entries.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('ID')

        info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id

        jsondata = self._download_webpage(info_url, video_id)

        self.report_extraction(video_id)
        try:
            config = json.loads(jsondata)

            video_title = config['data'][0]['title']
            seed = config['data'][0]['seed']

            fmt = self._downloader.params.get('format', None)
            supported_format = list(config['data'][0]['streamfileids'].keys())

            # NOTE(review): the values chosen inside these branches are
            # missing from the reviewed listing and were restored by
            # inference -- confirm.
            if fmt is None or fmt == 'best':
                if 'hd2' in supported_format:
                    fmt = 'hd2'
                else:
                    fmt = 'flv'
                ext = u'flv'
            elif fmt == 'worst':
                fmt = 'mp4'
                ext = u'mp4'
            else:
                fmt = 'flv'
                ext = u'flv'

            fileid = config['data'][0]['streamfileids'][fmt]
            keys = [s['k'] for s in config['data'][0]['segs'][fmt]]
        except (UnicodeDecodeError, ValueError, KeyError):
            raise ExtractorError(u'Unable to extract info section')

        files_info = []
        sid = self._gen_sid()
        fileid = self._get_file_id(fileid, seed)

        # Characters 9 and 10 of fileid (fileid[8:10]) hold the segment
        # number and are replaced for every segment.
        for index, key in enumerate(keys):
            temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
            download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)

            info = {
                'id': '%s_part%02d' % (video_id, index),
                'url': download_url,
                'uploader': None,
                'upload_date': None,
                'title': video_title,
                'ext': ext,
            }
            files_info.append(info)

        return files_info
class XNXXIE(InfoExtractor):
    """Information extractor for xnxx.com"""

    _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
    # NOTE(review): one class attribute at this position is missing from
    # the reviewed listing; IE_NAME is assumed -- confirm.
    IE_NAME = u'xnxx'
    VIDEO_URL_RE = r'flv_url=(.*?)&'
    # The dot of "XNXX.COM" is escaped so it only matches a literal '.'.
    VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX\.COM'
    VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&'

    def _real_extract(self, url):
        """Return a one-element list describing the xnxx.com video.

        Raises ExtractorError when the URL is invalid or when the media
        URL, title or thumbnail cannot be found in the page.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group(1)

        # Get webpage content
        webpage = self._download_webpage(url, video_id)

        result = re.search(self.VIDEO_URL_RE, webpage)
        if result is None:
            raise ExtractorError(u'Unable to extract video url')
        video_url = compat_urllib_parse.unquote(result.group(1))

        result = re.search(self.VIDEO_TITLE_RE, webpage)
        if result is None:
            raise ExtractorError(u'Unable to extract video title')
        video_title = result.group(1)

        result = re.search(self.VIDEO_THUMB_RE, webpage)
        if result is None:
            raise ExtractorError(u'Unable to extract video thumbnail')
        video_thumbnail = result.group(1)

        # NOTE(review): 'id', 'url', 'uploader' and 'ext' were restored by
        # inference; confirm the 'flv' extension.
        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }]
class GooglePlusIE(InfoExtractor):
    """Information extractor for plus.google.com."""

    _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
    IE_NAME = u'plus.google'

    def report_extract_entry(self, url):
        """Report that the post entry is being downloaded."""
        self.to_screen(u'Downloading entry: %s' % url)

    def report_date(self, upload_date):
        """Report the extracted entry date."""
        self.to_screen(u'Entry date: %s' % upload_date)

    def report_uploader(self, uploader):
        """Report the extracted uploader."""
        self.to_screen(u'Uploader: %s' % uploader)

    def report_title(self, video_title):
        """Report the extracted title."""
        self.to_screen(u'Title: %s' % video_title)

    def report_extract_vid_page(self, video_page):
        """Report that the video page is being processed."""
        self.to_screen(u'Extracting video page: %s' % video_page)

    def _real_extract(self, url):
        """Return a one-element list describing the video of a Google+ post.

        Date, uploader and title are optional and taken from the post
        page; the media URL is the highest-resolution link found on the
        linked photo/video page.
        Raises ExtractorError when the URL is invalid or when the video
        page or its links cannot be found.
        """
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        post_url = mobj.group(0)
        video_id = mobj.group(1)

        video_extension = 'flv'

        # Step 1, Retrieve post webpage to extract further information
        self.report_extract_entry(post_url)
        webpage = self._download_webpage(post_url, video_id, u'Downloading entry webpage')

        # Extract update date
        upload_date = None
        mobj = re.search('title="Timestamp">(.*?)</a>', webpage)
        if mobj:
            # Convert timestring to a format suitable for filename
            upload_date = datetime.datetime.strptime(mobj.group(1), "%Y-%m-%d")
            upload_date = upload_date.strftime('%Y%m%d')
            self.report_date(upload_date)

        # Extract uploader
        uploader = None
        mobj = re.search(r'rel\="author".*?>(.*?)</a>', webpage)
        if mobj:
            uploader = mobj.group(1)
            self.report_uploader(uploader)

        # Extract title
        # Get the first line for title
        # NOTE(review): the default title is missing from the reviewed
        # listing; u'NA' is assumed -- confirm.
        video_title = u'NA'
        mobj = re.search(r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]', webpage)
        if mobj:
            video_title = mobj.group(1)
            self.report_title(video_title)

        # Step 2, Simulate clicking the image box to launch video
        mobj = re.search(r'"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract video page URL')

        video_page = mobj.group(1)
        webpage = self._download_webpage(video_page, video_id, u'Downloading video page')
        self.report_extract_vid_page(video_page)

        # Extract video links of all sizes on the video page
        links = re.findall(r'\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"', webpage)
        if len(links) == 0:
            raise ExtractorError(u'Unable to extract video links')

        # Sort by NUMERIC resolution: as strings '720' would rank above
        # '1080'.  The last entry is the highest resolution.
        links = sorted(links, key=lambda link: int(link[0]))

        # Only keep the url; the resolution part of the tuple has no use anymore
        video_url = links[-1][1]
        # Treat escaped \u0026 style hex
        try:
            video_url = video_url.decode("unicode_escape")
        except AttributeError:  # Python 3
            video_url = bytes(video_url, 'ascii').decode('unicode-escape')

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': uploader,
            'upload_date': upload_date,
            'title': video_title,
            'ext': video_extension,
        }]
class NBAIE(InfoExtractor):
    """Information extractor for video pages on nba.com."""
    _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*)(\?.*)?$'

    def _real_extract(self, url):
        """Build the info dictionary for one video page.

        The media URL is derived from the path part of the page URL; title,
        date and description are scraped from the downloaded page.
        Raises ExtractorError when *url* does not match ``_VALID_URL``.
        """
        url_match = re.match(self._VALID_URL, url)
        if url_match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # Drop a trailing "/index.html" so only the video path remains.
        index_suffix = '/index.html'
        video_id = url_match.group(1)
        if video_id.endswith(index_suffix):
            video_id = video_id[:-len(index_suffix)]

        webpage = self._download_webpage(url, video_id)

        def _findProp(rexp, default=None):
            # First capture group of rexp in the page (HTML-unescaped), or
            # `default` when the pattern does not occur.
            found = re.search(rexp, webpage)
            return unescapeHTML(found.group(1)) if found else default

        shortened_video_id = video_id.rpartition('/')[2]
        page_title = _findProp(r'<meta property="og:title" content="(.*?)"', shortened_video_id)
        return [{
            'id': shortened_video_id,
            'url': u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4',
            'ext': 'mp4',
            'title': page_title.replace('NBA.com: ', ''),
            'uploader_date': _findProp(r'<b>Date:</b> (.*?)</div>'),
            'description': _findProp(r'<div class="description">(.*?)</h1>'),
        }]
class JustinTVIE(InfoExtractor):
    """Information extractor for justin.tv and twitch.tv"""
    # TODO: One broadcast may be split into multiple videos. The key
    # 'broadcast_id' is the same for all parts, and 'broadcast_part'
    # starts at 1 and increases. Can we treat all parts as one video?

    _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
        (?:
            (?P<channelid>[^/]+)|
            (?:(?:[^/]+)/b/(?P<videoid>[^/]+))|
            (?:(?:[^/]+)/c/(?P<chapterid>[^/]+))
        )
        /?(?:\#.*)?$
        """
    _JUSTIN_PAGE_LIMIT = 100
    IE_NAME = u'justin.tv'

    def report_download_page(self, channel, offset):
        """Report attempt to download a single page of videos."""
        self.to_screen(u'%s: Downloading video information from %d to %d' %
                (channel, offset, offset + self._JUSTIN_PAGE_LIMIT))

    def _parse_page(self, url, video_id):
        """Download one API page and convert its clips to info dictionaries.

        Returns a tuple ``(item_count, infos)``: the number of items in the
        API response and the info dictionaries of the *valid* items (those
        that carry a video file URL).
        Raises ExtractorError when the API does not answer with a list.
        """
        webpage = self._download_webpage(url, video_id,
                                         u'Downloading video info JSON',
                                         u'unable to download video info JSON')

        response = json.loads(webpage)
        if not isinstance(response, list):
            # Error payloads are expected to be objects with an 'error' key;
            # anything else is reported as an unknown error.
            error_text = 'unknown error'
            if isinstance(response, dict):
                error_text = response.get('error', 'unknown error')
            raise ExtractorError(u'Justin.tv API: %s' % error_text)

        info = []
        for clip in response:
            video_url = clip['video_file_url']
            if not video_url:
                continue
            video_uploader_id = clip.get('user_id', clip.get('channel_id'))
            clip_id = clip['id']
            info.append({
                'id': clip_id,
                'url': video_url,
                'title': clip.get('title', clip_id),
                'uploader': clip.get('channel_name', video_uploader_id),
                'uploader_id': video_uploader_id,
                # 'start_time' begins with YYYY-MM-DD; strip the dashes.
                'upload_date': re.sub('-', '', clip['start_time'][:10]),
                'ext': os.path.splitext(video_url)[1][1:],
            })
        return (len(response), info)

    def _extract_chapter(self, url, chapter_id):
        """Return the info dictionary list for a single chapter URL."""
        webpage = self._download_webpage(url, chapter_id)
        m = re.search(r'PP\.archive_id = "([0-9]+)";', webpage)
        if not m:
            raise ExtractorError(u'Cannot find archive of a chapter')
        archive_id = m.group(1)

        api = 'http://api.justin.tv' + '/broadcast/by_chapter/%s.xml' % chapter_id
        chapter_info_xml = self._download_webpage(api, chapter_id,
                                         note=u'Downloading chapter information',
                                         errnote=u'Chapter information download failed')
        doc = xml.etree.ElementTree.fromstring(chapter_info_xml)

        # Locate the archive element that belongs to this chapter.
        archive = None
        for candidate in doc.findall('.//archive'):
            if archive_id == candidate.find('./id').text:
                archive = candidate
                break
        if archive is None:
            raise ExtractorError(u'Could not find chapter in chapter information')

        video_url = archive.find('./video_file_url').text
        video_ext = video_url.rpartition('.')[2] or u'flv'

        chapter_api_url = u'https://api.twitch.tv/kraken/videos/c' + chapter_id
        chapter_info_json = self._download_webpage(chapter_api_url, u'c' + chapter_id,
                               note='Downloading chapter metadata',
                               errnote='Download of chapter metadata failed')
        chapter_info = json.loads(chapter_info_json)

        bracket_start = int(doc.find('.//bracket_start').text)
        bracket_end = int(doc.find('.//bracket_end').text)

        # TODO determine start (and probably fix up file)
        #  youtube-dl -v http://www.twitch.tv/firmbelief/c/1757457
        #video_url += u'?start=' + TODO:start_timestamp
        # bracket_start is 13290, but we want 51670615
        self._downloader.report_warning(u'Chapter detected, but we can just download the whole file. '
                                        u'Chapter starts at %s and ends at %s' % (formatSeconds(bracket_start), formatSeconds(bracket_end)))

        return [{
            'id': u'c' + chapter_id,
            'url': video_url,
            'ext': video_ext,
            'title': chapter_info['title'],
            'thumbnail': chapter_info['preview'],
            'description': chapter_info['description'],
            'uploader': chapter_info['channel']['display_name'],
            'uploader_id': chapter_info['channel']['name'],
        }]

    def _real_extract(self, url):
        """Extract a channel archive, a single broadcast or a chapter.

        Channel archives are fetched page by page (``_JUSTIN_PAGE_LIMIT``
        items per request); a broadcast needs a single request.
        Raises ExtractorError when *url* does not match ``_VALID_URL``.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'invalid URL: %s' % url)

        api_base = 'http://api.justin.tv'
        paged = False
        if mobj.group('channelid'):
            paged = True
            video_id = mobj.group('channelid')
            api = api_base + '/channel/archives/%s.json' % video_id
        elif mobj.group('chapterid'):
            return self._extract_chapter(url, mobj.group('chapterid'))
        else:
            video_id = mobj.group('videoid')
            api = api_base + '/broadcast/by_archive/%s.json' % video_id

        self.report_extraction(video_id)

        info = []
        offset = 0
        limit = self._JUSTIN_PAGE_LIMIT
        while True:
            if paged:
                self.report_download_page(video_id, offset)
            page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
            page_count, page_info = self._parse_page(page_url, video_id)
            info.extend(page_info)
            # A short page means the archive is exhausted.
            if not paged or page_count != limit:
                break
            offset += limit
        return info
class FunnyOrDieIE(InfoExtractor):
    """Information extractor for funnyordie.com video pages."""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'

    def _real_extract(self, url):
        """Scrape the media URL, title and description of one video page.

        Raises ExtractorError for a non-matching URL, or when the video
        source or the title cannot be found in the page.
        """
        url_match = re.match(self._VALID_URL, url)
        if url_match is None:
            raise ExtractorError(u'invalid URL: %s' % url)

        video_id = url_match.group('id')
        webpage = self._download_webpage(url, video_id)

        # The second <source> element inside <video> carries the media URL.
        source_match = re.search(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"', webpage, re.DOTALL)
        if not source_match:
            raise ExtractorError(u'Unable to find video information')
        video_url = unescapeHTML(source_match.group('url'))

        # Prefer the player headline, fall back to the document <title>.
        title_match = (
            re.search(r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>", webpage, flags=re.DOTALL)
            or re.search(r'<title>(?P<title>[^<]+?)</title>', webpage))
        if not title_match:
            raise ExtractorError(u'Cannot find video title')
        title = clean_html(title_match.group('title'))

        desc_match = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
        desc = unescapeHTML(desc_match.group('desc')) if desc_match else None

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'description': desc,
        }]
class SteamIE(InfoExtractor):
    """Information extractor for the trailers of a store.steampowered.com page."""
    _VALID_URL = r"""http://store\.steampowered\.com/
                (agecheck/)?
                (?P<urltype>video|app)/ #If the page is only for videos or for a game
                (?P<gameID>\d+)/?
                (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
                """

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is written in verbose syntax, so it needs re.VERBOSE.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        """Return a one-element list with a playlist of the game's videos.

        The age-check page of the game is requested with a fixed birth date;
        movie URLs, titles and thumbnails are then matched up in page order.
        Raises ExtractorError when the URL or the page cannot be parsed.
        """
        m = re.match(self._VALID_URL, url, re.VERBOSE)
        if m is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        gameID = m.group('gameID')
        videourl = 'http://store.steampowered.com/agecheck/video/%s/?snr=1_agecheck_agecheck__age-gate&ageDay=1&ageMonth=January&ageYear=1970' % gameID
        self.report_age_confirmation()
        webpage = self._download_webpage(videourl, gameID)

        title_match = re.search(r'<h2 class="pageheader">(?P<game_title>.*?)</h2>', webpage)
        if title_match is None:
            raise ExtractorError(u'Unable to extract game title')
        game_title = title_match.group('game_title')

        urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
        namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
        thumbsRE = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'
        mweb = re.finditer(urlRE, webpage)
        titles = re.finditer(namesRE, webpage)
        thumbs = re.finditer(thumbsRE, webpage)

        videos = []
        # The three match streams are paired positionally.
        for vid, vtitle, thumb in zip(mweb, titles, thumbs):
            video_id = vid.group('videoID')
            video_url = vid.group('videoURL')
            if not video_url:
                raise ExtractorError(u'Cannot find video url for %s' % video_id)
            videos.append({
                'id': video_id,
                'url': video_url,
                'ext': 'flv',
                'title': unescapeHTML(vtitle.group('videoName')),
                'thumbnail': thumb.group('thumbnail')
            })
        return [self.playlist_result(videos, gameID, game_title)]
class UstreamIE(InfoExtractor):
    """Information extractor for recorded videos on ustream.tv."""
    _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
    IE_NAME = u'ustream'

    def _real_extract(self, url):
        """Return the info dictionary list for one recorded video.

        The media URL is built from the numeric id in the page URL; title
        and uploader id are scraped from the page.
        Raises ExtractorError when the URL or either field cannot be
        matched (instead of failing with AttributeError on a missing match).
        """
        m = re.match(self._VALID_URL, url)
        if m is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = m.group('videoID')
        video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
        webpage = self._download_webpage(url, video_id)

        m = re.search(r'data-title="(?P<title>.+)"', webpage)
        if m is None:
            raise ExtractorError(u'Unable to extract title')
        title = m.group('title')

        m = re.search(r'<a class="state" data-content-type="channel" data-content-id="(?P<uploader>\d+)"', webpage)
        if m is None:
            raise ExtractorError(u'Unable to extract uploader')
        uploader = m.group('uploader')

        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'flv',
            'title': title,
            'uploader': uploader
        }
        return [info]
class WorldStarHipHopIE(InfoExtractor):
    """Information extractor for worldstarhiphop.com / worldstarcandy.com."""
    _VALID_URL = r'https?://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
    IE_NAME = u'WorldStarHipHop'

    def _real_extract(self, url):
        """Scrape media URL, title and thumbnail from a video page.

        Raises ExtractorError when the media URL or the title is missing.
        """
        video_id = re.match(self._VALID_URL, url).group('id')
        webpage_src = self._download_webpage(url, video_id)

        src_match = re.search(r'so\.addVariable\("file","(.*?)"\)', webpage_src)
        if src_match is None:
            raise ExtractorError(u'Cannot find video url for %s' % video_id)
        video_url = src_match.group(1)
        ext = 'mp4' if 'mp4' in video_url else 'flv'

        title_match = re.search(r"<title>(.*)</title>", webpage_src)
        if title_match is None:
            raise ExtractorError(u'Cannot determine title')
        title = title_match.group(1)

        # Pages without a thumbnail are WSHH candy videos: they have no
        # image and the proper title lives in the "candytitles" element.
        thumbnail = None
        thumb_match = re.search(r'rel="image_src" href="(.*)" />', webpage_src)
        if thumb_match is not None:
            thumbnail = thumb_match.group(1)
        else:
            candy_match = re.search(r"""candytitles.*>(.*)</span>""", webpage_src)
            if candy_match is not None:
                title = candy_match.group(1)

        return [{
            'id': video_id,
            'url': video_url,
            'title': title,
            'thumbnail': thumbnail,
            'ext': ext,
        }]
class RBMARadioIE(InfoExtractor):
    """Information extractor for shows on rbmaradio.com."""
    _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'

    def _real_extract(self, url):
        """Read the show metadata embedded as JSON in the show page.

        Raises ExtractorError when the metadata script is missing or does
        not contain valid JSON.
        """
        video_id = re.match(self._VALID_URL, url).group('videoID')

        webpage = self._download_webpage(url, video_id)
        gon_match = re.search(r'<script>window.gon = {.*?};gon\.show=(.+?);</script>', webpage)
        if not gon_match:
            raise ExtractorError(u'Cannot find metadata')

        try:
            data = json.loads(gon_match.group(1))
        except ValueError as e:
            raise ExtractorError(u'Invalid JSON: ' + str(e))

        video_url = data['akamai_url'] + '&cbr=256'
        # The extension is whatever follows the last dot of the URL path.
        video_ext = compat_urllib_parse_urlparse(video_url).path.rpartition('.')[2]
        host = data.get('host', {})
        return [{
            'id': video_id,
            'url': video_url,
            'ext': video_ext,
            'title': data['title'],
            'description': data.get('teaser_text'),
            'location': data.get('country_of_origin'),
            'uploader': host.get('name'),
            'uploader_id': host.get('slug'),
            'thumbnail': data.get('image', {}).get('large_url_2x'),
            'duration': data.get('duration'),
        }]
class YouPornIE(InfoExtractor):
    """Information extractor for youporn.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'

    def _print_formats(self, formats):
        """Print all available formats"""
        print(u'Available formats:')
        print(u'ext\t\tformat')
        print(u'---------------------------------')
        for entry in formats:
            print(u'%s\t\t%s' % (entry['ext'], entry['format']))

    def _specific(self, req_format, formats):
        """Return the first entry whose 'format' equals req_format, or None."""
        for entry in formats:
            if entry["format"] == req_format:
                return entry
        return None

    def _real_extract(self, url):
        """Extract every download link of a video and apply format selection.

        Returns a list of info dictionaries chosen by the downloader's
        'format' parameter (best, worst, all, or a specific format name), or
        None after printing the format table when 'listformats' is set.
        Raises ExtractorError for a non-matching URL, a missing title or
        download list, or an unavailable requested format.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('videoid')

        # The site requires an age confirmation cookie.
        req = compat_urllib_request.Request(url)
        req.add_header('Cookie', 'age_verified=1')
        webpage = self._download_webpage(req, video_id)

        # Get the video title
        result = re.search(r'<h1.*?>(?P<title>.*)</h1>', webpage)
        if result is None:
            raise ExtractorError(u'Unable to extract video title')
        video_title = result.group('title').strip()

        # Get the video date (optional)
        result = re.search(r'Date:</label>(?P<date>.*) </li>', webpage)
        if result is None:
            self._downloader.report_warning(u'unable to extract video date')
            upload_date = None
        else:
            upload_date = unified_strdate(result.group('date').strip())

        # Get the video uploader (optional)
        result = re.search(r'Submitted:</label>(?P<uploader>.*)</li>', webpage)
        if result is None:
            self._downloader.report_warning(u'unable to extract uploader')
            video_uploader = None
        else:
            video_uploader = clean_html(result.group('uploader').strip())

        # Get all of the formats available
        DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
        result = re.search(DOWNLOAD_LIST_RE, webpage)
        if result is None:
            raise ExtractorError(u'Unable to extract download list')
        download_list_html = result.group('download_list').strip()

        # Get all of the links from the page
        LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
        links = re.findall(LINK_RE, download_list_html)
        if not links:
            raise ExtractorError(u'ERROR: no known formats available for video')

        self.to_screen(u'Links found: %d' % len(links))

        formats = []
        for link in links:
            # A link looks like this:
            # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
            # A path looks like this:
            # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
            video_url = unescapeHTML(link)
            path = compat_urllib_parse_urlparse(video_url).path
            extension = os.path.splitext(path)[1][1:]
            # Fifth path segment is "<size>_<bitrate>_<id>".
            format_parts = path.split('/')[4].split('_')[:2]
            size = format_parts[0]
            bitrate = format_parts[1]

            formats.append({
                'id': video_id,
                'url': video_url,
                'uploader': video_uploader,
                'upload_date': upload_date,
                'title': u'%s-%s-%s' % (video_title, size, bitrate),
                'ext': extension,
                'format': "-".join(format_parts),
                'thumbnail': None,
                'description': None,
                'player_url': None
            })

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        req_format = self._downloader.params.get('format', None)
        self.to_screen(u'Format: %s' % req_format)

        if req_format is None or req_format == 'best':
            return [formats[0]]
        if req_format == 'worst':
            return [formats[-1]]
        if req_format in ('-1', 'all'):
            return formats

        selected = self._specific(req_format, formats)
        if selected is None:
            raise ExtractorError(u'Requested format not available')
        return [selected]
class PornotubeIE(InfoExtractor):
    """Information extractor for pornotube.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'

    def _real_extract(self, url):
        """Extract the media URL and upload date of one video page.

        Id and title are taken from the page URL itself.
        Raises ExtractorError for a non-matching URL or when the media URL
        or the upload date cannot be found in the page.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('videoid')
        video_title = mobj.group('title')

        # Get webpage content
        webpage = self._download_webpage(url, video_id)

        # Get the video URL
        VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
        result = re.search(VIDEO_URL_RE, webpage)
        if result is None:
            raise ExtractorError(u'Unable to extract video url')
        video_url = compat_urllib_parse.unquote(result.group('url'))

        # Get the uploaded date
        VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
        result = re.search(VIDEO_UPLOADED_RE, webpage)
        if result is None:
            # This failure concerns the date, not the title.
            raise ExtractorError(u'Unable to extract upload date')
        upload_date = unified_strdate(result.group('date'))

        info = {'id': video_id,
                'url': video_url,
                'uploader': None,
                'upload_date': upload_date,
                'title': video_title,
                'ext': 'flv',
                'format': 'flv'}

        return [info]
class YouJizzIE(InfoExtractor):
    """Information extractor for youjizz.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'

    def _real_extract(self, url):
        """Follow the embed page of a video and read the media URL from it.

        Raises ExtractorError for a non-matching URL or when the title, the
        embed page link or the media URL cannot be found.
        """
        url_match = re.match(self._VALID_URL, url)
        if url_match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # Get webpage content
        webpage = self._download_webpage(url, url_match.group('videoid'))

        # Get the video title
        title_match = re.search(r'<title>(?P<title>.*)</title>', webpage)
        if title_match is None:
            raise ExtractorError(u'ERROR: unable to extract video title')
        video_title = title_match.group('title').strip()

        # Get the embed page; its numeric id replaces the id from the URL
        embed_match = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
        if embed_match is None:
            raise ExtractorError(u'ERROR: unable to extract embed page')
        embed_page_url = embed_match.group(0).strip()
        video_id = embed_match.group('videoid')

        embed_page = self._download_webpage(embed_page_url, video_id)

        # Get the video URL
        source_match = re.search(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);', embed_page)
        if source_match is None:
            raise ExtractorError(u'ERROR: unable to extract video url')

        return [{
            'id': video_id,
            'url': source_match.group('source'),
            'title': video_title,
            'ext': 'flv',
            'format': 'flv',
            'player_url': embed_page_url,
        }]
class EightTracksIE(InfoExtractor):
    """Information extractor for mixes on 8tracks.com."""
    _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'

    def _real_extract(self, url):
        """Walk through a mix track by track and collect every song.

        The mix description is read from the page; the play API is then
        queried with a random session id until it reports the last track.
        Raises ExtractorError for a non-matching URL or a page without mix
        information.
        """
        url_match = re.match(self._VALID_URL, url)
        if url_match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        playlist_id = url_match.group('id')

        webpage = self._download_webpage(url, playlist_id)

        mix_match = re.search(r"PAGE.mix = (.*?);\n", webpage, flags=re.DOTALL)
        if not mix_match:
            raise ExtractorError(u'Cannot find trax information')
        data = json.loads(mix_match.group(1))

        session = str(random.randint(0, 1000000000))
        mix_id = data['id']
        track_count = data['tracks_count']
        next_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)

        tracks = []
        position = 0
        while True:
            position += 1
            api_json = self._download_webpage(next_url, playlist_id,
                note=u'Downloading song information %s/%s' % (str(position), track_count),
                errnote=u'Failed to download song information')
            api_data = json.loads(api_json)
            track_data = api_data[u'set']['track']
            tracks.append({
                'id': track_data['id'],
                'url': track_data['track_file_stream_url'],
                'title': track_data['performer'] + u' - ' + track_data['name'],
                'raw_title': track_data['name'],
                'uploader_id': data['user']['login'],
                'ext': 'm4a',
            })
            if api_data['set']['at_last_track']:
                return tracks
            # Ask for the track that follows the one just received.
            next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
class KeekIE(InfoExtractor):
    """Information extractor for keek.com."""
    _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'

    def _real_extract(self, url):
        """Return the info dictionary list for one keek.

        Media and thumbnail URLs are built from the id in the page URL;
        title and uploader are scraped from the page.
        Raises ExtractorError when the URL or either field cannot be
        matched (instead of failing with AttributeError on a missing match).
        """
        m = re.match(self._VALID_URL, url)
        if m is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = m.group('videoID')
        video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
        thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id
        webpage = self._download_webpage(url, video_id)

        m = re.search(r'<meta property="og:title" content="(?P<title>.*?)"', webpage)
        if m is None:
            raise ExtractorError(u'Unable to extract title')
        title = unescapeHTML(m.group('title'))

        m = re.search(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>', webpage)
        if m is None:
            raise ExtractorError(u'Unable to extract uploader')
        uploader = clean_html(m.group('uploader'))

        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'thumbnail': thumbnail,
            'uploader': uploader
        }
        return [info]
class TEDIE(InfoExtractor):
    """Information extractor for talks and playlists on ted.com."""
    _VALID_URL=r'''http://www\.ted\.com/
                   (
                        ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
                        |
                        ((?P<type_talk>talks)) # We have a simple talk
                   )
                   (/lang/(.*?))? # The url may contain the language
                   /(?P<name>\w+) # Here goes the name and then ".html"
                   '''

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is written in verbose syntax, so it needs re.VERBOSE.
        matched = re.match(cls._VALID_URL, url, re.VERBOSE)
        return matched is not None

    def _real_extract(self, url):
        """Dispatch to talk or playlist extraction depending on the URL."""
        url_match = re.match(self._VALID_URL, url, re.VERBOSE)
        if url_match.group('type_talk'):
            return [self._talk_info(url)]

        playlist_id = url_match.group('playlist_id')
        name = url_match.group('name')
        self.to_screen(u'Getting info of playlist %s: "%s"' % (playlist_id,name))
        return [self._playlist_videos_info(url, name, playlist_id)]

    def _talk_video_link(self,mediaSlug):
        '''Returns the video link for that mediaSlug'''
        link_template = 'http://download.ted.com/talks/%s.mp4'
        return link_template % mediaSlug

    def _playlist_videos_info(self,url,name,playlist_id=0):
        '''Returns the videos of the playlist'''
        video_RE = r'''
                     <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
                     ([.\s]*?)data-playlist_item_id="(\d+)"
                     ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
                     '''
        video_name_RE = r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
        playlist_RE = r'div class="headline">(\s*?)<h1>(\s*?)<span>(?P<playlist_title>.*?)</span>'

        webpage = self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
        video_matches = re.finditer(video_RE, webpage, re.VERBOSE)
        name_matches = re.finditer(video_name_RE, webpage)

        playlist_title = re.search(playlist_RE, webpage).group('playlist_title')

        # Video items and talk links are paired positionally; each talk is
        # delegated back to this extractor through a URL result.
        playlist_entries = []
        for video_match, name_match in zip(video_matches, name_matches):
            video_id = video_match.group('video_id')
            talk_url = 'http://www.ted.com%s' % name_match.group('talk_url')
            playlist_entries.append(self.url_result(talk_url, 'TED'))
        return self.playlist_result(playlist_entries, playlist_id = playlist_id, playlist_title = playlist_title)

    def _talk_info(self, url, video_id=0):
        """Return the video for the talk in the url"""
        videoName = re.match(self._VALID_URL, url, re.VERBOSE).group('name')
        webpage = self._download_webpage(url, video_id, 'Downloading \"%s\" page' % videoName)
        # If the url includes the language we get the title translated
        title = re.search(r'<span id="altHeadline" >(?P<title>.*)</span>', webpage).group('title')
        info_RE = r'''<script\ type="text/javascript">var\ talkDetails\ =(.*?)
                        "id":(?P<videoID>[\d]+).*?
                        "mediaSlug":"(?P<mediaSlug>[\w\d]+?)"'''
        thumb_RE = r'</span>[\s.]*</div>[\s.]*<img src="(?P<thumbnail>.*?)"'
        thumb_match = re.search(thumb_RE, webpage)
        info_match = re.search(info_RE, webpage, re.VERBOSE)
        video_id = info_match.group('videoID')
        video_url = self._talk_video_link(info_match.group('mediaSlug'))
        return {
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'thumbnail': thumb_match.group('thumbnail')
        }
class MySpassIE(InfoExtractor):
    """Information extractor for myspass.de."""
    _VALID_URL = r'http://www.myspass.de/.*'

    def _real_extract(self, url):
        """Read the XML metadata of a video and build its info dictionary.

        Raises ExtractorError when the metadata carries no download URL or
        no title.
        """
        META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'

        # video id is the last path element of the URL
        # usually there is a trailing slash, so also try the second but last
        url_path = compat_urllib_parse_urlparse(url).path
        url_parent_path, video_id = os.path.split(url_path)
        if not video_id:
            _, video_id = os.path.split(url_parent_path)

        # get metadata
        metadata_url = META_DATA_URL_TEMPLATE % video_id
        metadata_text = self._download_webpage(metadata_url, video_id)
        metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))

        # extract values from metadata
        url_flv_el = metadata.find('url_flv')
        if url_flv_el is None:
            raise ExtractorError(u'Unable to extract download url')
        video_url = url_flv_el.text
        extension = os.path.splitext(video_url)[1][1:]

        title_el = metadata.find('title')
        if title_el is None:
            raise ExtractorError(u'Unable to extract title')
        title = title_el.text

        # The format falls back to the file extension when the metadata
        # does not name one.
        format_id_el = metadata.find('format_id')
        video_format = extension if format_id_el is None else format_id_el.text

        description_el = metadata.find('description')
        description = description_el.text if description_el is not None else None

        imagePreview_el = metadata.find('imagePreview')
        thumbnail = imagePreview_el.text if imagePreview_el is not None else None

        info = {
            'id': video_id,
            'url': video_url,
            'title': title,
            'ext': extension,
            'format': video_format,
            'thumbnail': thumbnail,
            'description': description
        }
        return [info]
class SpiegelIE(InfoExtractor):
    """Information extractor for spiegel.de videos."""
    _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'

    def _real_extract(self, url):
        """Combine the page title with the file data of the video's XML.

        The last entry of the XML document supplies file name and duration.
        Raises ExtractorError when the page has no title element.
        """
        video_id = re.match(self._VALID_URL, url).group('videoID')

        webpage = self._download_webpage(url, video_id)
        title_match = re.search(r'<div class="spVideoTitle">(.*?)</div>', webpage)
        if not title_match:
            raise ExtractorError(u'Cannot find title')
        video_title = unescapeHTML(title_match.group(1))

        xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
        xml_code = self._download_webpage(
            xml_url, video_id,
            note=u'Downloading XML', errnote=u'Failed to download XML')

        variants = xml.etree.ElementTree.fromstring(xml_code)
        last_type = variants[-1]
        filename = last_type.findall('./filename')[0].text
        duration = float(last_type.findall('./duration')[0].text)

        return [{
            'id': video_id,
            'url': 'http://video2.spiegel.de/flash/' + filename,
            'ext': filename.rpartition('.')[2],
            'title': video_title,
            'duration': duration,
        }]
class LiveLeakIE(InfoExtractor):
    """Information extractor for liveleak.com."""

    # The scheme part accepts both http and https ("http?" would only have
    # matched "htt" or "http" and rejected every https URL).
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
    IE_NAME = u'liveleak'

    def _real_extract(self, url):
        """Scrape media URL, title, description and uploader of a video.

        Description and uploader are optional and default to None.
        Raises ExtractorError for a non-matching URL or when the media URL
        or the title cannot be found.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('video_id')

        webpage = self._download_webpage(url, video_id)

        m = re.search(r'file: "(.*?)",', webpage)
        if not m:
            raise ExtractorError(u'Unable to find video url')
        video_url = m.group(1)

        m = re.search(r'<meta property="og:title" content="(?P<title>.*?)"', webpage)
        if not m:
            raise ExtractorError(u'Cannot find video title')
        title = unescapeHTML(m.group('title')).replace('LiveLeak.com -', '').strip()

        m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
        desc = unescapeHTML(m.group('desc')) if m else None

        m = re.search(r'By:.*?(\w+)</a>', webpage)
        uploader = clean_html(m.group(1)) if m else None

        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'description': desc,
            'uploader': uploader
        }

        return [info]
class ARDIE(InfoExtractor):
    """Information extractor for the ARD / Das Erste media libraries."""
    _VALID_URL = r'^(?:https?://)?(?:(?:www\.)?ardmediathek\.de|mediathek\.daserste\.de)/(?:.*/)(?P<video_id>[^/\?]+)(?:\?.*)?'
    _TITLE = r'<h1(?: class="boxTopHeadline")?>(?P<title>.*)</h1>'
    _MEDIA_STREAM = r'mediaCollection\.addMediaStream\((?P<media_type>\d+), (?P<quality>\d+), "(?P<rtmp_url>[^"]*)", "(?P<video_url>[^"]*)", "[^"]*"\)'

    def _real_extract(self, url):
        """Pick the best default-type stream announced in the video page.

        Returns a one-element list; for RTMP streams the info dictionary
        additionally carries 'play_path'.
        Raises ExtractorError when the URL, the title or a usable stream
        cannot be determined. Page checks are explicit errors rather than
        assert statements, which are removed when Python runs with -O.
        """
        # determine video id from url
        m = re.match(self._VALID_URL, url)
        if m is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        numid = re.search(r'documentId=([0-9]+)', url)
        if numid:
            video_id = numid.group(1)
        else:
            video_id = m.group('video_id')

        # determine title and media streams from webpage
        html = self._download_webpage(url, video_id)
        title_match = re.search(self._TITLE, html)
        if title_match is None:
            raise ExtractorError(u'Unable to extract title')
        title = title_match.group('title')

        streams = [sm.groupdict() for sm in re.finditer(self._MEDIA_STREAM, html)]
        if not streams:
            if '"fsk"' not in html:
                raise ExtractorError(u'No media streams found')
            raise ExtractorError(u'This video is only available after 8:00 pm')

        # choose default media type and highest quality for now
        default_streams = [s for s in streams if int(s["media_type"]) == 0]
        if not default_streams:
            raise ExtractorError(u'No stream of the default media type found')
        stream = max(default_streams, key=lambda s: int(s["quality"]))

        # there's two possibilities: RTMP stream or HTTP download
        info = {'id': video_id, 'title': title, 'ext': 'mp4'}
        if stream['rtmp_url']:
            self.to_screen(u'RTMP download detected')
            if not stream['video_url'].startswith('mp4:'):
                raise ExtractorError(u'Unexpected RTMP play path: %s' % stream['video_url'])
            info["url"] = stream["rtmp_url"]
            info["play_path"] = stream['video_url']
        else:
            if not stream["video_url"].endswith('.mp4'):
                raise ExtractorError(u'Unexpected video URL: %s' % stream["video_url"])
            info["url"] = stream["video_url"]
        return [info]
class TumblrIE(InfoExtractor):
    """Information extractor for videos in tumblr.com posts."""
    _VALID_URL = r'http://(?P<blog_name>.*?)\.tumblr\.com/((post)|(video))/(?P<id>\d*)/(.*?)'

    def _real_extract(self, url):
        """Scrape the video file, poster and title from a post page.

        Returns an empty list (after a screen message) when the post page
        does not embed a video.
        """
        url_match = re.match(self._VALID_URL, url)
        video_id = url_match.group('id')
        blog = url_match.group('blog_name')

        # Always work on the canonical post URL.
        post_url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id)
        webpage = self._download_webpage(post_url, video_id)

        re_video = r'src=\\x22(?P<video_url>http://%s\.tumblr\.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P<ext>.*?)\\x22' % (blog, video_id)
        video = re.search(re_video, webpage)
        if video is None:
            self.to_screen("No video founded")
            return []

        # We pick the first poster
        re_thumb = r'posters(.*?)\[\\x22(?P<thumb>.*?)\\x22'
        thumb = re.search(re_thumb, webpage).group('thumb').replace('\\', '')

        # The only place where you can get a title, it's not complete,
        # but searching in other places doesn't work for all videos
        title_match = re.search(r'<title>(?P<title>.*?)</title>', webpage, re.DOTALL)
        title = unescapeHTML(title_match.group('title'))

        return [{
            'id': video_id,
            'url': video.group('video_url'),
            'title': title,
            'thumbnail': thumb,
            'ext': video.group('ext'),
        }]
class BandcampIE(InfoExtractor):
    """Information extractor for freely downloadable bandcamp.com tracks."""
    _VALID_URL = r'http://.*?\.bandcamp\.com/track/(?P<title>.*)'

    def _real_extract(self, url):
        """Resolve the final mp3-320 download URL of a free track.

        Raises ExtractorError when the track has no free download page or
        when any of the intermediate pages cannot be parsed.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        title = mobj.group('title')
        webpage = self._download_webpage(url, title)

        # We get the link to the free download page
        m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage)
        if m_download is None:
            raise ExtractorError(u'No free songs founded')
        download_link = m_download.group(1)

        m_id = re.search(r'var TralbumData = {(.*?)id: (?P<id>\d*?)$',
                         webpage, re.MULTILINE|re.DOTALL)
        if m_id is None:
            raise ExtractorError(u'Unable to extract track id')
        track_id = m_id.group('id')

        download_webpage = self._download_webpage(download_link, track_id,
                                                  'Downloading free downloads page')
        # We get the dictionary of the track from some javascript code
        m_items = re.search(r'items: (.*?),$', download_webpage, re.MULTILINE)
        if m_items is None:
            raise ExtractorError(u'Unable to extract track information')
        info = json.loads(m_items.group(1))[0]

        # We pick mp3-320 for now, until format selection can be easily implemented.
        mp3_info = info[u'downloads'][u'mp3-320']
        # If we try to use this url it says the link has expired
        initial_url = mp3_info[u'url']
        re_url = r'(?P<server>http://(.*?)\.bandcamp\.com)/download/track\?enc=mp3-320&fsig=(?P<fsig>.*?)&id=(?P<id>.*?)&ts=(?P<ts>.*)$'
        m_url = re.match(re_url, initial_url)
        if m_url is None:
            raise ExtractorError(u'Unexpected download url: %s' % initial_url)

        # We build the url we will use to get the final track url
        # This url is built in Bandcamp in the script download_bunde_*.js
        request_url = '%s/statdownload/track?enc=mp3-320&fsig=%s&id=%s&ts=%s&.rand=665028774616&.vrs=1' % (m_url.group('server'), m_url.group('fsig'), track_id, m_url.group('ts'))
        final_url_webpage = self._download_webpage(request_url, track_id, 'Requesting download url')
        # If we could correctly generate the .rand field the url would be
        # in the "download_url" key
        m_final = re.search(r'"retry_url":"(.*?)"', final_url_webpage)
        if m_final is None:
            raise ExtractorError(u'Unable to extract final download url')
        final_url = m_final.group(1)

        track_info = {
            'id': track_id,
            'title': info[u'title'],
            'ext': 'mp3',
            'url': final_url,
            'thumbnail': info[u'thumb_url'],
            'uploader': info[u'artist']
        }

        return [track_info]
class RedTubeIE(InfoExtractor):
    """Information Extractor for redtube"""
    _VALID_URL = r'(?:http://)?(?:www\.)?redtube\.com/(?P<id>[0-9]+)'

    def _real_extract(self,url):
        """Scrape the mp4 source and the title from a video page.

        Raises ExtractorError for a non-matching URL or when the media URL
        or the title cannot be found.
        """
        url_match = re.match(self._VALID_URL, url)
        if url_match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = url_match.group('id')
        webpage = self._download_webpage(url, video_id)
        self.report_extraction(video_id)

        source_match = re.search(r'<source src="(.+)" type="video/mp4">', webpage)
        if source_match is None:
            raise ExtractorError(u'Unable to extract media URL')

        title_match = re.search('<h1 class="videoTitle slidePanelMovable">(.+)</h1>', webpage)
        if title_match is None:
            raise ExtractorError(u'Unable to extract title')

        return [{
            'id': video_id,
            'url': source_match.group(1),
            'ext': 'mp4',
            'title': title_match.group(1),
        }]
class InaIE(InfoExtractor):
    """Information Extractor for Ina.fr"""
    _VALID_URL = r'(?:http://)?(?:www.)?ina\.fr/video/(?P<id>I[0-9]+)/.*'

    def _real_extract(self,url):
        """Read media URL and title from the MRSS notice of a video.

        Raises ExtractorError when the notice lacks the media URL or the
        title.
        """
        video_id = re.match(self._VALID_URL, url).group('id')
        mrss_url = 'http://player.ina.fr/notices/%s.mrss' % video_id
        webpage = self._download_webpage(mrss_url, video_id)

        media_match = re.search(r'<media:player url="(?P<mp4url>http://mp4.ina.fr/[^"]+\.mp4)', webpage)
        if media_match is None:
            raise ExtractorError(u'Unable to extract media URL')

        title_match = re.search(r'<title><!\[CDATA\[(?P<titre>.*?)]]></title>', webpage)
        if title_match is None:
            raise ExtractorError(u'Unable to extract title')

        return [{
            'id': video_id,
            'url': media_match.group(1),
            'ext': 'mp4',
            'title': title_match.group(1),
        }]
4090 def gen_extractors():
4091 """ Return a list of an instance of every supported extractor.
4092 The order does matter; the first extractor matched is the one handling the URL.
4095 YoutubePlaylistIE(),
4120 StanfordOpenClassroomIE(),
4130 WorldStarHipHopIE(),
def get_info_extractor(ie_name):
    """Look up an info extractor class by its short name.

    The class is expected under the module-level name ``ie_name + 'IE'``;
    a KeyError is raised when no such name exists.
    """
    class_name = ie_name + 'IE'
    module_namespace = globals()
    return module_namespace[class_name]