2 # -*- coding: utf-8 -*-
4 from __future__ import absolute_import
15 import xml.etree.ElementTree
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information about the video (or videos) the URL refers to. This
    information includes the real video URL, the video title, author and
    others. The information is stored in a dictionary which is then
    passed to the FileDownloader. The FileDownloader processes this
    information possibly downloading the video to the file system, among
    other possible outcomes.

    The dictionaries must include the following fields:

    id:             Video identifier.
    url:            Final video URL.
    title:          Video title, unescaped.
    ext:            Video filename extension.

    The following fields are optional:

    format:         The video format, defaults to ext (used for --get-format)
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    uploader:       Full name of the video uploader.
    upload_date:    Video upload date (YYYYMMDD).
    uploader_id:    Nickname or id of the video uploader.
    location:       Physical location of the video.
    player_url:     SWF Player URL (used for rtmpdump).
    subtitles:      The subtitle file contents.
    urlhandle:      [internal] The urlHandle to be used to download the file,
                    like returned by urllib.request.urlopen

    The fields should all be Unicode strings.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.

    _real_extract() must return a *list* of information dictionaries as
    described above.

    Finally, the _WORKING attribute should be set to False for broken IEs
    in order to warn the users and skip the tests.
    """

    _ready = False
    _downloader = None
    _WORKING = True

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        self._ready = False
        self.set_downloader(downloader)

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        return re.match(cls._VALID_URL, url) is not None

    @classmethod
    def working(cls):
        """Getter method for _WORKING."""
        return cls._WORKING

    def initialize(self):
        """Initializes an instance (authentication, etc)."""
        # Run the real initialization at most once per instance.
        if not self._ready:
            self._real_initialize()
            self._ready = True

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        self.initialize()
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        pass

    @property
    def IE_NAME(self):
        # Derived from the class name: "FooIE" -> "Foo".
        return type(self).__name__[:-2]

    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the response handle """
        if note is None:
            self.report_download_webpage(video_id)
        elif note is not False:
            self.to_screen(u'%s: %s' % (video_id, note))
        try:
            return compat_urllib_request.urlopen(url_or_request)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            if errnote is None:
                errnote = u'Unable to download webpage'
            raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2])

    def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns a tuple (page content as string, URL handle) """
        urlh = self._request_webpage(url_or_request, video_id, note, errnote)
        # Honour the charset declared in the Content-Type header, if any.
        content_type = urlh.headers.get('Content-Type', '')
        m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
        if m:
            encoding = m.group(1)
        else:
            encoding = 'utf-8'
        webpage_bytes = urlh.read()
        if self._downloader.params.get('dump_intermediate_pages', False):
            try:
                url = url_or_request.get_full_url()
            except AttributeError:
                url = url_or_request
            self.to_screen(u'Dumping request to ' + url)
            dump = base64.b64encode(webpage_bytes).decode('ascii')
            self._downloader.to_screen(dump)
        content = webpage_bytes.decode(encoding, 'replace')
        return (content, urlh)

    def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the data of the page as a string """
        return self._download_webpage_handle(url_or_request, video_id, note, errnote)[0]

    def to_screen(self, msg):
        """Print msg to screen, prefixing it with '[ie_name]'"""
        self._downloader.to_screen(u'[%s] %s' % (self.IE_NAME, msg))

    def report_extraction(self, id_or_name):
        """Report information extraction."""
        self.to_screen(u'%s: Extracting information' % id_or_name)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self.to_screen(u'%s: Downloading webpage' % video_id)

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self.to_screen(u'Confirming age')

    #Methods for following #608
    #They set the correct value of the '_type' key
    def video_result(self, video_info):
        """Returns a video"""
        video_info['_type'] = 'video'
        return video_info

    def url_result(self, url, ie=None):
        """Returns a url that points to a page that should be processed"""
        #TODO: ie should be the class used for getting the info
        video_info = {'_type': 'url',
                      'url': url,
                      'ie_key': ie}
        return video_info

    def playlist_result(self, entries, playlist_id=None, playlist_title=None):
        """Returns a playlist"""
        video_info = {'_type': 'playlist',
                      'entries': entries}
        if playlist_id:
            video_info['id'] = playlist_id
        if playlist_title:
            video_info['title'] = playlist_title
        return video_info
class SearchInfoExtractor(InfoExtractor):
    """
    Base class for paged search queries extractors.
    They accept urls in the format _SEARCH_KEY(|all|[0-9]):{query}
    Instances should define _SEARCH_KEY and _MAX_RESULTS.
    """

    @classmethod
    def _make_valid_url(cls):
        # Prefix is empty (one result), a positive number, or 'all'.
        return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY

    @classmethod
    def suitable(cls, url):
        return re.match(cls._make_valid_url(), url) is not None

    def _real_extract(self, query):
        mobj = re.match(self._make_valid_url(), query)
        if mobj is None:
            raise ExtractorError(u'Invalid search query "%s"' % query)

        prefix = mobj.group('prefix')
        query = mobj.group('query')
        if prefix == '':
            # No prefix -> single best result.
            return self._get_n_results(query, 1)
        elif prefix == 'all':
            return self._get_n_results(query, self._MAX_RESULTS)
        else:
            n = int(prefix)
            if n <= 0:
                raise ExtractorError(u'invalid download number %s for query "%s"' % (n, query))
            elif n > self._MAX_RESULTS:
                # Clamp to the maximum the service can return.
                self._downloader.report_warning(u'%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
                n = self._MAX_RESULTS
            return self._get_n_results(query, n)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        # Fixed typo in the error message: "sublclasses" -> "subclasses".
        raise NotImplementedError("This method must be implemented by subclasses")
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""

    _VALID_URL = r"""^
                     (
                         (?:https?://)?                                       # http(s):// (optional)
                         (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
                            tube\.majestyc\.net/)                             # the various hostnames, with wildcard subdomains
                         (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
                         (?:                                                  # the various things that can precede the ID:
                             (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
                             |(?:                                             # or the v= param in all its forms
                                 (?:watch(?:_popup)?(?:\.php)?)?              # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                                 (?:\?|\#!?)                                  # the params delimiter ? or # or #!
                                 (?:.*?&)?                                    # any other preceding param (like /?s=tuff&v=xxxx)
                                 v=
                             )
                         )?                                                   # optional -> youtube.com/xxxx is OK
                     )?                                                       # all until now is optional -> you can pass the naked ID
                     ([0-9A-Za-z_-]+)                                         # here is it! the YouTube video ID
                     (?(1).+)?                                                # if we found the ID, everything can follow
                     $"""
    _LANG_URL = r'https://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    _NETRC_MACHINE = 'youtube'
    # Listed in order of quality
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
    # itag -> container extension; 'flv' is the fallback elsewhere.
    _video_extensions = {
        '13': '3gp',
        '17': 'mp4',
        '18': 'mp4',
        '22': 'mp4',
        '37': 'mp4',
        '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
        '43': 'webm',
        '44': 'webm',
        '45': 'webm',
        '46': 'webm',
    }
    # itag -> "HxW" display string; '???' when unknown.
    _video_dimensions = {
        '5': '240x400',
        '6': '???',
        '13': '???',
        '17': '144x176',
        '18': '360x640',
        '22': '720x1280',
        '34': '360x640',
        '35': '480x854',
        '37': '1080x1920',
        '38': '3072x4096',
        '43': '360x640',
        '44': '480x854',
        '45': '720x1280',
        '46': '1080x1920',
    }
    IE_NAME = u'youtube'

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Playlist URLs also match _VALID_URL; defer them to YoutubePlaylistIE.
        if YoutubePlaylistIE.suitable(url): return False
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def report_lang(self):
        """Report attempt to set language."""
        self.to_screen(u'Setting language')

    def report_login(self):
        """Report attempt to log in."""
        self.to_screen(u'Logging in')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self.to_screen(u'%s: Downloading video webpage' % video_id)

    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download video info webpage."""
        self.to_screen(u'%s: Downloading video info webpage' % video_id)

    def report_video_subtitles_download(self, video_id):
        """Report attempt to check for available subtitles."""
        self.to_screen(u'%s: Checking available subtitles' % video_id)

    def report_video_subtitles_request(self, video_id, sub_lang, format):
        """Report attempt to download a subtitle track."""
        self.to_screen(u'%s: Downloading video subtitles for %s.%s' % (video_id, sub_lang, format))

    def report_video_subtitles_available(self, video_id, sub_lang_list):
        """Report available subtitles."""
        sub_lang = ",".join(list(sub_lang_list.keys()))
        self.to_screen(u'%s: Available subtitles for video: %s' % (video_id, sub_lang))

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self.to_screen(u'%s: Extracting video information' % video_id)

    def report_unavailable_format(self, video_id, format):
        """Report extracted video URL."""
        self.to_screen(u'%s: Format %s not available' % (video_id, format))

    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self.to_screen(u'RTMP download detected')

    def _get_available_subtitles(self, video_id):
        """Return {lang_code: name} of the available subtitle tracks,
        or a (error_message, None) tuple on failure."""
        self.report_video_subtitles_download(video_id)
        request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
        try:
            sub_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'unable to download video subtitles: %s' % compat_str(err), None)
        sub_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
        sub_lang_list = dict((l[1], l[0]) for l in sub_lang_list)
        if not sub_lang_list:
            return (u'video doesn\'t have subtitles', None)
        return sub_lang_list

    def _list_available_subtitles(self, video_id):
        sub_lang_list = self._get_available_subtitles(video_id)
        self.report_video_subtitles_available(video_id, sub_lang_list)

    def _request_subtitle(self, sub_lang, sub_name, video_id, format):
        """
        Return tuple:
        (error_message, sub_lang, sub)
        """
        self.report_video_subtitles_request(video_id, sub_lang, format)
        params = compat_urllib_parse.urlencode({
            'lang': sub_lang,
            'name': sub_name,
            'v': video_id,
            'fmt': format,
        })
        url = 'http://www.youtube.com/api/timedtext?' + params
        try:
            sub = compat_urllib_request.urlopen(url).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'unable to download video subtitles: %s' % compat_str(err), None, None)
        if not sub:
            return (u'Did not fetch video subtitles', None, None)
        return (None, sub_lang, sub)

    def _extract_subtitle(self, video_id):
        """
        Return a list with a tuple:
        [(error_message, sub_lang, sub)]
        """
        sub_lang_list = self._get_available_subtitles(video_id)
        sub_format = self._downloader.params.get('subtitlesformat')
        if isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles
            return [(sub_lang_list[0], None, None)]
        # Pick language: user request > English > first available.
        if self._downloader.params.get('subtitleslang', False):
            sub_lang = self._downloader.params.get('subtitleslang')
        elif 'en' in sub_lang_list:
            sub_lang = 'en'
        else:
            sub_lang = list(sub_lang_list.keys())[0]
        if not sub_lang in sub_lang_list:
            return [(u'no closed captions found in the specified language "%s"' % sub_lang, None, None)]

        subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
        return [subtitle]

    def _extract_all_subtitles(self, video_id):
        sub_lang_list = self._get_available_subtitles(video_id)
        sub_format = self._downloader.params.get('subtitlesformat')
        if isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles
            return [(sub_lang_list[0], None, None)]
        subtitles = []
        for sub_lang in sub_lang_list:
            subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
            subtitles.append(subtitle)
        return subtitles

    def _print_formats(self, formats):
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))

    def _real_initialize(self):
        """Set language and, when credentials are available, log in and
        confirm age. Best-effort: failures emit warnings, not errors
        (except age confirmation, which is fatal)."""
        if self._downloader is None:
            return

        username = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    username = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
                return

        # Set language
        request = compat_urllib_request.Request(self._LANG_URL)
        try:
            self.report_lang()
            compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to set language: %s' % compat_str(err))
            return

        # No authentication to be performed
        if username is None:
            return

        request = compat_urllib_request.Request(self._LOGIN_URL)
        try:
            login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err))
            return

        # Hidden anti-forgery tokens required by the login form.
        galx = None
        dsh = None
        match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page)
        if match:
            galx = match.group(1)
        match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page)
        if match:
            dsh = match.group(1)

        # Log in
        login_form_strs = {
            u'continue': u'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
            u'Email': username,
            u'GALX': galx,
            u'Passwd': password,
            u'PersistentCookie': u'yes',
            u'bgresponse': u'js_disabled',
            u'checkConnection': u'',
            u'checkedDomains': u'youtube',
            u'dnConn': u'',
            u'dsh': dsh,
            u'pstMsg': u'0',
            u'rmShown': u'1',
            u'secTok': u'',
            u'signIn': u'Sign in',
            u'timeStmp': u'',
            u'service': u'youtube',
            u'uilel': u'3',
            u'hl': u'en_US',
        }
        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        # chokes on unicode
        login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
        request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
            if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
                self._downloader.report_warning(u'unable to log in: bad username or password')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
            return

        # Confirm age
        age_form = {
            'next_url': '/',
            'action_confirm': 'Confirm',
        }
        request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
        try:
            self.report_age_confirmation()
            age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to confirm age: %s' % compat_str(err))

    def _extract_id(self, url):
        """Return the video id from any URL matching _VALID_URL."""
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group(2)
        return video_id

    def _real_extract(self, url):
        # Extract original video URL from URL with redirection, like age verification, using next_url parameter
        mobj = re.search(self._NEXT_URL_RE, url)
        if mobj:
            url = 'https://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
        video_id = self._extract_id(url)

        # Get video webpage
        self.report_video_webpage_download(video_id)
        url = 'https://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
        request = compat_urllib_request.Request(url)
        try:
            video_webpage_bytes = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to download video webpage: %s' % compat_str(err))

        video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')

        # Attempt to extract SWF player URL
        mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
        if mobj is not None:
            player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
        else:
            player_url = None

        # Get video info
        self.report_video_info_webpage_download(video_id)
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (video_id, el_type))
            video_info_webpage = self._download_webpage(video_info_url, video_id,
                                    note=False,
                                    errnote='unable to download video info webpage')
            video_info = compat_parse_qs(video_info_webpage)
            if 'token' in video_info:
                break
        if 'token' not in video_info:
            if 'reason' in video_info:
                raise ExtractorError(u'YouTube said: %s' % video_info['reason'][0])
            else:
                raise ExtractorError(u'"token" parameter not in video info for unknown reason')

        # Check for "rental" videos
        if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
            raise ExtractorError(u'"rental" videos not supported')

        # Start extracting information
        self.report_information_extraction(video_id)

        # uploader
        if 'author' not in video_info:
            raise ExtractorError(u'Unable to extract uploader name')
        video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

        # uploader_id
        video_uploader_id = None
        mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
        if mobj is not None:
            video_uploader_id = mobj.group(1)
        else:
            self._downloader.report_warning(u'unable to extract uploader nickname')

        # title
        if 'title' not in video_info:
            raise ExtractorError(u'Unable to extract video title')
        video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])

        # thumbnail image
        if 'thumbnail_url' not in video_info:
            self._downloader.report_warning(u'unable to extract video thumbnail')
            video_thumbnail = ''
        else: # don't panic if we can't find it
            video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

        # upload date
        upload_date = None
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
        if mobj is not None:
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
            upload_date = unified_strdate(upload_date)

        # description
        video_description = get_element_by_id("eow-description", video_webpage)
        if video_description:
            video_description = clean_html(video_description)
        else:
            fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
            if fd_mobj:
                video_description = unescapeHTML(fd_mobj.group(1))
            else:
                video_description = u''

        # subtitles
        video_subtitles = None

        if self._downloader.params.get('writesubtitles', False):
            video_subtitles = self._extract_subtitle(video_id)
            if video_subtitles:
                (sub_error, sub_lang, sub) = video_subtitles[0]
                if sub_error:
                    self._downloader.report_error(sub_error)

        if self._downloader.params.get('allsubtitles', False):
            video_subtitles = self._extract_all_subtitles(video_id)
            for video_subtitle in video_subtitles:
                (sub_error, sub_lang, sub) = video_subtitle
                if sub_error:
                    self._downloader.report_error(sub_error)

        if self._downloader.params.get('listsubtitles', False):
            sub_lang_list = self._list_available_subtitles(video_id)
            return

        if 'length_seconds' not in video_info:
            self._downloader.report_warning(u'unable to extract video duration')
            video_duration = ''
        else:
            video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])

        # token
        video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])

        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            self.report_rtmp_download()
            video_url_list = [(None, video_info['conn'][0])]
        elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
            url_map = {}
            for url_data_str in video_info['url_encoded_fmt_stream_map'][0].split(','):
                url_data = compat_parse_qs(url_data_str)
                if 'itag' in url_data and 'url' in url_data:
                    url = url_data['url'][0] + '&signature=' + url_data['sig'][0]
                    if not 'ratebypass' in url: url += '&ratebypass=yes'
                    url_map[url_data['itag'][0]] = url

            format_limit = self._downloader.params.get('format_limit', None)
            available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
            if format_limit is not None and format_limit in available_formats:
                format_list = available_formats[available_formats.index(format_limit):]
            else:
                format_list = available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                raise ExtractorError(u'no known formats available for video')
            if self._downloader.params.get('listformats', None):
                self._print_formats(existing_formats)
                return
            if req_format is None or req_format == 'best':
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
            elif req_format in ('-1', 'all'):
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
            else:
                # Specific formats. We pick the first in a slash-delimited sequence.
                # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
                req_formats = req_format.split('/')
                video_url_list = None
                for rf in req_formats:
                    if rf in url_map:
                        video_url_list = [(rf, url_map[rf])]
                        break
                if video_url_list is None:
                    raise ExtractorError(u'requested format not available')
        else:
            raise ExtractorError(u'no conn or url_encoded_fmt_stream_map information found in video info')

        results = []
        for format_param, video_real_url in video_url_list:
            # Extension
            video_extension = self._video_extensions.get(format_param, 'flv')

            video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
                                              self._video_dimensions.get(format_param, '???'))

            results.append({
                'id':       video_id,
                'url':      video_real_url,
                'uploader': video_uploader,
                'uploader_id': video_uploader_id,
                'upload_date':  upload_date,
                'title':    video_title,
                'ext':      video_extension,
                'format':   video_format,
                'thumbnail':    video_thumbnail,
                'description':  video_description,
                'player_url':   player_url,
                'subtitles':    video_subtitles,
                'duration':     video_duration
            })
        return results
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com."""

    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    IE_NAME = u'metacafe'

    def report_disclaimer(self):
        """Report disclaimer retrieval."""
        self.to_screen(u'Retrieving disclaimer')

    def _real_initialize(self):
        # Retrieve disclaimer
        request = compat_urllib_request.Request(self._DISCLAIMER)
        try:
            self.report_disclaimer()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to retrieve disclaimer: %s' % compat_str(err))

        # Confirm age: post the family filter form to disable filtering.
        disclaimer_form = {
            'filters': '0',
            'submit': "Continue - I'm over 18",
        }
        request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
        try:
            self.report_age_confirmation()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to confirm age: %s' % compat_str(err))

    def _real_extract(self, url):
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group(1)

        # Check if video comes from YouTube
        mobj2 = re.match(r'^yt-(.*)$', video_id)
        if mobj2 is not None:
            return [self.url_result('http://www.youtube.com/watch?v=%s' % mobj2.group(1), 'Youtube')]

        # Retrieve video webpage to extract further information
        webpage = self._download_webpage('http://www.metacafe.com/watch/%s/' % video_id, video_id)

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
        if mobj is not None:
            mediaURL = compat_urllib_parse.unquote(mobj.group(1))
            video_extension = mediaURL[-3:]

            # Extract gdaKey if available
            mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
            if mobj is None:
                video_url = mediaURL
            else:
                gdaKey = mobj.group(1)
                video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
        else:
            # Fall back to the flashvars blob.
            mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
            if mobj is None:
                raise ExtractorError(u'Unable to extract media URL')
            vardict = compat_parse_qs(mobj.group(1))
            if 'mediaData' not in vardict:
                raise ExtractorError(u'Unable to extract media URL')
            mobj = re.search(r'"mediaURL":"(?P<mediaURL>http.*?)",(.*?)"key":"(?P<key>.*?)"', vardict['mediaData'][0])
            if mobj is None:
                raise ExtractorError(u'Unable to extract media URL')
            mediaURL = mobj.group('mediaURL').replace('\\/', '/')
            video_extension = mediaURL[-3:]
            video_url = '%s?__gda__=%s' % (mediaURL, mobj.group('key'))

        mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract title')
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'submitter=(.*?);', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract uploader nickname')
        video_uploader = mobj.group(1)

        return [{
            'id':       video_id.decode('utf-8'),
            'url':      video_url.decode('utf-8'),
            'uploader': video_uploader.decode('utf-8'),
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension.decode('utf-8'),
        }]
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion"""

    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
    IE_NAME = u'dailymotion'

    def _real_extract(self, url):
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group(1).split('_')[0].split('?')[0]

        video_extension = 'mp4'

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        # Disable the family filter so age-restricted videos are reachable.
        request.add_header('Cookie', 'family_filter=off')
        webpage = self._download_webpage(request, video_id)

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'\s*var flashvars = (.*)', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract media URL')
        flashvars = compat_urllib_parse.unquote(mobj.group(1))

        # Pick the best quality available, in descending order.
        for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
            if key in flashvars:
                max_quality = key
                self.to_screen(u'Using %s' % key)
                break
        else:
            raise ExtractorError(u'Unable to extract video URL')

        mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
        if mobj is None:
            raise ExtractorError(u'Unable to extract video URL')

        video_url = compat_urllib_parse.unquote(mobj.group(1)).replace('\\/', '/')

        # TODO: support choosing qualities

        mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract title')
        video_title = unescapeHTML(mobj.group('title'))

        video_uploader = None
        mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
        if mobj is None:
            # looking for official user
            mobj_official = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
            if mobj_official is None:
                self._downloader.report_warning(u'unable to extract uploader nickname')
            else:
                video_uploader = mobj_official.group(1)
        else:
            video_uploader = mobj.group(1)

        video_upload_date = None
        mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
        if mobj is not None:
            # Page shows DD-MM-YYYY; convert to YYYYMMDD.
            video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  video_upload_date,
            'title':    video_title,
            'ext':      video_extension,
        }]
class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com."""

    # TODO: the original _VALID_URL was:
    # r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
    # Check if it's necessary to keep the old extraction process
    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*(([\?\&]current=)|_)(?P<id>.*)\.(?P<ext>(flv)|(mp4))'
    IE_NAME = u'photobucket'

    def _real_extract(self, url):
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')

        video_extension = mobj.group('ext')

        # Retrieve video webpage to extract further information
        webpage = self._download_webpage(url, video_id)

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        # We try first by looking the javascript code:
        mobj = re.search(r'Pb\.Data\.Shared\.put\(Pb\.Data\.Shared\.MEDIA, (?P<json>.*?)\);', webpage)
        if mobj is not None:
            info = json.loads(mobj.group('json'))
            return [{
                'id':       video_id,
                'url':      info[u'downloadUrl'],
                'uploader': info[u'username'],
                'upload_date':  datetime.date.fromtimestamp(info[u'creationDate']).strftime('%Y%m%d'),
                'title':    info[u'title'],
                'ext':      video_extension,
                'thumbnail': info[u'thumbUrl'],
            }]

        # We try looking in other parts of the webpage
        mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract media URL')
        mediaURL = compat_urllib_parse.unquote(mobj.group(1))

        video_url = mediaURL

        mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract title')
        video_title = mobj.group(1).decode('utf-8')

        video_uploader = mobj.group(2).decode('utf-8')

        return [{
            'id':       video_id.decode('utf-8'),
            'url':      video_url.decode('utf-8'),
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension.decode('utf-8'),
        }]
class YahooIE(InfoExtractor):
    """Information extractor for screen.yahoo.com.

    Resolves a screen.yahoo.com page either via the cosmos.bcst.yahoo.com
    mrss REST API (short numeric id) or, when the page exposes a YUI
    CONTENT_ID, via the public YQL API.
    """
    # NOTE(review): this excerpt elides some scaffolding lines (the
    # `if ... is None:` guards, `return`, and parts of the info dict);
    # the code tokens below are reproduced unchanged.
    _VALID_URL = r'http://screen\.yahoo\.com/.*?-(?P<id>\d*?)\.html'

    def _real_extract(self, url):
        # Validate the URL and pull out the numeric video id.
        mobj = re.match(self._VALID_URL, url)
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)
        # Presence of a YUI CONTENT_ID selects which backend API is used.
        m_id = re.search(r'YUI\.namespace\("Media"\)\.CONTENT_ID = "(?P<new_id>.+?)";', webpage)
            # No long id: query the mrss REST API with the short id.
            # TODO: Check which url parameters are required
            info_url = 'http://cosmos.bcst.yahoo.com/rest/v2/pops;lmsoverride=1;outputformat=mrss;cb=974419660;id=%s;rd=news.yahoo.com;datacontext=mdb;lg=KCa2IihxG3qE60vQ7HtyUy' % video_id
            webpage = self._download_webpage(info_url, video_id, u'Downloading info webpage')
            # Scrape title/description/date/thumbnail from the mrss XML
            # (re.VERBOSE below makes the literal whitespace insignificant).
            info_re = r'''<title><!\[CDATA\[(?P<title>.*?)\]\]></title>.*
<description><!\[CDATA\[(?P<description>.*?)\]\]></description>.*
<media:pubStart><!\[CDATA\[(?P<date>.*?)\ .*\]\]></media:pubStart>.*
<media:content\ medium="image"\ url="(?P<thumb>.*?)"\ name="LARGETHUMB"
            self.report_extraction(video_id)
            m_info = re.search(info_re, webpage, re.VERBOSE|re.DOTALL)
                raise ExtractorError(u'Unable to extract video info')
            video_title = m_info.group('title')
            video_description = m_info.group('description')
            video_thumb = m_info.group('thumb')
            video_date = m_info.group('date')
            # Normalise mm/dd/YYYY to the YYYYMMDD upload_date convention.
            video_date = datetime.datetime.strptime(video_date, '%m/%d/%Y').strftime('%Y%m%d')

            # TODO: Find a way to get mp4 videos
            rest_url = 'http://cosmos.bcst.yahoo.com/rest/v2/pops;element=stream;outputformat=mrss;id=%s;lmsoverride=1;bw=375;dynamicstream=1;cb=83521105;tech=flv,mp4;rd=news.yahoo.com;datacontext=mdb;lg=KCa2IihxG3qE60vQ7HtyUy' % video_id
            webpage = self._download_webpage(rest_url, video_id, u'Downloading video url webpage')
            # The stream splits into an rtmp host ("url") and play path ("path").
            m_rest = re.search(r'<media:content url="(?P<url>.*?)" path="(?P<path>.*?)"', webpage)
            video_url = m_rest.group('url')
            video_path = m_rest.group('path')
                raise ExtractorError(u'Unable to extract video url')
        else: # We have to use a different method if another id is defined
            long_id = m_id.group('new_id')
            # Query the public YQL endpoint with the long content id.
            info_url = 'http://video.query.yahoo.com/v1/public/yql?q=SELECT%20*%20FROM%20yahoo.media.video.streams%20WHERE%20id%3D%22' + long_id + '%22%20AND%20format%3D%22mp4%2Cflv%22%20AND%20protocol%3D%22rtmp%2Chttp%22%20AND%20plrs%3D%2286Gj0vCaSzV_Iuf6hNylf2%22%20AND%20acctid%3D%22389%22%20AND%20plidl%3D%22%22%20AND%20pspid%3D%22792700001%22%20AND%20offnetwork%3D%22false%22%20AND%20site%3D%22ivy%22%20AND%20lang%3D%22en-US%22%20AND%20region%3D%22US%22%20AND%20override%3D%22none%22%3B&env=prod&format=json&callback=YUI.Env.JSONP.yui_3_8_1_1_1368368376830_335'
            webpage = self._download_webpage(info_url, video_id, u'Downloading info json')
            # The response is JSONP: strip the callback wrapper first.
            json_str = re.search(r'YUI.Env.JSONP.yui.*?\((.*?)\);', webpage).group(1)
            info = json.loads(json_str)
            res = info[u'query'][u'results'][u'mediaObj'][0]
            stream = res[u'streams'][0]
            video_path = stream[u'path']
            video_url = stream[u'host']
            # NOTE(review): `meta` is bound on a line elided from this
            # excerpt (presumably the mediaObj metadata) — confirm upstream.
            video_title = meta[u'title']
            video_description = meta[u'description']
            video_thumb = meta[u'thumbnail']
            video_date = None # I can't find it

        # Tail of the returned info dict (opening lines elided here).
            'play_path': video_path,
            'title':video_title,
            'description': video_description,
            'thumbnail': video_thumb,
            'upload_date': video_date,
class VimeoIE(InfoExtractor):
    """Information extractor for vimeo.com."""
    # NOTE(review): this excerpt elides some scaffolding lines (guards,
    # try:, and the head of the returned info dict); code tokens are
    # reproduced unchanged.

    # _VALID_URL matches Vimeo URLs
    _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|player)\.)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)'

    def _real_extract(self, url, new_video=True):
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')
        # Canonicalise the URL: force https and undo HLS redirect links.
        if not mobj.group('proto'):
            url = 'https://' + url
        if mobj.group('direct_link'):
            url = 'https://vimeo.com/' + video_id

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url, None, std_headers)
        webpage = self._download_webpage(request, video_id)

        # Now we begin extracting as much information as we can from what we
        # retrieved. First we extract the information common to all extractors,
        # and latter we extract those that are Vimeo specific.
        self.report_extraction(video_id)

        # Extract the config JSON
            config = webpage.split(' = {config:')[1].split(',assets:')[0]
            config = json.loads(config)
            # Distinguish embed-restricted videos from plain parse failures.
            if re.search('The creator of this video has not given you permission to embed it on this domain.', webpage):
                raise ExtractorError(u'The author has restricted the access to this video, try with the "--referer" option')
            raise ExtractorError(u'Unable to extract info section')

        # Extract title
        video_title = config["video"]["title"]

        # Extract uploader and uploader_id
        video_uploader = config["video"]["owner"]["name"]
        video_uploader_id = config["video"]["owner"]["url"].split('/')[-1]

        # Extract video thumbnail
        video_thumbnail = config["video"]["thumbnail"]

        # Extract video description
        video_description = get_element_by_attribute("itemprop", "description", webpage)
        if video_description: video_description = clean_html(video_description)
        else: video_description = u''

        # Extract upload date
        video_upload_date = None
        mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
        if mobj is not None:
            video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)

        # Vimeo specific: extract request signature and timestamp
        sig = config['request']['signature']
        timestamp = config['request']['timestamp']

        # Vimeo specific: extract video codec and quality information
        # First consider quality, then codecs, then take everything
        # TODO bind to format param
        codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
        files = { 'hd': [], 'sd': [], 'other': []}
        for codec_name, codec_extension in codecs:
            if codec_name in config["video"]["files"]:
                if 'hd' in config["video"]["files"][codec_name]:
                    files['hd'].append((codec_name, codec_extension, 'hd'))
                elif 'sd' in config["video"]["files"][codec_name]:
                    files['sd'].append((codec_name, codec_extension, 'sd'))
                    files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))

        # Pick the first available (codec, ext, quality) by quality order.
        for quality in ('hd', 'sd', 'other'):
            if len(files[quality]) > 0:
                video_quality = files[quality][0][2]
                video_codec = files[quality][0][0]
                video_extension = files[quality][0][1]
                self.to_screen(u'%s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
            raise ExtractorError(u'No known codec found')

        video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
                    %(video_id, sig, timestamp, video_quality, video_codec.upper())

        # Tail of the returned info dict ('id'/'url' lines elided here).
            'uploader': video_uploader,
            'uploader_id': video_uploader_id,
            'upload_date': video_upload_date,
            'title': video_title,
            'ext': video_extension,
            'thumbnail': video_thumbnail,
            'description': video_description,
class ArteTvIE(InfoExtractor):
    """arte.tv information extractor."""
    # NOTE(review): this excerpt elides some scaffolding lines (try:,
    # info = {}, return statements, closing parens of grep_webpage calls);
    # code tokens are reproduced unchanged.

    _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
    _LIVE_URL = r'index-[0-9]+\.html$'

    IE_NAME = u'arte.tv'

    def fetch_webpage(self, url):
        """Download a page, mapping network/URL errors to ExtractorError."""
        request = compat_urllib_request.Request(url)
            self.report_download_webpage(url)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to retrieve video webpage: %s' % compat_str(err))
        except ValueError as err:
            raise ExtractorError(u'Invalid URL: %s' % url)

    def grep_webpage(self, url, regex, regexFlags, matchTuples):
        """Fetch url, search regex; store each (group, key, err) match
        into an info dict, raising err when the group is missing."""
        page = self.fetch_webpage(url)
        mobj = re.search(regex, page, regexFlags)
            raise ExtractorError(u'Invalid URL: %s' % url)

        for (i, key, err) in matchTuples:
            if mobj.group(i) is None:
                raise ExtractorError(err)
                info[key] = mobj.group(i)

    def extractLiveStream(self, url):
        # Language code ('fr'/'de') is encoded in the URL path.
        video_lang = url.split('/')[-4]
        info = self.grep_webpage(
            r'src="(.*?/videothek_js.*?\.js)',
            (1, 'url', u'Invalid URL: %s' % url)
        http_host = url.split('/')[2]
        next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
        info = self.grep_webpage(
            r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
                '(http://.*?\.swf).*?' +
            (1, 'path', u'could not extract video path: %s' % url),
            (2, 'player', u'could not extract video player: %s' % url),
            (3, 'url', u'could not extract video url: %s' % url)
        video_url = u'%s/%s' % (info.get('url'), info.get('path'))

    def extractPlus7Stream(self, url):
        video_lang = url.split('/')[-3]
        # Chain of three lookups: player movie param -> language-specific
        # <video> ref -> final hd stream URL plus metadata.
        info = self.grep_webpage(
            r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
            (1, 'url', u'Invalid URL: %s' % url)
        next_url = compat_urllib_parse.unquote(info.get('url'))
        info = self.grep_webpage(
            r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
            (1, 'url', u'Could not find <video> tag: %s' % url)
        next_url = compat_urllib_parse.unquote(info.get('url'))

        info = self.grep_webpage(
            r'<video id="(.*?)".*?>.*?' +
                '<name>(.*?)</name>.*?' +
                '<dateVideo>(.*?)</dateVideo>.*?' +
                '<url quality="hd">(.*?)</url>',
            (1, 'id', u'could not extract video id: %s' % url),
            (2, 'title', u'could not extract video title: %s' % url),
            (3, 'date', u'could not extract video date: %s' % url),
            (4, 'url', u'could not extract video url: %s' % url)

        # Returned info dict (opening lines elided here).
            'id': info.get('id'),
            'url': compat_urllib_parse.unquote(info.get('url')),
            'uploader': u'arte.tv',
            'upload_date': unified_strdate(info.get('date')),
            'title': info.get('title').decode('utf-8'),

    def _real_extract(self, url):
        video_id = url.split('/')[-1]
        self.report_extraction(video_id)

        # Live streams and plus7 (catch-up) pages need different handling.
        if re.search(self._LIVE_URL, video_id) is not None:
            self.extractLiveStream(url)
            info = self.extractPlus7Stream(url)
class GenericIE(InfoExtractor):
    """Generic last-resort information extractor."""
    # NOTE(review): this excerpt elides some scaffolding lines (guards,
    # try:, return statements, method bodies of the inner classes); code
    # tokens are reproduced unchanged.
    IE_NAME = u'generic'

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        # Warn that we are falling back to guessing, except in test mode.
        if not self._downloader.params.get('test', False):
            self._downloader.report_warning(u'Falling back on generic information extractor.')
        super(GenericIE, self).report_download_webpage(video_id)

    def report_following_redirect(self, new_url):
        """Report information extraction."""
        self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)

    def _test_redirect(self, url):
        """Check if it is a redirect, like url shorteners, in case return the new url."""
        class HeadRequest(compat_urllib_request.Request):
            # Issues HEAD so only headers are fetched.
            def get_method(self):

        class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
            """
            Subclass the HTTPRedirectHandler to make it use our
            HeadRequest also on the redirected URL
            """
            def redirect_request(self, req, fp, code, msg, headers, newurl):
                if code in (301, 302, 303, 307):
                    newurl = newurl.replace(' ', '%20')
                    # Drop body-describing headers; HEAD has no body.
                    newheaders = dict((k,v) for k,v in req.headers.items()
                                      if k.lower() not in ("content-length", "content-type"))
                    return HeadRequest(newurl,
                                       origin_req_host=req.get_origin_req_host(),
                    raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)

        class HTTPMethodFallback(compat_urllib_request.BaseHandler):
            """
            Fallback to GET if HEAD is not allowed (405 HTTP error)
            """
            def http_error_405(self, req, fp, code, msg, headers):
                newheaders = dict((k,v) for k,v in req.headers.items()
                                  if k.lower() not in ("content-length", "content-type"))
                return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
                                                                      origin_req_host=req.get_origin_req_host(),

        # Build a minimal opener wired with the HEAD-aware handlers above.
        opener = compat_urllib_request.OpenerDirector()
        for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
                        HTTPMethodFallback, HEADRedirectHandler,
                        compat_urllib_request.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
            opener.add_handler(handler())

        response = opener.open(HeadRequest(url))
        if response is None:
            raise ExtractorError(u'Invalid URL protocol')
        new_url = response.geturl()
        self.report_following_redirect(new_url)

    def _real_extract(self, url):
        new_url = self._test_redirect(url)
        if new_url: return [self.url_result(new_url)]

        video_id = url.split('/')[-1]
            webpage = self._download_webpage(url, video_id)
        except ValueError as err:
            # since this is the last-resort InfoExtractor, if
            # this error is thrown, it'll be thrown here
            raise ExtractorError(u'Invalid URL: %s' % url)

        self.report_extraction(video_id)
        # Start with something easy: JW Player in SWFObject
        mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
            # Broaden the search a little bit
            mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
            # Broaden the search a little bit: JWPlayer JS loader
            mobj = re.search(r'[^A-Za-z0-9]?file:\s*["\'](http[^\'"&]*)', webpage)
            raise ExtractorError(u'Invalid URL: %s' % url)

        # It's possible that one of the regexes
        # matched, but returned an empty group:
        if mobj.group(1) is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_url = compat_urllib_parse.unquote(mobj.group(1))
        video_id = os.path.basename(video_url)

        # here's a fun little line of code for you:
        video_extension = os.path.splitext(video_id)[1][1:]
        video_id = os.path.splitext(video_id)[0]

        # it's tempting to parse this further, but you would
        # have to take into account all the variations like
        #   Video Title - Site Name
        #   Site Name | Video Title
        #   Video Title - Tagline | Site Name
        # and so on and so forth; it's just not practical
        mobj = re.search(r'<title>(.*)</title>', webpage)
            raise ExtractorError(u'Unable to extract title')
        video_title = mobj.group(1)

        # video uploader is domain name
        mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
            raise ExtractorError(u'Unable to extract title')
        video_uploader = mobj.group(1)

        # Tail of the returned info dict (opening lines elided here).
            'uploader': video_uploader,
            'upload_date': None,
            'title': video_title,
            'ext': video_extension,
class YoutubeSearchIE(SearchInfoExtractor):
    """Information Extractor for YouTube search queries."""
    # NOTE(review): this excerpt elides initializer lines (video_ids,
    # pagenum, limit, try:); code tokens are reproduced unchanged.
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    IE_NAME = u'youtube:search'
    _SEARCH_KEY = 'ytsearch'

    def report_download_page(self, query, pagenum):
        """Report attempt to download search page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        # Page through the gdata API, 50 results per request.
        while (50 * pagenum) < limit:
            self.report_download_page(query, pagenum+1)
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
            request = compat_urllib_request.Request(result_url)
                data = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to download API page: %s' % compat_str(err))
            api_response = json.loads(data)['data']

            if not 'items' in api_response:
                raise ExtractorError(u'[youtube] No video results')

            new_ids = list(video['id'] for video in api_response['items'])
            video_ids += new_ids

            # Never request more than the API reports as available.
            limit = min(n, api_response['totalItems'])

        if len(video_ids) > n:
            video_ids = video_ids[:n]
        videos = [self.url_result('http://www.youtube.com/watch?v=%s' % id, 'Youtube') for id in video_ids]
        return self.playlist_result(videos, query)
class GoogleSearchIE(SearchInfoExtractor):
    """Information Extractor for Google Video search queries."""
    # NOTE(review): this excerpt elides the `res` playlist-dict setup,
    # entry-dict construction and return; code tokens are unchanged.
    _MORE_PAGES_INDICATOR = r'id="pnnext" class="pn"'
    IE_NAME = u'video.google:search'
    _SEARCH_KEY = 'gvsearch'

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        # Accumulated playlist result (dict opener elided here).
            '_type': 'playlist',

        # Scrape result pages (10 results per page) until n is reached or
        # the "next" pager link disappears.
        for pagenum in itertools.count(1):
            result_url = u'http://www.google.com/search?tbm=vid&q=%s&start=%s&hl=en' % (compat_urllib_parse.quote_plus(query), pagenum*10)
            webpage = self._download_webpage(result_url, u'gvsearch:' + query,
                                             note='Downloading result page ' + str(pagenum))

            for mobj in re.finditer(r'<h3 class="r"><a href="([^"]+)"', webpage):
                    'url': mobj.group(1)
                res['entries'].append(e)

            if (pagenum * 10 > n) or not re.search(self._MORE_PAGES_INDICATOR, webpage):
class YahooSearchIE(SearchInfoExtractor):
    """Information Extractor for Yahoo! Video search queries."""
    # NOTE(review): this excerpt elides the `res` playlist-dict setup,
    # loop-control lines and return; code tokens are unchanged.

    IE_NAME = u'screen.yahoo:search'
    _SEARCH_KEY = 'yvsearch'

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        # Accumulated playlist result (dict opener elided here).
            '_type': 'playlist',

        # The endpoint returns JSON with 30 results per page.
        for pagenum in itertools.count(0):
            result_url = u'http://video.search.yahoo.com/search/?p=%s&fr=screen&o=js&gs=0&b=%d' % (compat_urllib_parse.quote_plus(query), pagenum * 30)
            webpage = self._download_webpage(result_url, query,
                                             note='Downloading results page '+str(pagenum+1))
            info = json.loads(webpage)
            results = info[u'results']

            for (i, r) in enumerate(results):
                # Stop once n results have been collected.
                if (pagenum * 30) +i >= n:
                mobj = re.search(r'(?P<url>screen\.yahoo\.com/.*?-\d*?\.html)"', r)
                e = self.url_result('http://' + mobj.group('url'), 'Yahoo')
                res['entries'].append(e)
            # NOTE(review): `m` is bound on a line elided from this excerpt
            # (presumably the paging metadata from `info`) — confirm upstream.
            if (pagenum * 30 +i >= n) or (m[u'last'] >= (m[u'total'] -1 )):
class YoutubePlaylistIE(InfoExtractor):
    """Information Extractor for YouTube playlists."""
    # NOTE(review): this excerpt elides some lines (parts of the verbose
    # _VALID_URL pattern, the `@classmethod` decorator on suitable(), the
    # paging loop header and guards); code tokens are unchanged.

    # Verbose pattern: matches playlist/course/artist URLs as well as bare
    # PL/EC/UU ids (re.VERBOSE is passed at every match site below).
    _VALID_URL = r"""(?:
                     (?:course|view_play_list|my_playlists|artist|playlist|watch)
                     \? (?:.*?&)*? (?:p|a|list)=
                     ((?:PL|EC|UU)?[0-9A-Za-z-_]{10,})
                     ((?:PL|EC|UU)[0-9A-Za-z-_]{10,})
    _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json'
    IE_NAME = u'youtube:playlist'

    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Override is needed because _VALID_URL requires re.VERBOSE.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            raise ExtractorError(u'Invalid URL: %s' % url)

        # Download playlist videos from API
        playlist_id = mobj.group(1) or mobj.group(2)

            url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, self._MAX_RESULTS * (page_num - 1) + 1)
            page = self._download_webpage(url, playlist_id, u'Downloading page #%s' % page_num)

                response = json.loads(page)
            except ValueError as err:
                raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))

            if 'feed' not in response:
                raise ExtractorError(u'Got a malformed response from YouTube API')
            playlist_title = response['feed']['title']['$t']
            if 'entry' not in response['feed']:
                # Number of videos is a multiple of self._MAX_RESULTS

            # Collect (position, video-url) pairs; sorted below to restore
            # playlist order.
            videos += [ (entry['yt$position']['$t'], entry['content']['src'])
                        for entry in response['feed']['entry']
                        if 'content' in entry ]

            if len(response['feed']['entry']) < self._MAX_RESULTS:

        videos = [v[1] for v in sorted(videos)]

        url_results = [self.url_result(url, 'Youtube') for url in videos]
        return [self.playlist_result(url_results, playlist_id, playlist_title)]
class YoutubeChannelIE(InfoExtractor):
    """Information Extractor for YouTube channels."""
    # NOTE(review): this excerpt elides some lines (ids_in_page/video_ids
    # initializers, guards, loop headers, break); code tokens are unchanged.

    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
    _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
    # Marker string whose presence signals that more pages exist.
    _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
    _MORE_PAGES_URL = 'http://www.youtube.com/channel_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
    IE_NAME = u'youtube:channel'

    def extract_videos_from_page(self, page):
        """Collect unique video ids from a channel page, first-seen order."""
        for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
            if mobj.group(1) not in ids_in_page:
                ids_in_page.append(mobj.group(1))

    def _real_extract(self, url):
        # Extract channel id
        mobj = re.match(self._VALID_URL, url)
            raise ExtractorError(u'Invalid URL: %s' % url)

        # Download channel page
        channel_id = mobj.group(1)

        url = self._TEMPLATE_URL % (channel_id, pagenum)
        page = self._download_webpage(url, channel_id,
                                      u'Downloading page #%s' % pagenum)

        # Extract video identifiers
        ids_in_page = self.extract_videos_from_page(page)
        video_ids.extend(ids_in_page)

        # Download any subsequent channel pages using the json-based channel_ajax query
        if self._MORE_PAGES_INDICATOR in page:
                pagenum = pagenum + 1

                url = self._MORE_PAGES_URL % (pagenum, channel_id)
                page = self._download_webpage(url, channel_id,
                                              u'Downloading page #%s' % pagenum)

                page = json.loads(page)

                ids_in_page = self.extract_videos_from_page(page['content_html'])
                video_ids.extend(ids_in_page)

                # Stop when the load-more widget disappears from the response.
                if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']:

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        urls = ['http://www.youtube.com/watch?v=%s' % id for id in video_ids]
        url_entries = [self.url_result(url, 'Youtube') for url in urls]
        return [self.playlist_result(url_entries, channel_id)]
class YoutubeUserIE(InfoExtractor):
    """Information Extractor for YouTube users."""
    # NOTE(review): this excerpt elides some lines (guards, loop header,
    # ids_in_page/video_ids initializers, break); code tokens are unchanged.

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
    _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
    IE_NAME = u'youtube:user'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
            raise ExtractorError(u'Invalid URL: %s' % url)

        username = mobj.group(1)

        # Download video ids using YouTube Data API. Result size per
        # query is limited (currently to 50 videos) so we need to query
        # page by page until there are no video ids - it means we got

            # 1-based start index of the current gdata page.
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1

            gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
            page = self._download_webpage(gdata_url, username,
                                          u'Downloading video ids from %d to %d' % (start_index, start_index + self._GDATA_PAGE_SIZE))

            # Extract video identifiers
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(mobj.group(1))

            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query

            if len(ids_in_page) < self._GDATA_PAGE_SIZE:

        urls = ['http://www.youtube.com/watch?v=%s' % video_id for video_id in video_ids]
        url_results = [self.url_result(url, 'Youtube') for url in urls]
        return [self.playlist_result(url_results, playlist_title = username)]
class BlipTVUserIE(InfoExtractor):
    """Information Extractor for blip.tv users."""
    # NOTE(review): this excerpt elides some lines (guards, _PAGE_SIZE
    # constant, loop header, ids_in_page/video_ids initializers, break);
    # code tokens are unchanged.

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
    IE_NAME = u'blip.tv:user'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
            raise ExtractorError(u'Invalid URL: %s' % url)

        username = mobj.group(1)

        # Ajax endpoint; the user page supplies the numeric users_id below.
        page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'

        page = self._download_webpage(url, username, u'Downloading user page')
        mobj = re.search(r'data-users-id="([^"]+)"', page)
        page_base = page_base % mobj.group(1)

        # Download video ids using BlipTV Ajax calls. Result size per
        # query is limited (currently to 12 videos) so we need to query
        # page by page until there are no video ids - it means we got

            url = page_base + "&page=" + str(pagenum)
            page = self._download_webpage(url, username,
                                          u'Downloading video ids from page %d' % pagenum)

            # Extract video identifiers
            for mobj in re.finditer(r'href="/([^"]+)"', page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(unescapeHTML(mobj.group(1)))

            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query

            if len(ids_in_page) < self._PAGE_SIZE:

        urls = [u'http://blip.tv/%s' % video_id for video_id in video_ids]
        url_entries = [self.url_result(url, 'BlipTV') for url in urls]
        return [self.playlist_result(url_entries, playlist_title = username)]
class DepositFilesIE(InfoExtractor):
    """Information extractor for depositfiles.com"""
    # NOTE(review): this excerpt elides some lines (try:, else:, guards,
    # the return/info dict opener); code tokens are unchanged.

    _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'

    def _real_extract(self, url):
        file_id = url.split('/')[-1]
        # Rebuild url in english locale
        url = 'http://depositfiles.com/en/files/' + file_id

        # Retrieve file webpage with 'Free download' button pressed
        free_download_indication = { 'gateway_result' : '1' }
        request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
            self.report_download_webpage(file_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to retrieve file webpage: %s' % compat_str(err))

        # Search for the real file URL
        mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
        if (mobj is None) or (mobj.group(1) is None):
            # Try to figure out reason of the error.
            mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
            if (mobj is not None) and (mobj.group(1) is not None):
                # Surface the site's own restriction message to the user.
                restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
                raise ExtractorError(u'%s' % restriction_message)
                raise ExtractorError(u'Unable to extract download URL from: %s' % url)

        file_url = mobj.group(1)
        file_extension = os.path.splitext(file_url)[1][1:]

        # Search for file title
        mobj = re.search(r'<b title="(.*?)">', webpage)
            raise ExtractorError(u'Unable to extract title')
        file_title = mobj.group(1).decode('utf-8')

        # Returned info dict (opening lines elided here).
            'id': file_id.decode('utf-8'),
            'url': file_url.decode('utf-8'),
            'upload_date': None,
            'title': file_title,
            'ext': file_extension.decode('utf-8'),
class FacebookIE(InfoExtractor):
    """Information Extractor for Facebook"""
    # NOTE(review): this excerpt elides some lines (early returns, try:,
    # login_form construction, guards, the return/info dict opener); code
    # tokens are unchanged.

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
    _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
    _NETRC_MACHINE = 'facebook'
    IE_NAME = u'facebook'

    def report_login(self):
        """Report attempt to log in."""
        self.to_screen(u'Logging in')

    def _real_initialize(self):
        # Optional login using --username/--password or .netrc credentials.
        if self._downloader is None:

        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            useremail = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))

        if useremail is None:

        request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
            login_results = compat_urllib_request.urlopen(request).read()
            # A login form in the response means the login did not succeed.
            if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
                self._downloader.report_warning(u'unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('ID')

        url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
        webpage = self._download_webpage(url, video_id)

        # The flashvars JSON sits between consecutive swf.addParam /
        # swf.addVariable calls; grab the text between those two markers.
        BEFORE = '{swf.addParam(param[0], param[1]);});\n'
        AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
        m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
            raise ExtractorError(u'Cannot parse data')
        data = dict(json.loads(m.group(1)))
        params_raw = compat_urllib_parse.unquote(data['params'])
        params = json.loads(params_raw)
        video_data = params['video_data'][0]
        # Prefer the HD source, fall back to SD.
        video_url = video_data.get('hd_src')
            video_url = video_data['sd_src']
            raise ExtractorError(u'Cannot find video URL')
        video_duration = int(video_data['video_duration'])
        thumbnail = video_data['thumbnail_src']

        m = re.search('<h2 class="uiHeaderTitle">([^<]+)</h2>', webpage)
            raise ExtractorError(u'Cannot find title in webpage')
        video_title = unescapeHTML(m.group(1))

        # Returned info dict (opening/'id'/'url' lines elided here).
            'title': video_title,
            'duration': video_duration,
            'thumbnail': thumbnail,
class BlipTVIE(InfoExtractor):
    """Information extractor for blip.tv"""
    # NOTE(review): this excerpt elides some lines (guards, try:, the
    # cchar selection, parts of the direct-download info dict); code
    # tokens are unchanged.

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
    _URL_EXT = r'^.*\.([a-z0-9]+)$'
    IE_NAME = u'blip.tv'

    def report_direct_download(self, title):
        """Report information extraction."""
        self.to_screen(u'%s: Direct download detected' % title)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
            raise ExtractorError(u'Invalid URL: %s' % url)

        # /play/ URLs redirect to a fragment carrying the file id; rewrite
        # to the canonical /a/a-<id> form and recurse once.
        urlp = compat_urllib_parse_urlparse(url)
        if urlp.path.startswith('/play/'):
            request = compat_urllib_request.Request(url)
            response = compat_urllib_request.urlopen(request)
            redirecturl = response.geturl()
            rurlp = compat_urllib_parse_urlparse(redirecturl)
            file_id = compat_parse_qs(rurlp.fragment)['file'][0].rpartition('/')[2]
            url = 'http://blip.tv/a/a-' + file_id
            return self._real_extract(url)

        # Ask for the JSON metadata variant of the page.
        json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
        request = compat_urllib_request.Request(json_url)
        request.add_header('User-Agent', 'iTunes/10.6.1')
        self.report_extraction(mobj.group(1))
            urlh = compat_urllib_request.urlopen(request)
            if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
                basename = url.split('/')[-1]
                title,ext = os.path.splitext(basename)
                title = title.decode('UTF-8')
                ext = ext.replace('.', '')
                self.report_direct_download(title)
                # Direct-download info dict (other entries elided here).
                    'upload_date': None,
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
        if info is None: # Regular URL
                json_code_bytes = urlh.read()
                json_code = json_code_bytes.decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to read video info webpage: %s' % compat_str(err))

                json_data = json.loads(json_code)
                if 'Post' in json_data:
                    data = json_data['Post']

                # blip.tv timestamps look like '08-15-12 04:30PM'.
                upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
                video_url = data['media']['url']
                umobj = re.match(self._URL_EXT, video_url)
                    raise ValueError('Can not determine filename extension')
                ext = umobj.group(1)

                # Returned info dict (opener and 'url'/'ext' lines elided).
                    'id': data['item_id'],
                    'uploader': data['display_name'],
                    'upload_date': upload_date,
                    'title': data['title'],
                    'format': data['media']['mimeType'],
                    'thumbnail': data['thumbnailUrl'],
                    'description': data['description'],
                    'player_url': data['embedUrl'],
                    'user_agent': 'iTunes/10.6.1',
            except (ValueError,KeyError) as err:
                raise ExtractorError(u'Unable to parse video information: %s' % repr(err))
# NOTE(review): this listing is elided — original lines 1978, 1981, 1984,
# 1986, 1988-1989, 1992, 1995-1996, 1999, 2001, 2003, 2005-2009 and the
# closing return of the info dict are not visible here. Code left byte-identical.
# Extractor for myvideo.de: derives a direct .flv URL from the page's
# thumbnail <link rel='image_src'> path plus the numeric video id.
1976 class MyVideoIE(InfoExtractor):
1977 """Information Extractor for myvideo.de."""
1979 _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
1980 IE_NAME = u'myvideo'
1982 def _real_extract(self,url):
1983 mobj = re.match(self._VALID_URL, url)
1985 raise ExtractorError(u'Invalid URL: %s' % url)
1987 video_id = mobj.group(1)
1990 webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
1991 webpage = self._download_webpage(webpage_url, video_id)
1993 self.report_extraction(video_id)
# The media base URL is scraped from the thumbnail link; the elided line 1995
# presumably holds the `if mobj is None:` guard for the raise below — TODO confirm.
1994 mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/.*?\.jpg\'',
1997 raise ExtractorError(u'Unable to extract media URL')
1998 video_url = mobj.group(1) + ('/%s.flv' % video_id)
2000 mobj = re.search('<title>([^<]+)</title>', webpage)
2002 raise ExtractorError(u'Unable to extract title')
2004 video_title = mobj.group(1)
2010 'upload_date': None,
2011 'title': video_title,
# NOTE(review): heavily elided listing — dict bodies (_video_extensions,
# _video_dimensions), several `if`/`else`/`try` lines, the turls accumulation
# loop body, and the per-part info-dict literal are missing between the
# numbered lines. Code left byte-identical.
# Extractor for The Daily Show / Colbert Report: resolves short names or
# episode/clip URLs to an mtvnservices URI, downloads the MRSS index, then a
# per-media mediaGen config, and rewrites the RTMP URL to an HTTP mirror.
2015 class ComedyCentralIE(InfoExtractor):
2016 """Information extractor for The Daily Show and Colbert Report """
2018 # urls can be abbreviations like :thedailyshow or :colbert
2019 # urls for episodes like:
2020 # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
2021 # or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
2022 # or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
2023 _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
2024 |(https?://)?(www\.)?
2025 (?P<showname>thedailyshow|colbertnation)\.com/
2026 (full-episodes/(?P<episode>.*)|
2028 (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
2029 |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
# Bitrates the site offers, lowest quality last consumed via turls[-1] below.
2032 _available_formats = ['3500', '2200', '1700', '1200', '750', '400']
2034 _video_extensions = {
2042 _video_dimensions = {
# Overrides the base-class suitable() because _VALID_URL is a verbose regex.
2052 def suitable(cls, url):
2053 """Receives a URL and returns True if suitable for this IE."""
2054 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
2056 def _print_formats(self, formats):
2057 print('Available formats:')
2059 print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))
2062 def _real_extract(self, url):
2063 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2065 raise ExtractorError(u'Invalid URL: %s' % url)
# Short names like ":tds" are expanded to the show's full-episodes index URL
# and the URL is re-matched so the named groups are populated.
2067 if mobj.group('shortname'):
2068 if mobj.group('shortname') in ('tds', 'thedailyshow'):
2069 url = u'http://www.thedailyshow.com/full-episodes/'
2071 url = u'http://www.colbertnation.com/full-episodes/'
2072 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2073 assert mobj is not None
2075 if mobj.group('clip'):
2076 if mobj.group('showname') == 'thedailyshow':
2077 epTitle = mobj.group('tdstitle')
2079 epTitle = mobj.group('cntitle')
2082 dlNewest = not mobj.group('episode')
2084 epTitle = mobj.group('showname')
2086 epTitle = mobj.group('episode')
2088 self.report_extraction(epTitle)
2089 webpage,htmlHandle = self._download_webpage_handle(url, epTitle)
# Follow the redirect the index page issues and re-match to pin the episode.
2091 url = htmlHandle.geturl()
2092 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2094 raise ExtractorError(u'Invalid redirected URL: ' + url)
2095 if mobj.group('episode') == '':
2096 raise ExtractorError(u'Redirected URL is still not specific: ' + url)
2097 epTitle = mobj.group('episode')
2099 mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)
2101 if len(mMovieParams) == 0:
2102 # The Colbert Report embeds the information in a without
2103 # a URL prefix; so extract the alternate reference
2104 # and then add the URL prefix manually.
2106 altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
2107 if len(altMovieParams) == 0:
2108 raise ExtractorError(u'unable to find Flash URL in webpage ' + url)
2110 mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]
2112 uri = mMovieParams[0][1]
2113 indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
2114 indexXml = self._download_webpage(indexUrl, epTitle,
2115 u'Downloading show index',
2116 u'unable to download episode index')
# One <item> per episode part; each part becomes its own info dict.
2120 idoc = xml.etree.ElementTree.fromstring(indexXml)
2121 itemEls = idoc.findall('.//item')
2122 for partNum,itemEl in enumerate(itemEls):
2123 mediaId = itemEl.findall('./guid')[0].text
2124 shortMediaId = mediaId.split(':')[-1]
2125 showId = mediaId.split(':')[-2].replace('.com', '')
2126 officialTitle = itemEl.findall('./title')[0].text
2127 officialDate = unified_strdate(itemEl.findall('./pubDate')[0].text)
2129 configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
2130 compat_urllib_parse.urlencode({'uri': mediaId}))
2131 configXml = self._download_webpage(configUrl, epTitle,
2132 u'Downloading configuration for %s' % shortMediaId)
2134 cdoc = xml.etree.ElementTree.fromstring(configXml)
# Each <rendition> pairs a bitrate with an RTMP src URL; the elided lines
# presumably append finfo into a `turls` list — TODO confirm against full file.
2136 for rendition in cdoc.findall('.//rendition'):
2137 finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
2141 self._downloader.report_error(u'unable to download ' + mediaId + ': No videos found')
2144 if self._downloader.params.get('listformats', None):
2145 self._print_formats([i[0] for i in turls])
2148 # For now, just pick the highest bitrate
2149 format,rtmp_video_url = turls[-1]
2151 # Get the format arg from the arg stream
2152 req_format = self._downloader.params.get('format', None)
2154 # Select format if we can find one
2157 format, rtmp_video_url = f, v
# The RTMP URL is not downloadable directly; transform it onto the known
# HTTP mirror base using the gsp.comedystor path component.
2160 m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
2162 raise ExtractorError(u'Cannot transform RTMP url')
2163 base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
2164 video_url = base + m.group('finalid')
2166 effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
2171 'upload_date': officialDate,
2176 'description': officialTitle,
2178 results.append(info)
# NOTE(review): elided listing — `if ... is None:` guards, the `try:` around
# json.loads, and most of the returned info dict are missing from view.
# Code left byte-identical.
# Extractor for escapistmagazine.com: reads og:video to find the Flash
# player, pulls its `config=` query parameter, and parses that JS config
# (single quotes swapped to double quotes) as JSON to get the media URL.
2183 class EscapistIE(InfoExtractor):
2184 """Information extractor for The Escapist """
2186 _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
2187 IE_NAME = u'escapist'
2189 def _real_extract(self, url):
2190 mobj = re.match(self._VALID_URL, url)
2192 raise ExtractorError(u'Invalid URL: %s' % url)
2193 showName = mobj.group('showname')
2194 videoId = mobj.group('episode')
2196 self.report_extraction(showName)
2197 webPage = self._download_webpage(url, showName)
# Description, thumbnail and player come from <meta> tags on the page.
2199 descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
2200 description = unescapeHTML(descMatch.group(1))
2201 imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
2202 imgUrl = unescapeHTML(imgMatch.group(1))
2203 playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
2204 playerUrl = unescapeHTML(playerUrlMatch.group(1))
2205 configUrlMatch = re.search('config=(.*)$', playerUrl)
2206 configUrl = compat_urllib_parse.unquote(configUrlMatch.group(1))
2208 configJSON = self._download_webpage(configUrl, showName,
2209 u'Downloading configuration',
2210 u'unable to download configuration')
2212 # Technically, it's JavaScript, not JSON
2213 configJSON = configJSON.replace("'", '"')
2216 config = json.loads(configJSON)
2217 except (ValueError,) as err:
2218 raise ExtractorError(u'Invalid JSON in configuration file: ' + compat_str(err))
# playlist[1] (not [0]) holds the actual video entry — presumably [0] is an
# intro/ad slot; TODO confirm.
2220 playlist = config['playlist']
2221 videoUrl = playlist[1]['url']
2226 'uploader': showName,
2227 'upload_date': None,
2230 'thumbnail': imgUrl,
2231 'description': description,
2232 'player_url': playerUrl,
# NOTE(review): elided listing — the info-dict initializer, `try:` lines,
# `except` for the metadata parse, and the final return are missing from view.
# Code left byte-identical.
# Extractor for collegehumor.com: fetches the moogaloop metadata XML, then
# the Adobe F4M manifest, and assembles a segment URL from the manifest ids.
2237 class CollegeHumorIE(InfoExtractor):
2238 """Information extractor for collegehumor.com"""
2241 _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
2242 IE_NAME = u'collegehumor'
2244 def report_manifest(self, video_id):
2245 """Report information extraction."""
2246 self.to_screen(u'%s: Downloading XML manifest' % video_id)
2248 def _real_extract(self, url):
2249 mobj = re.match(self._VALID_URL, url)
2251 raise ExtractorError(u'Invalid URL: %s' % url)
2252 video_id = mobj.group('videoid')
2257 'upload_date': None,
2260 self.report_extraction(video_id)
2261 xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
2263 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
2264 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2265 raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
2267 mdoc = xml.etree.ElementTree.fromstring(metaXml)
2269 videoNode = mdoc.findall('./video')[0]
2270 info['description'] = videoNode.findall('./description')[0].text
2271 info['title'] = videoNode.findall('./caption')[0].text
2272 info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
2273 manifest_url = videoNode.findall('./file')[0].text
2275 raise ExtractorError(u'Invalid metadata XML file')
# hdcore query parameter is required by the Adobe HTTP Dynamic Streaming CDN.
2277 manifest_url += '?hdcore=2.10.3'
2278 self.report_manifest(video_id)
2280 manifestXml = compat_urllib_request.urlopen(manifest_url).read()
2281 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2282 raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
# F4M manifest elements are namespaced; media/@url and id build the final URL.
2284 adoc = xml.etree.ElementTree.fromstring(manifestXml)
2286 media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
2287 node_id = media_node.attrib['url']
2288 video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
2289 except IndexError as err:
2290 raise ExtractorError(u'Invalid manifest file')
2292 url_pr = compat_urllib_parse_urlparse(manifest_url)
2293 url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'
# NOTE(review): elided listing — `if mobj is None:` guards and the info-dict
# 'id'/'url'/'ext' keys plus the return are missing from view. Code left
# byte-identical.
# Extractor for xvideos.com: scrapes the flv_url parameter, page <title>,
# and a thumbnail URL directly from the watch-page HTML.
2300 class XVideosIE(InfoExtractor):
2301 """Information extractor for xvideos.com"""
2303 _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
2304 IE_NAME = u'xvideos'
2306 def _real_extract(self, url):
2307 mobj = re.match(self._VALID_URL, url)
2309 raise ExtractorError(u'Invalid URL: %s' % url)
2310 video_id = mobj.group(1)
2312 webpage = self._download_webpage(url, video_id)
2314 self.report_extraction(video_id)
# The media URL is percent-encoded inside the flv_url= query fragment.
2318 mobj = re.search(r'flv_url=(.+?)&', webpage)
2320 raise ExtractorError(u'Unable to extract video url')
2321 video_url = compat_urllib_parse.unquote(mobj.group(1))
2325 mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
2327 raise ExtractorError(u'Unable to extract video title')
2328 video_title = mobj.group(1)
2331 # Extract video thumbnail
2332 mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
2334 raise ExtractorError(u'Unable to extract video thumbnail')
# group(0): the whole matched URL is the thumbnail, not the capture group.
2335 video_thumbnail = mobj.group(0)
2341 'upload_date': None,
2342 'title': video_title,
2344 'thumbnail': video_thumbnail,
2345 'description': None,
# NOTE(review): elided listing — the `if mobj is None:` guard and parts of
# the returned info dict ('id', 'url', 'ext') are missing from view. Code
# left byte-identical.
# Extractor for a single soundcloud.com track: resolves the permalink via
# the public resolve.json API, then fetches the stream-URL map from sndcdn.
2351 class SoundcloudIE(InfoExtractor):
2352 """Information extractor for soundcloud.com
2353 To access the media, the uid of the song and a stream token
2354 must be extracted from the page source and the script must make
2355 a request to media.soundcloud.com/crossdomain.xml. Then
2356 the media can be grabbed by requesting from an url composed
2357 of the stream token and uid
2360 _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
2361 IE_NAME = u'soundcloud'
2363 def report_resolve(self, video_id):
2364 """Report information extraction."""
2365 self.to_screen(u'%s: Resolving id' % video_id)
2367 def _real_extract(self, url):
2368 mobj = re.match(self._VALID_URL, url)
2370 raise ExtractorError(u'Invalid URL: %s' % url)
2372 # extract uploader (which is in the url)
2373 uploader = mobj.group(1)
2374 # extract simple title (uploader + slug of song title)
2375 slug_title = mobj.group(2)
2376 simple_title = uploader + u'-' + slug_title
2377 full_title = '%s/%s' % (uploader, slug_title)
2379 self.report_resolve(full_title)
# resolve.json maps the human permalink to the numeric track id; the
# client_id below is a hard-coded public API key.
2381 url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
2382 resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2383 info_json = self._download_webpage(resolv_url, full_title, u'Downloading info JSON')
2385 info = json.loads(info_json)
2386 video_id = info['id']
2387 self.report_extraction(full_title)
2389 streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2390 stream_json = self._download_webpage(streams_url, full_title,
2391 u'Downloading stream definitions',
2392 u'unable to download stream definitions')
# Always picks the 128 kbit/s MP3 HTTP stream.
2394 streams = json.loads(stream_json)
2395 mediaURL = streams['http_mp3_128_url']
2396 upload_date = unified_strdate(info['created_at'])
2401 'uploader': info['user']['username'],
2402 'upload_date': upload_date,
2403 'title': info['title'],
2405 'description': info['description'],
# NOTE(review): elided listing — guards, the per-track info-dict keys, the
# results accumulation and return are missing from view. Code left
# byte-identical.
# Extractor for soundcloud.com sets (playlists): resolves the set via
# resolve.json, then fetches stream definitions for every track in it.
2408 class SoundcloudSetIE(InfoExtractor):
2409 """Information extractor for soundcloud.com sets
2410 To access the media, the uid of the song and a stream token
2411 must be extracted from the page source and the script must make
2412 a request to media.soundcloud.com/crossdomain.xml. Then
2413 the media can be grabbed by requesting from an url composed
2414 of the stream token and uid
2417 _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/sets/([\w\d-]+)'
2418 IE_NAME = u'soundcloud:set'
2420 def report_resolve(self, video_id):
2421 """Report information extraction."""
2422 self.to_screen(u'%s: Resolving id' % video_id)
2424 def _real_extract(self, url):
2425 mobj = re.match(self._VALID_URL, url)
2427 raise ExtractorError(u'Invalid URL: %s' % url)
2429 # extract uploader (which is in the url)
2430 uploader = mobj.group(1)
2431 # extract simple title (uploader + slug of song title)
2432 slug_title = mobj.group(2)
2433 simple_title = uploader + u'-' + slug_title
2434 full_title = '%s/sets/%s' % (uploader, slug_title)
2436 self.report_resolve(full_title)
# Same public client_id as SoundcloudIE; resolve.json returns set metadata
# including the 'tracks' array iterated below.
2438 url = 'http://soundcloud.com/%s/sets/%s' % (uploader, slug_title)
2439 resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2440 info_json = self._download_webpage(resolv_url, full_title)
2443 info = json.loads(info_json)
# API-level errors are reported per entry rather than raised.
2444 if 'errors' in info:
2445 for err in info['errors']:
2446 self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err['error_message']))
2449 self.report_extraction(full_title)
2450 for track in info['tracks']:
2451 video_id = track['id']
2453 streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2454 stream_json = self._download_webpage(streams_url, video_id, u'Downloading track info JSON')
2456 self.report_extraction(video_id)
2457 streams = json.loads(stream_json)
2458 mediaURL = streams['http_mp3_128_url']
2463 'uploader': track['user']['username'],
2464 'upload_date': unified_strdate(track['created_at']),
2465 'title': track['title'],
2467 'description': track['description'],
# NOTE(review): elided listing — guards and part of the returned info dict
# are missing from view. Code left byte-identical.
# Extractor for infoq.com: the real media id is base64-encoded in the page's
# `jsclassref` JS variable; decoding it yields an rtmpe path.
2472 class InfoQIE(InfoExtractor):
2473 """Information extractor for infoq.com"""
2474 _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
2476 def _real_extract(self, url):
2477 mobj = re.match(self._VALID_URL, url)
2479 raise ExtractorError(u'Invalid URL: %s' % url)
2481 webpage = self._download_webpage(url, video_id=url)
2482 self.report_extraction(url)
2485 mobj = re.search(r"jsclassref ?= ?'([^']*)'", webpage)
2487 raise ExtractorError(u'Unable to extract video url')
# base64 -> percent-decoded path appended to the RTMP streaming host.
2488 real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
2489 video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id
2492 mobj = re.search(r'contentTitle = "(.*?)";', webpage)
2494 raise ExtractorError(u'Unable to extract video title')
2495 video_title = mobj.group(1)
2497 # Extract description
2498 video_description = u'No description available.'
2499 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
2500 if mobj is not None:
2501 video_description = mobj.group(1)
# Derive an id and extension from the final path component of the RTMP URL.
2503 video_filename = video_url.split('/')[-1]
2504 video_id, extension = video_filename.split('.')
2510 'upload_date': None,
2511 'title': video_title,
2512 'ext': extension, # Extension is always(?) mp4, but seems to be flv
2514 'description': video_description,
# NOTE(review): elided listing — `try:` openers, `return`s inside get_urls /
# check_urls, loop `break`s, and the info-dict return are missing from view.
# Code left byte-identical. Marked _WORKING = False (broken IE).
# Extractor for mixcloud.com: reads the site's v1 cloudcast JSON, chooses a
# format/bitrate from its 'audio_formats' map, and probes candidate URLs
# until one answers.
2519 class MixcloudIE(InfoExtractor):
2520 """Information extractor for www.mixcloud.com"""
2522 _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
2523 _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
2524 IE_NAME = u'mixcloud'
2526 def report_download_json(self, file_id):
2527 """Report JSON download."""
2528 self.to_screen(u'Downloading json')
2530 def get_urls(self, jsonData, fmt, bitrate='best'):
2531 """Get urls from 'audio_formats' section in json"""
# 'best' or an unknown bitrate falls back to the highest available one.
2534 bitrate_list = jsonData[fmt]
2535 if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
2536 bitrate = max(bitrate_list) # select highest
2538 url_list = jsonData[fmt][bitrate]
2539 except TypeError: # we have no bitrate info.
2540 url_list = jsonData[fmt]
2543 def check_urls(self, url_list):
2544 """Returns 1st active url from list"""
# Probes each candidate with a live request; network errors skip to the next.
2545 for url in url_list:
2547 compat_urllib_request.urlopen(url)
2549 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2554 def _print_formats(self, formats):
2555 print('Available formats:')
2556 for fmt in formats.keys():
2557 for b in formats[fmt]:
2559 ext = formats[fmt][b][0]
2560 print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
2561 except TypeError: # we have no bitrate info
2562 ext = formats[fmt][0]
2563 print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
2566 def _real_extract(self, url):
2567 mobj = re.match(self._VALID_URL, url)
2569 raise ExtractorError(u'Invalid URL: %s' % url)
2570 # extract uploader & filename from url
# NOTE(review): .decode() on a match group implies Python-2 byte strings here.
2571 uploader = mobj.group(1).decode('utf-8')
2572 file_id = uploader + "-" + mobj.group(2).decode('utf-8')
2574 # construct API request
2575 file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
2576 # retrieve .json file with links to files
2577 request = compat_urllib_request.Request(file_url)
2579 self.report_download_json(file_url)
2580 jsonData = compat_urllib_request.urlopen(request).read()
2581 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2582 raise ExtractorError(u'Unable to retrieve file: %s' % compat_str(err))
2585 json_data = json.loads(jsonData)
2586 player_url = json_data['player_swf_url']
2587 formats = dict(json_data['audio_formats'])
2589 req_format = self._downloader.params.get('format', None)
2592 if self._downloader.params.get('listformats', None):
2593 self._print_formats(formats)
# No requested format: scan all formats and take the first live URL found.
2596 if req_format is None or req_format == 'best':
2597 for format_param in formats.keys():
2598 url_list = self.get_urls(formats, format_param)
2600 file_url = self.check_urls(url_list)
2601 if file_url is not None:
2604 if req_format not in formats:
2605 raise ExtractorError(u'Format is not available')
2607 url_list = self.get_urls(formats, req_format)
2608 file_url = self.check_urls(url_list)
2609 format_param = req_format
2612 'id': file_id.decode('utf-8'),
2613 'url': file_url.decode('utf-8'),
2614 'uploader': uploader.decode('utf-8'),
2615 'upload_date': None,
2616 'title': json_data['name'],
2617 'ext': file_url.split('.')[-1].decode('utf-8'),
2618 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
2619 'thumbnail': json_data['thumbnail_url'],
2620 'description': json_data['description'],
2621 'player_url': player_url.decode('utf-8'),
# NOTE(review): elided listing — `try:` openers, several info-dict keys,
# `if m:` guards, the per-page list construction, and the returns are missing
# from view. Code left byte-identical.
# Extractor for Stanford Open ClassRoom with three modes: a single video
# (course+video), a course page (list of video references), or the root
# page (list of course references); list entries recurse via self.extract.
2624 class StanfordOpenClassroomIE(InfoExtractor):
2625 """Information extractor for Stanford's Open ClassRoom"""
2627 _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
2628 IE_NAME = u'stanfordoc'
2630 def _real_extract(self, url):
2631 mobj = re.match(self._VALID_URL, url)
2633 raise ExtractorError(u'Invalid URL: %s' % url)
# Mode 1: a specific video — metadata comes from a per-video XML file.
2635 if mobj.group('course') and mobj.group('video'): # A specific video
2636 course = mobj.group('course')
2637 video = mobj.group('video')
2639 'id': course + '_' + video,
2641 'upload_date': None,
2644 self.report_extraction(info['id'])
2645 baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
2646 xmlUrl = baseUrl + video + '.xml'
2648 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
2649 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2650 raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
2651 mdoc = xml.etree.ElementTree.fromstring(metaXml)
2653 info['title'] = mdoc.findall('./title')[0].text
2654 info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
2656 raise ExtractorError(u'Invalid metadata XML file')
2657 info['ext'] = info['url'].rpartition('.')[2]
# Mode 2: a course page — scrape VideoPage links and recurse on each.
2659 elif mobj.group('course'): # A course page
2660 course = mobj.group('course')
2665 'upload_date': None,
2668 coursepage = self._download_webpage(url, info['id'],
2669 note='Downloading course info page',
2670 errnote='Unable to download course info page')
2672 m = re.search('<h1>([^<]+)</h1>', coursepage)
2674 info['title'] = unescapeHTML(m.group(1))
2676 info['title'] = info['id']
2678 m = re.search('<description>([^<]+)</description>', coursepage)
2680 info['description'] = unescapeHTML(m.group(1))
2682 links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
2685 'type': 'reference',
2686 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
2690 for entry in info['list']:
2691 assert entry['type'] == 'reference'
2692 results += self.extract(entry['url'])
# Mode 3: the root page — scrape CoursePage links and recurse on each.
2696 'id': 'Stanford OpenClassroom',
2699 'upload_date': None,
2702 self.report_download_webpage(info['id'])
2703 rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
2705 rootpage = compat_urllib_request.urlopen(rootURL).read()
2706 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2707 raise ExtractorError(u'Unable to download course info page: ' + compat_str(err))
2709 info['title'] = info['id']
2711 links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
2714 'type': 'reference',
2715 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
2720 for entry in info['list']:
2721 assert entry['type'] == 'reference'
2722 results += self.extract(entry['url'])
# NOTE(review): elided listing — `if mobj is None:` guards, the `try:` for
# the metadata download and rendition parsing, and part of the info dict are
# missing from view. Code left byte-identical.
# Extractor for mtv.com: reads mtv_vt / mtv_an / mtvn_uri meta tags plus the
# player's default playlist id, then fetches the mediaGen XML and takes the
# last (highest-quality) <rendition>.
2725 class MTVIE(InfoExtractor):
2726 """Information extractor for MTV.com"""
2728 _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
2731 def _real_extract(self, url):
2732 mobj = re.match(self._VALID_URL, url)
2734 raise ExtractorError(u'Invalid URL: %s' % url)
# Scheme is optional in _VALID_URL; normalize before downloading.
2735 if not mobj.group('proto'):
2736 url = 'http://' + url
2737 video_id = mobj.group('videoid')
2739 webpage = self._download_webpage(url, video_id)
# NOTE(review): .decode('iso-8859-1') on match groups implies Python-2 bytes.
2741 mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
2743 raise ExtractorError(u'Unable to extract song name')
2744 song_name = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
2745 mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
2747 raise ExtractorError(u'Unable to extract performer')
2748 performer = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
2749 video_title = performer + ' - ' + song_name
2751 mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
2753 raise ExtractorError(u'Unable to mtvn_uri')
2754 mtvn_uri = mobj.group(1)
2756 mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
2758 raise ExtractorError(u'Unable to extract content id')
2759 content_id = mobj.group(1)
2761 videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
2762 self.report_extraction(video_id)
2763 request = compat_urllib_request.Request(videogen_url)
2765 metadataXml = compat_urllib_request.urlopen(request).read()
2766 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2767 raise ExtractorError(u'Unable to download video metadata: %s' % compat_str(err))
2769 mdoc = xml.etree.ElementTree.fromstring(metadataXml)
2770 renditions = mdoc.findall('.//rendition')
2772 # For now, always pick the highest quality.
2773 rendition = renditions[-1]
# Format string example: "mp4-640x480_800" from type/width/height/bitrate.
2776 _,_,ext = rendition.attrib['type'].partition('/')
2777 format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
2778 video_url = rendition.find('./src').text
2780 raise ExtractorError('Invalid rendition field.')
2785 'uploader': performer,
2786 'upload_date': None,
2787 'title': video_title,
# NOTE(review): elided listing — the _gen_sid def line, format-selection
# branches, ext/format assignments, parts of the per-segment info dict, and
# the return are missing from view. Code left byte-identical.
# Extractor for v.youku.com: decodes the obfuscated file id with a seeded
# shuffle of a character table, then emits one download URL per segment.
2795 class YoukuIE(InfoExtractor):
2796 _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'
# Session id: millisecond timestamp concatenated with two random numbers.
2799 nowTime = int(time.time() * 1000)
2800 random1 = random.randint(1000,1998)
2801 random2 = random.randint(1000,9999)
2803 return "%d%d%d" %(nowTime,random1,random2)
2805 def _get_file_ID_mix_string(self, seed):
# Deterministic seeded shuffle: repeatedly pick-and-remove from the source
# alphabet using a linear-congruential update of `seed`.
2807 source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
2809 for i in range(len(source)):
2810 seed = (seed * 211 + 30031 ) % 65536
2811 index = math.floor(seed / 65536 * len(source) )
2812 mixed.append(source[int(index)])
2813 source.remove(source[int(index)])
2814 #return ''.join(mixed)
2817 def _get_file_id(self, fileId, seed):
# Each '*'-separated token indexes into the shuffled table to rebuild the id.
2818 mixed = self._get_file_ID_mix_string(seed)
2819 ids = fileId.split('*')
2823 realId.append(mixed[int(ch)])
2824 return ''.join(realId)
2826 def _real_extract(self, url):
2827 mobj = re.match(self._VALID_URL, url)
2829 raise ExtractorError(u'Invalid URL: %s' % url)
2830 video_id = mobj.group('ID')
2832 info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id
2834 jsondata = self._download_webpage(info_url, video_id)
2836 self.report_extraction(video_id)
2838 config = json.loads(jsondata)
2840 video_title = config['data'][0]['title']
2841 seed = config['data'][0]['seed']
2843 format = self._downloader.params.get('format', None)
2844 supported_format = list(config['data'][0]['streamfileids'].keys())
# 'best' prefers hd2 when offered; the remaining branches are elided here.
2846 if format is None or format == 'best':
2847 if 'hd2' in supported_format:
2852 elif format == 'worst':
2860 fileid = config['data'][0]['streamfileids'][format]
2861 keys = [s['k'] for s in config['data'][0]['segs'][format]]
2862 except (UnicodeDecodeError, ValueError, KeyError):
2863 raise ExtractorError(u'Unable to extract info section')
2866 sid = self._gen_sid()
2867 fileid = self._get_file_id(fileid, seed)
2869 #column 8,9 of fileid represent the segment number
2870 #fileid[7:9] should be changed
2871 for index, key in enumerate(keys):
2873 temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
2874 download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)
2877 'id': '%s_part%02d' % (video_id, index),
2878 'url': download_url,
2880 'upload_date': None,
2881 'title': video_title,
2884 files_info.append(info)
# NOTE(review): elided listing — `if result is None:` guards and the info
# dict's 'id'/'url'/'ext' keys plus the return are missing from view. Code
# left byte-identical.
# Extractor for video.xnxx.com: scrapes flv_url, <title>, and the big
# thumbnail parameter with the class-level regexes below.
2889 class XNXXIE(InfoExtractor):
2890 """Information extractor for xnxx.com"""
2892 _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
# Class-level patterns so they are compiled/scanned against the page once each.
2894 VIDEO_URL_RE = r'flv_url=(.*?)&'
2895 VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
2896 VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&'
2898 def _real_extract(self, url):
2899 mobj = re.match(self._VALID_URL, url)
2901 raise ExtractorError(u'Invalid URL: %s' % url)
2902 video_id = mobj.group(1)
2904 # Get webpage content
2905 webpage = self._download_webpage(url, video_id)
2907 result = re.search(self.VIDEO_URL_RE, webpage)
2909 raise ExtractorError(u'Unable to extract video url')
# flv_url is percent-encoded in the page source.
2910 video_url = compat_urllib_parse.unquote(result.group(1))
2912 result = re.search(self.VIDEO_TITLE_RE, webpage)
2914 raise ExtractorError(u'Unable to extract video title')
2915 video_title = result.group(1)
2917 result = re.search(self.VIDEO_THUMB_RE, webpage)
2919 raise ExtractorError(u'Unable to extract video thumbnail')
2920 video_thumbnail = result.group(1)
2926 'upload_date': None,
2927 'title': video_title,
2929 'thumbnail': video_thumbnail,
2930 'description': None,
# NOTE(review): elided listing — several `if mobj:` guards, fallback branches
# for missing date/uploader/title, the `try:` around the unicode-escape
# decode, and part of the returned info dict are missing from view. Code
# left byte-identical.
# Extractor for plus.google.com posts: scrapes the post page for metadata,
# follows the embedded photos/video page, and picks the highest-resolution
# googlevideo redirector link.
2934 class GooglePlusIE(InfoExtractor):
2935 """Information extractor for plus.google.com."""
2937 _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
2938 IE_NAME = u'plus.google'
2940 def report_extract_entry(self, url):
2941 """Report downloading extry"""
2942 self.to_screen(u'Downloading entry: %s' % url)
2944 def report_date(self, upload_date):
2945 """Report downloading extry"""
2946 self.to_screen(u'Entry date: %s' % upload_date)
2948 def report_uploader(self, uploader):
2949 """Report downloading extry"""
2950 self.to_screen(u'Uploader: %s' % uploader)
2952 def report_title(self, video_title):
2953 """Report downloading extry"""
2954 self.to_screen(u'Title: %s' % video_title)
2956 def report_extract_vid_page(self, video_page):
2957 """Report information extraction."""
2958 self.to_screen(u'Extracting video page: %s' % video_page)
2960 def _real_extract(self, url):
2961 # Extract id from URL
2962 mobj = re.match(self._VALID_URL, url)
2964 raise ExtractorError(u'Invalid URL: %s' % url)
2966 post_url = mobj.group(0)
2967 video_id = mobj.group(1)
2969 video_extension = 'flv'
2971 # Step 1, Retrieve post webpage to extract further information
2972 self.report_extract_entry(post_url)
2973 webpage = self._download_webpage(post_url, video_id, u'Downloading entry webpage')
2975 # Extract update date
2977 pattern = 'title="Timestamp">(.*?)</a>'
2978 mobj = re.search(pattern, webpage)
2980 upload_date = mobj.group(1)
2981 # Convert timestring to a format suitable for filename
2982 upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
2983 upload_date = upload_date.strftime('%Y%m%d')
2984 self.report_date(upload_date)
2988 pattern = r'rel\="author".*?>(.*?)</a>'
2989 mobj = re.search(pattern, webpage)
2991 uploader = mobj.group(1)
2992 self.report_uploader(uploader)
2995 # Get the first line for title
2997 pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
2998 mobj = re.search(pattern, webpage)
3000 video_title = mobj.group(1)
3001 self.report_title(video_title)
3003 # Step 2, Stimulate clicking the image box to launch video
3004 pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
3005 mobj = re.search(pattern, webpage)
3007 raise ExtractorError(u'Unable to extract video page URL')
3009 video_page = mobj.group(1)
3010 webpage = self._download_webpage(video_page, video_id, u'Downloading video page')
3011 self.report_extract_vid_page(video_page)
3014 # Extract video links on video page
3015 """Extract video links of all sizes"""
3016 pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
3017 mobj = re.findall(pattern, webpage)
3019 raise ExtractorError(u'Unable to extract video links')
3021 # Sort in resolution
3022 links = sorted(mobj)
3024 # Choose the lowest of the sort, i.e. highest resolution
3025 video_url = links[-1]
3026 # Only get the url. The resolution part in the tuple has no use anymore
3027 video_url = video_url[-1]
3028 # Treat escaped \u0026 style hex
3030 video_url = video_url.decode("unicode_escape")
3031 except AttributeError: # Python 3
3032 video_url = bytes(video_url, 'ascii').decode('unicode-escape')
3038 'uploader': uploader,
3039 'upload_date': upload_date,
3040 'title': video_title,
3041 'ext': video_extension,
3044 class NBAIE(InfoExtractor):
# Extractor for nba.com video pages: derives the video id from the URL path
# and builds a direct Turner-CDN 1280x720 mp4 URL from it.
# NOTE(review): this excerpt is missing lines (original numbering jumps, e.g.
# 3049 -> 3051), so guards such as `if mobj is None:`, `_findProp`'s default
# branch, and the opening of the returned info dict are not visible here.
3045 _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*)(\?.*)?$'
3048 def _real_extract(self, url):
3049 mobj = re.match(self._VALID_URL, url)
3051 raise ExtractorError(u'Invalid URL: %s' % url)
# group(1) is the path component after /video, e.g. "/games/.../index.html".
3053 video_id = mobj.group(1)
3054 if video_id.endswith('/index.html'):
3055 video_id = video_id[:-len('/index.html')]
3057 webpage = self._download_webpage(url, video_id)
3059 video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'
# Small closure over `webpage`: return the first regex group, unescaped,
# falling back to `default` (fallback branch elided from this excerpt).
3060 def _findProp(rexp, default=None):
3061 m = re.search(rexp, webpage)
3063 return unescapeHTML(m.group(1))
3067 shortened_video_id = video_id.rpartition('/')[2]
3068 title = _findProp(r'<meta property="og:title" content="(.*?)"', shortened_video_id).replace('NBA.com: ', '')
3070 'id': shortened_video_id,
# NOTE(review): key 'uploader_date' does not match the optional field
# 'upload_date' documented on InfoExtractor — looks like a typo; confirm
# against what FileDownloader consumes before fixing.
3074 'uploader_date': _findProp(r'<b>Date:</b> (.*?)</div>'),
3075 'description': _findProp(r'<div class="description">(.*?)</h1>'),
3080 class JustinTVIE(InfoExtractor):
3081 """Information extractor for justin.tv and twitch.tv"""
3082 # TODO: One broadcast may be split into multiple videos. The key
3083 # 'broadcast_id' is the same for all parts, and 'broadcast_part'
3084 # starts at 1 and increases. Can we treat all parts as one video?
# NOTE(review): excerpt is missing lines throughout (non-contiguous original
# numbering) — several `if ... is None:` guards, loop headers and dict
# openings are not visible below.
3085 _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
3087 (?P<channelid>[^/]+)|
3088 (?:(?:[^/]+)/b/(?P<videoid>[^/]+))|
3089 (?:(?:[^/]+)/c/(?P<chapterid>[^/]+))
# Page size used when walking a channel's archive listing.
3093 _JUSTIN_PAGE_LIMIT = 100
3094 IE_NAME = u'justin.tv'
3096 def report_download_page(self, channel, offset):
3097 """Report attempt to download a single page of videos."""
3098 self.to_screen(u'%s: Downloading video information from %d to %d' %
3099 (channel, offset, offset + self._JUSTIN_PAGE_LIMIT))
3101 # Return count of items, list of *valid* items
3102 def _parse_page(self, url, video_id):
3103 webpage = self._download_webpage(url, video_id,
3104 u'Downloading video info JSON',
3105 u'unable to download video info JSON')
3107 response = json.loads(webpage)
# The API returns a JSON list on success and an error object otherwise.
3108 if type(response) != list:
3109 error_text = response.get('error', 'unknown error')
3110 raise ExtractorError(u'Justin.tv API: %s' % error_text)
3112 for clip in response:
3113 video_url = clip['video_file_url']
3115 video_extension = os.path.splitext(video_url)[1][1:]
# start_time is ISO-like "YYYY-MM-DD..."; strip dashes to get YYYYMMDD.
3116 video_date = re.sub('-', '', clip['start_time'][:10])
3117 video_uploader_id = clip.get('user_id', clip.get('channel_id'))
3118 video_id = clip['id']
3119 video_title = clip.get('title', video_id)
3123 'title': video_title,
3124 'uploader': clip.get('channel_name', video_uploader_id),
3125 'uploader_id': video_uploader_id,
3126 'upload_date': video_date,
3127 'ext': video_extension,
3129 return (len(response), info)
3131 def _real_extract(self, url):
3132 mobj = re.match(self._VALID_URL, url)
3134 raise ExtractorError(u'invalid URL: %s' % url)
3136 api_base = 'http://api.justin.tv'
# Three URL shapes: whole channel, single broadcast (/b/), chapter (/c/).
3138 if mobj.group('channelid'):
3140 video_id = mobj.group('channelid')
3141 api = api_base + '/channel/archives/%s.json' % video_id
3142 elif mobj.group('chapterid'):
3143 chapter_id = mobj.group('chapterid')
3145 webpage = self._download_webpage(url, chapter_id)
3146 m = re.search(r'PP\.archive_id = "([0-9]+)";', webpage)
3148 raise ExtractorError(u'Cannot find archive of a chapter')
3149 archive_id = m.group(1)
# Chapter flow mixes two APIs: an XML broadcast listing (justin.tv) and a
# JSON metadata endpoint (twitch kraken) further below.
3151 api = api_base + '/broadcast/by_chapter/%s.xml' % chapter_id
3152 chapter_info_xml = self._download_webpage(api, chapter_id,
3153 note=u'Downloading chapter information',
3154 errnote=u'Chapter information download failed')
3155 doc = xml.etree.ElementTree.fromstring(chapter_info_xml)
3156 for a in doc.findall('.//archive'):
3157 if archive_id == a.find('./id').text:
3160 raise ExtractorError(u'Could not find chapter in chapter information')
# `a` is the <archive> element matched in the loop above (loop exit/else
# lines elided from this excerpt).
3162 video_url = a.find('./video_file_url').text
3163 video_ext = video_url.rpartition('.')[2] or u'flv'
3165 chapter_api_url = u'https://api.twitch.tv/kraken/videos/c' + chapter_id
3166 chapter_info_json = self._download_webpage(chapter_api_url, u'c' + chapter_id,
3167 note='Downloading chapter metadata',
3168 errnote='Download of chapter metadata failed')
3169 chapter_info = json.loads(chapter_info_json)
3171 bracket_start = int(doc.find('.//bracket_start').text)
3172 bracket_end = int(doc.find('.//bracket_end').text)
3174 # TODO determine start (and probably fix up file)
3175 # youtube-dl -v http://www.twitch.tv/firmbelief/c/1757457
3176 #video_url += u'?start=' + TODO:start_timestamp
3177 # bracket_start is 13290, but we want 51670615
3178 self._downloader.report_warning(u'Chapter detected, but we can just download the whole file. '
3179 u'Chapter starts at %s and ends at %s' % (formatSeconds(bracket_start), formatSeconds(bracket_end)))
3182 'id': u'c' + chapter_id,
3185 'title': chapter_info['title'],
3186 'thumbnail': chapter_info['preview'],
3187 'description': chapter_info['description'],
3188 'uploader': chapter_info['channel']['display_name'],
3189 'uploader_id': chapter_info['channel']['name'],
3193 video_id = mobj.group('videoid')
3194 api = api_base + '/broadcast/by_archive/%s.json' % video_id
3196 self.report_extraction(video_id)
# Paged fetch: keep requesting until a short page signals the end (loop
# header and offset bookkeeping elided from this excerpt).
3200 limit = self._JUSTIN_PAGE_LIMIT
3203 self.report_download_page(video_id, offset)
3204 page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
3205 page_count, page_info = self._parse_page(page_url, video_id)
3206 info.extend(page_info)
3207 if not paged or page_count != limit:
3212 class FunnyOrDieIE(InfoExtractor):
# Extractor for funnyordie.com video pages: video URL from the <video>
# <source> tag, title from the player h1 with a <title>-tag fallback.
# NOTE(review): excerpt is missing lines (e.g. `if m is None:` guards and
# the start of the returned info dict).
3213 _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'
3215 def _real_extract(self, url):
3216 mobj = re.match(self._VALID_URL, url)
3218 raise ExtractorError(u'invalid URL: %s' % url)
3220 video_id = mobj.group('id')
3221 webpage = self._download_webpage(url, video_id)
3223 m = re.search(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"', webpage, re.DOTALL)
3225 raise ExtractorError(u'Unable to find video information')
3226 video_url = unescapeHTML(m.group('url'))
3228 m = re.search(r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>", webpage, flags=re.DOTALL)
# Fallback: page <title> when the player heading is absent.
3230 m = re.search(r'<title>(?P<title>[^<]+?)</title>', webpage)
3232 raise ExtractorError(u'Cannot find video title')
3233 title = clean_html(m.group('title'))
3235 m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
3237 desc = unescapeHTML(m.group('desc'))
3246 'description': desc,
3250 class SteamIE(InfoExtractor):
# Extractor for Steam store video pages; yields a playlist of all movie
# entries found on the (age-gate-bypassed) page.
# NOTE(review): excerpt is missing lines — the part of _VALID_URL that
# defines the `gameID` group used below is not visible; confirm against
# the full pattern.
3251 _VALID_URL = r"""http://store\.steampowered\.com/
3253 (?P<urltype>video|app)/ #If the page is only for videos or for a game
3255 (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
# suitable() is overridden because _VALID_URL is written with re.VERBOSE,
# which the base class's plain re.match would not apply.
3259 def suitable(cls, url):
3260 """Receives a URL and returns True if suitable for this IE."""
3261 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
3263 def _real_extract(self, url):
3264 m = re.match(self._VALID_URL, url, re.VERBOSE)
3265 gameID = m.group('gameID')
# Agecheck URL with a fixed birth date bypasses the age gate.
3266 videourl = 'http://store.steampowered.com/agecheck/video/%s/?snr=1_agecheck_agecheck__age-gate&ageDay=1&ageMonth=January&ageYear=1970' % gameID
3267 self.report_age_confirmation()
3268 webpage = self._download_webpage(videourl, gameID)
3269 game_title = re.search(r'<h2 class="pageheader">(?P<game_title>.*?)</h2>', webpage).group('game_title')
3271 urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
3272 mweb = re.finditer(urlRE, webpage)
3273 namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
3274 titles = re.finditer(namesRE, webpage)
3275 thumbsRE = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'
3276 thumbs = re.finditer(thumbsRE, webpage)
# zip relies on the three scans yielding entries in the same page order.
3278 for vid,vtitle,thumb in zip(mweb,titles,thumbs):
3279 video_id = vid.group('videoID')
3280 title = vtitle.group('videoName')
3281 video_url = vid.group('videoURL')
3282 video_thumb = thumb.group('thumbnail')
3284 raise ExtractorError(u'Cannot find video url for %s' % video_id)
3289 'title': unescapeHTML(title),
3290 'thumbnail': video_thumb
3293 return [self.playlist_result(videos, gameID, game_title)]
3295 class UstreamIE(InfoExtractor):
# Extractor for recorded ustream.tv videos: direct CDN URL from the video
# id, metadata scraped from the page.
# NOTE(review): excerpt is missing lines — the `try:` opening the block
# that the `except AttributeError` below closes is not visible.
3296 _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
3297 IE_NAME = u'ustream'
3299 def _real_extract(self, url):
3300 m = re.match(self._VALID_URL, url)
3301 video_id = m.group('videoID')
3302 video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
3303 webpage = self._download_webpage(url, video_id)
3304 self.report_extraction(video_id)
3306 m = re.search(r'data-title="(?P<title>.+)"',webpage)
3307 title = m.group('title')
3308 m = re.search(r'data-content-type="channel".*?>(?P<uploader>.*?)</a>',
3310 uploader = unescapeHTML(m.group('uploader').strip())
3311 m = re.search(r'<link rel="image_src" href="(?P<thumb>.*?)"', webpage)
3312 thumb = m.group('thumb')
# Any failed search yields m == None, so .group raises AttributeError —
# collapsed here into a single extraction error.
3313 except AttributeError:
3314 raise ExtractorError(u'Unable to extract info')
3320 'uploader': uploader,
3325 class WorldStarHipHopIE(InfoExtractor):
# Extractor for worldstarhiphop.com / worldstarcandy.com video pages;
# video URL from the flash player's addVariable("file", ...) call.
# NOTE(review): excerpt is missing lines (ext selection after the 'mp4'
# check, the else-branches, and the final info dict opening are elided).
3326 _VALID_URL = r'https?://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
3327 IE_NAME = u'WorldStarHipHop'
3329 def _real_extract(self, url):
3330 _src_url = r'so\.addVariable\("file","(.*?)"\)'
3332 m = re.match(self._VALID_URL, url)
3333 video_id = m.group('id')
3335 webpage_src = self._download_webpage(url, video_id)
3337 mobj = re.search(_src_url, webpage_src)
3339 if mobj is not None:
3340 video_url = mobj.group(1)
3341 if 'mp4' in video_url:
3346 raise ExtractorError(u'Cannot find video url for %s' % video_id)
3348 mobj = re.search(r"<title>(.*)</title>", webpage_src)
3351 raise ExtractorError(u'Cannot determine title')
3352 title = mobj.group(1)
3354 mobj = re.search(r'rel="image_src" href="(.*)" />', webpage_src)
3355 # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video.
3356 if mobj is not None:
3357 thumbnail = mobj.group(1)
# Candy pages carry the real title in a candytitles span instead.
3359 _title = r"""candytitles.*>(.*)</span>"""
3360 mobj = re.search(_title, webpage_src)
3361 if mobj is not None:
3362 title = mobj.group(1)
3369 'thumbnail' : thumbnail,
3374 class RBMARadioIE(InfoExtractor):
# Extractor for rbmaradio.com shows: metadata is a JSON blob assigned to
# gon.show inside an inline <script>; audio URL from its akamai_url.
3375 _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'
3377 def _real_extract(self, url):
3378 m = re.match(self._VALID_URL, url)
3379 video_id = m.group('videoID')
3381 webpage = self._download_webpage(url, video_id)
3382 m = re.search(r'<script>window.gon = {.*?};gon\.show=(.+?);</script>', webpage)
3384 raise ExtractorError(u'Cannot find metadata')
3385 json_data = m.group(1)
# Parse the embedded JSON, surfacing malformed data as an extractor error
# (the `try:` line is elided from this excerpt).
3388 data = json.loads(json_data)
3389 except ValueError as e:
3390 raise ExtractorError(u'Invalid JSON: ' + str(e))
# Force the 256 kbps variant via the cbr query parameter.
3392 video_url = data['akamai_url'] + '&cbr=256'
3393 url_parts = compat_urllib_parse_urlparse(video_url)
3394 video_ext = url_parts.path.rpartition('.')[2]
3399 'title': data['title'],
3400 'description': data.get('teaser_text'),
3401 'location': data.get('country_of_origin'),
3402 'uploader': data.get('host', {}).get('name'),
3403 'uploader_id': data.get('host', {}).get('slug'),
3404 'thumbnail': data.get('image', {}).get('large_url_2x'),
3405 'duration': data.get('duration'),
3410 class YouPornIE(InfoExtractor):
3411 """Information extractor for youporn.com."""
# NOTE(review): excerpt is missing many lines — the loop over `links`, the
# size/bitrate unpacking feeding the title below, the formats sort, and
# several guards are elided; restore from history before editing.
3412 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'
3414 def _print_formats(self, formats):
3415 """Print all available formats"""
3416 print(u'Available formats:')
3417 print(u'ext\t\tformat')
3418 print(u'---------------------------------')
3419 for format in formats:
3420 print(u'%s\t\t%s' % (format['ext'], format['format']))
# Select the single format dict matching req_format (loop/return lines
# elided from this excerpt).
3422 def _specific(self, req_format, formats):
3424 if(x["format"]==req_format):
3428 def _real_extract(self, url):
3429 mobj = re.match(self._VALID_URL, url)
3431 raise ExtractorError(u'Invalid URL: %s' % url)
3433 video_id = mobj.group('videoid')
# The age_verified cookie bypasses the age-confirmation interstitial.
3435 req = compat_urllib_request.Request(url)
3436 req.add_header('Cookie', 'age_verified=1')
3437 webpage = self._download_webpage(req, video_id)
3439 # Get the video title
3440 result = re.search(r'<h1.*?>(?P<title>.*)</h1>', webpage)
3442 raise ExtractorError(u'Unable to extract video title')
3443 video_title = result.group('title').strip()
3445 # Get the video date
3446 result = re.search(r'Date:</label>(?P<date>.*) </li>', webpage)
3448 self._downloader.report_warning(u'unable to extract video date')
3451 upload_date = unified_strdate(result.group('date').strip())
3453 # Get the video uploader
3454 result = re.search(r'Submitted:</label>(?P<uploader>.*)</li>', webpage)
3456 self._downloader.report_warning(u'unable to extract uploader')
3457 video_uploader = None
3459 video_uploader = result.group('uploader').strip()
3460 video_uploader = clean_html( video_uploader )
3462 # Get all of the formats available
3463 DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
3464 result = re.search(DOWNLOAD_LIST_RE, webpage)
3466 raise ExtractorError(u'Unable to extract download list')
3467 download_list_html = result.group('download_list').strip()
3469 # Get all of the links from the page
3470 LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
3471 links = re.findall(LINK_RE, download_list_html)
3472 if(len(links) == 0):
3473 raise ExtractorError(u'ERROR: no known formats available for video')
3475 self.to_screen(u'Links found: %d' % len(links))
3480 # A link looks like this:
3481 # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
3482 # A path looks like this:
3483 # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
3484 video_url = unescapeHTML( link )
3485 path = compat_urllib_parse_urlparse( video_url ).path
3486 extension = os.path.splitext( path )[1][1:]
# Path segment like "480p_370k_..." encodes resolution and bitrate;
# presumably `size` and `bitrate` are unpacked from it on an elided line.
3487 format = path.split('/')[4].split('_')[:2]
3490 format = "-".join( format )
3491 title = u'%s-%s-%s' % (video_title, size, bitrate)
3496 'uploader': video_uploader,
3497 'upload_date': upload_date,
3502 'description': None,
3506 if self._downloader.params.get('listformats', None):
3507 self._print_formats(formats)
3510 req_format = self._downloader.params.get('format', None)
3511 self.to_screen(u'Format: %s' % req_format)
# Format selection: best (first), worst (last), all, or a specific one.
3513 if req_format is None or req_format == 'best':
3515 elif req_format == 'worst':
3516 return [formats[-1]]
3517 elif req_format in ('-1', 'all'):
3520 format = self._specific( req_format, formats )
3522 raise ExtractorError(u'Requested format not available')
3527 class PornotubeIE(InfoExtractor):
3528 """Information extractor for pornotube.com."""
3529 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'
3531 def _real_extract(self, url):
3532 mobj = re.match(self._VALID_URL, url)
3534 raise ExtractorError(u'Invalid URL: %s' % url)
3536 video_id = mobj.group('videoid')
# Title comes straight from the URL path, not from the page.
3537 video_title = mobj.group('title')
3539 # Get webpage content
3540 webpage = self._download_webpage(url, video_id)
3543 VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
3544 result = re.search(VIDEO_URL_RE, webpage)
3546 raise ExtractorError(u'Unable to extract video url')
3547 video_url = compat_urllib_parse.unquote(result.group('url'))
3549 #Get the uploaded date
3550 VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
3551 result = re.search(VIDEO_UPLOADED_RE, webpage)
# NOTE(review): this failure path reports "Unable to extract video title"
# although it is the upload *date* that failed — misleading message; the
# fix would change a runtime string, so only flagging it here.
3553 raise ExtractorError(u'Unable to extract video title')
3554 upload_date = unified_strdate(result.group('date'))
3556 info = {'id': video_id,
3559 'upload_date': upload_date,
3560 'title': video_title,
3566 class YouJizzIE(InfoExtractor):
3567 """Information extractor for youjizz.com."""
# Two-step extraction: the watch page links to an embed page, and the
# embed page's flash setup carries the actual media URL.
3568 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'
3570 def _real_extract(self, url):
3571 mobj = re.match(self._VALID_URL, url)
3573 raise ExtractorError(u'Invalid URL: %s' % url)
3575 video_id = mobj.group('videoid')
3577 # Get webpage content
3578 webpage = self._download_webpage(url, video_id)
3580 # Get the video title
3581 result = re.search(r'<title>(?P<title>.*)</title>', webpage)
3583 raise ExtractorError(u'ERROR: unable to extract video title')
3584 video_title = result.group('title').strip()
3586 # Get the embed page
3587 result = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
3589 raise ExtractorError(u'ERROR: unable to extract embed page')
# Rebind video_id to the numeric embed-page id from here on.
3591 embed_page_url = result.group(0).strip()
3592 video_id = result.group('videoid')
3594 webpage = self._download_webpage(embed_page_url, video_id)
3597 result = re.search(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);', webpage)
3599 raise ExtractorError(u'ERROR: unable to extract video url')
3600 video_url = result.group('source')
3602 info = {'id': video_id,
3604 'title': video_title,
3607 'player_url': embed_page_url}
3611 class EightTracksIE(InfoExtractor):
# Extractor for 8tracks.com mixes: walks the play/next API track by track
# until at_last_track, collecting one entry per song.
# NOTE(review): `mix_id` is used below but its assignment is on an elided
# line (presumably mix_id = data['id']); confirm before editing.
3613 _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'
3615 def _real_extract(self, url):
3616 mobj = re.match(self._VALID_URL, url)
3618 raise ExtractorError(u'Invalid URL: %s' % url)
3619 playlist_id = mobj.group('id')
3621 webpage = self._download_webpage(url, playlist_id)
3623 m = re.search(r"PAGE.mix = (.*?);\n", webpage, flags=re.DOTALL)
3625 raise ExtractorError(u'Cannot find trax information')
3626 json_like = m.group(1)
3627 data = json.loads(json_like)
# Random session id required by the play API.
3629 session = str(random.randint(0, 1000000000))
3631 track_count = data['tracks_count']
3632 first_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
3633 next_url = first_url
3635 for i in itertools.count():
3636 api_json = self._download_webpage(next_url, playlist_id,
3637 note=u'Downloading song information %s/%s' % (str(i+1), track_count),
3638 errnote=u'Failed to download song information')
3639 api_data = json.loads(api_json)
3640 track_data = api_data[u'set']['track']
3642 'id': track_data['id'],
3643 'url': track_data['track_file_stream_url'],
3644 'title': track_data['performer'] + u' - ' + track_data['name'],
3645 'raw_title': track_data['name'],
3646 'uploader_id': data['user']['login'],
3650 if api_data['set']['at_last_track']:
3652 next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
3655 class KeekIE(InfoExtractor):
# Extractor for keek.com: media and thumbnail URLs are derived directly
# from the video id; title/uploader scraped from og: tags and page HTML.
3656 _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
3659 def _real_extract(self, url):
3660 m = re.match(self._VALID_URL, url)
3661 video_id = m.group('videoID')
3662 video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
3663 thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id
3664 webpage = self._download_webpage(url, video_id)
3665 m = re.search(r'<meta property="og:title" content="(?P<title>.*?)"', webpage)
3666 title = unescapeHTML(m.group('title'))
3667 m = re.search(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>', webpage)
3668 uploader = clean_html(m.group('uploader'))
# Opening of the returned info dict is elided from this excerpt.
3674 'thumbnail': thumbnail,
3675 'uploader': uploader
3679 class TEDIE(InfoExtractor):
# Extractor for ted.com: handles single talks and playlists. Download URL
# is built from the talk's mediaSlug via download.ted.com.
3680 _VALID_URL=r'''http://www\.ted\.com/
3682 ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
3684 ((?P<type_talk>talks)) # We have a simple talk
3686 (/lang/(.*?))? # The url may contain the language
3687 /(?P<name>\w+) # Here goes the name and then ".html"
# suitable() is overridden because _VALID_URL requires re.VERBOSE.
3691 def suitable(cls, url):
3692 """Receives a URL and returns True if suitable for this IE."""
3693 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
3695 def _real_extract(self, url):
3696 m=re.match(self._VALID_URL, url, re.VERBOSE)
3697 if m.group('type_talk'):
3698 return [self._talk_info(url)]
# Otherwise it is a playlist URL (else-branch line elided).
3700 playlist_id=m.group('playlist_id')
3701 name=m.group('name')
3702 self.to_screen(u'Getting info of playlist %s: "%s"' % (playlist_id,name))
3703 return [self._playlist_videos_info(url,name,playlist_id)]
3705 def _talk_video_link(self,mediaSlug):
3706 '''Returns the video link for that mediaSlug'''
3707 return 'http://download.ted.com/talks/%s.mp4' % mediaSlug
3709 def _playlist_videos_info(self,url,name,playlist_id=0):
3710 '''Returns the videos of the playlist'''
3712 <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
3713 ([.\s]*?)data-playlist_item_id="(\d+)"
3714 ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
3716 video_name_RE=r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
3717 webpage=self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
3718 m_videos=re.finditer(video_RE,webpage,re.VERBOSE)
3719 m_names=re.finditer(video_name_RE,webpage)
3721 playlist_RE = r'div class="headline">(\s*?)<h1>(\s*?)<span>(?P<playlist_title>.*?)</span>'
3722 m_playlist = re.search(playlist_RE, webpage)
3723 playlist_title = m_playlist.group('playlist_title')
# Each playlist entry is delegated back to this IE via url_result.
3725 playlist_entries = []
3726 for m_video, m_name in zip(m_videos,m_names):
3727 video_id=m_video.group('video_id')
3728 talk_url='http://www.ted.com%s' % m_name.group('talk_url')
3729 playlist_entries.append(self.url_result(talk_url, 'TED'))
3730 return self.playlist_result(playlist_entries, playlist_id = playlist_id, playlist_title = playlist_title)
3732 def _talk_info(self, url, video_id=0):
3733 """Return the video for the talk in the url"""
3734 m=re.match(self._VALID_URL, url,re.VERBOSE)
3735 videoName=m.group('name')
3736 webpage=self._download_webpage(url, video_id, 'Downloading \"%s\" page' % videoName)
3737 # If the url includes the language we get the title translated
3738 title_RE=r'<span id="altHeadline" >(?P<title>.*)</span>'
3739 title=re.search(title_RE, webpage).group('title')
3740 info_RE=r'''<script\ type="text/javascript">var\ talkDetails\ =(.*?)
3741 "id":(?P<videoID>[\d]+).*?
3742 "mediaSlug":"(?P<mediaSlug>[\w\d]+?)"'''
3743 thumb_RE=r'</span>[\s.]*</div>[\s.]*<img src="(?P<thumbnail>.*?)"'
3744 thumb_match=re.search(thumb_RE,webpage)
3745 info_match=re.search(info_RE,webpage,re.VERBOSE)
3746 video_id=info_match.group('videoID')
3747 mediaSlug=info_match.group('mediaSlug')
3748 video_url=self._talk_video_link(mediaSlug)
3754 'thumbnail': thumb_match.group('thumbnail')
3758 class MySpassIE(InfoExtractor):
# Extractor for myspass.de: all metadata comes from an XML endpoint keyed
# by the video id taken from the URL path.
3759 _VALID_URL = r'http://www.myspass.de/.*'
3761 def _real_extract(self, url):
3762 META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'
3764 # video id is the last path element of the URL
3765 # usually there is a trailing slash, so also try the second but last
3766 url_path = compat_urllib_parse_urlparse(url).path
3767 url_parent_path, video_id = os.path.split(url_path)
3769 _, video_id = os.path.split(url_parent_path)
3772 metadata_url = META_DATA_URL_TEMPLATE % video_id
3773 metadata_text = self._download_webpage(metadata_url, video_id)
3774 metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))
3776 # extract values from metadata
3777 url_flv_el = metadata.find('url_flv')
3778 if url_flv_el is None:
3779 raise ExtractorError(u'Unable to extract download url')
3780 video_url = url_flv_el.text
3781 extension = os.path.splitext(video_url)[1][1:]
3782 title_el = metadata.find('title')
3783 if title_el is None:
3784 raise ExtractorError(u'Unable to extract title')
3785 title = title_el.text
3786 format_id_el = metadata.find('format_id')
3787 if format_id_el is None:
3790 format = format_id_el.text
# description and thumbnail are optional; defaults assigned on lines
# elided from this excerpt.
3791 description_el = metadata.find('description')
3792 if description_el is not None:
3793 description = description_el.text
3796 imagePreview_el = metadata.find('imagePreview')
3797 if imagePreview_el is not None:
3798 thumbnail = imagePreview_el.text
3807 'thumbnail': thumbnail,
3808 'description': description
3812 class SpiegelIE(InfoExtractor):
# Extractor for spiegel.de videos: title from the page, stream list from a
# per-video XML manifest on video2.spiegel.de.
3813 _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'
3815 def _real_extract(self, url):
3816 m = re.match(self._VALID_URL, url)
3817 video_id = m.group('videoID')
3819 webpage = self._download_webpage(url, video_id)
3820 m = re.search(r'<div class="spVideoTitle">(.*?)</div>', webpage)
3822 raise ExtractorError(u'Cannot find title')
3823 video_title = unescapeHTML(m.group(1))
3825 xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
3826 xml_code = self._download_webpage(xml_url, video_id,
3827 note=u'Downloading XML', errnote=u'Failed to download XML')
3829 idoc = xml.etree.ElementTree.fromstring(xml_code)
# The last <type> entry in the manifest is taken as the preferred variant.
3830 last_type = idoc[-1]
3831 filename = last_type.findall('./filename')[0].text
3832 duration = float(last_type.findall('./duration')[0].text)
3834 video_url = 'http://video2.spiegel.de/flash/' + filename
3835 video_ext = filename.rpartition('.')[2]
3840 'title': video_title,
3841 'duration': duration,
3845 class LiveLeakIE(InfoExtractor):
# Extractor for liveleak.com view pages; media URL from the player config,
# title/description from og: meta tags, optional uploader from a By: link.
3847 _VALID_URL = r'^(?:http?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
3848 IE_NAME = u'liveleak'
3850 def _real_extract(self, url):
3851 mobj = re.match(self._VALID_URL, url)
3853 raise ExtractorError(u'Invalid URL: %s' % url)
3855 video_id = mobj.group('video_id')
3857 webpage = self._download_webpage(url, video_id)
3859 m = re.search(r'file: "(.*?)",', webpage)
3861 raise ExtractorError(u'Unable to find video url')
3862 video_url = m.group(1)
3864 m = re.search(r'<meta property="og:title" content="(?P<title>.*?)"', webpage)
3866 raise ExtractorError(u'Cannot find video title')
# The site prefixes titles with "LiveLeak.com -"; strip it.
3867 title = unescapeHTML(m.group('title')).replace('LiveLeak.com -', '').strip()
3869 m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
3871 desc = unescapeHTML(m.group('desc'))
3875 m = re.search(r'By:.*?(\w+)</a>', webpage)
3877 uploader = clean_html(m.group(1))
3886 'description': desc,
3887 'uploader': uploader
3892 class ARDIE(InfoExtractor):
# Extractor for ARD Mediathek / daserste.de: collects all
# mediaCollection.addMediaStream(...) entries and picks the highest-quality
# default-type stream; handles both RTMP and plain HTTP delivery.
3893 _VALID_URL = r'^(?:https?://)?(?:(?:www\.)?ardmediathek\.de|mediathek\.daserste\.de)/(?:.*/)(?P<video_id>[^/\?]+)(?:\?.*)?'
3894 _TITLE = r'<h1(?: class="boxTopHeadline")?>(?P<title>.*)</h1>'
3895 _MEDIA_STREAM = r'mediaCollection\.addMediaStream\((?P<media_type>\d+), (?P<quality>\d+), "(?P<rtmp_url>[^"]*)", "(?P<video_url>[^"]*)", "[^"]*"\)'
3897 def _real_extract(self, url):
3898 # determine video id from url
3899 m = re.match(self._VALID_URL, url)
# Prefer the numeric documentId query parameter when present.
3901 numid = re.search(r'documentId=([0-9]+)', url)
3903 video_id = numid.group(1)
3905 video_id = m.group('video_id')
3907 # determine title and media streams from webpage
3908 html = self._download_webpage(url, video_id)
3909 title = re.search(self._TITLE, html).group('title')
3910 streams = [m.groupdict() for m in re.finditer(self._MEDIA_STREAM, html)]
# No streams found: pages behind the FSK age restriction expose none
# (the surrounding `if not streams:` guard is elided from this excerpt).
3912 assert '"fsk"' in html
3913 raise ExtractorError(u'This video is only available after 8:00 pm')
3915 # choose default media type and highest quality for now
3916 stream = max([s for s in streams if int(s["media_type"]) == 0],
3917 key=lambda s: int(s["quality"]))
3919 # there's two possibilities: RTMP stream or HTTP download
3920 info = {'id': video_id, 'title': title, 'ext': 'mp4'}
3921 if stream['rtmp_url']:
3922 self.to_screen(u'RTMP download detected')
3923 assert stream['video_url'].startswith('mp4:')
3924 info["url"] = stream["rtmp_url"]
3925 info["play_path"] = stream['video_url']
3927 assert stream["video_url"].endswith('.mp4')
3928 info["url"] = stream["video_url"]
3931 class TumblrIE(InfoExtractor):
# Extractor for tumblr post/video pages: media URL and extension come from
# an escaped (\x22-quoted) video tag embedded in the page source.
3932 _VALID_URL = r'http://(?P<blog_name>.*?)\.tumblr\.com/((post)|(video))/(?P<id>\d*)/(.*?)'
3934 def _real_extract(self, url):
3935 m_url = re.match(self._VALID_URL, url)
3936 video_id = m_url.group('id')
3937 blog = m_url.group('blog_name')
# Normalize to the canonical /post/ URL before downloading.
3939 url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id)
3940 webpage = self._download_webpage(url, video_id)
3942 re_video = r'src=\\x22(?P<video_url>http://%s\.tumblr\.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P<ext>.*?)\\x22' % (blog, video_id)
3943 video = re.search(re_video, webpage)
# NOTE(review): "founded" should read "found" — runtime string, so only
# flagged here, not changed.
3945 self.to_screen("No video founded")
3947 video_url = video.group('video_url')
3948 ext = video.group('ext')
3950 re_thumb = r'posters(.*?)\[\\x22(?P<thumb>.*?)\\x22' # We pick the first poster
3951 thumb = re.search(re_thumb, webpage).group('thumb').replace('\\', '')
3953 # The only place where you can get a title, it's not complete,
3954 # but searching in other places doesn't work for all videos
3955 re_title = r'<title>(?P<title>.*?)</title>'
3956 title = unescapeHTML(re.search(re_title, webpage, re.DOTALL).group('title'))
3958 return [{'id': video_id,
3965 class BandcampIE(InfoExtractor):
# Extractor for bandcamp.com tracks offered as free downloads: follows the
# free-download page, then rebuilds the statdownload URL to obtain the
# final mp3-320 link from the "retry_url" field.
# NOTE(review): the local name `id` shadows the builtin throughout.
3966 _VALID_URL = r'http://.*?\.bandcamp\.com/track/(?P<title>.*)'
3968 def _real_extract(self, url):
3969 mobj = re.match(self._VALID_URL, url)
3970 title = mobj.group('title')
3971 webpage = self._download_webpage(url, title)
3972 # We get the link to the free download page
3973 m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage)
3974 if m_download is None:
# NOTE(review): "founded" should read "found" — runtime string, so only
# flagged here, not changed.
3975 raise ExtractorError(u'No free songs founded')
3977 download_link = m_download.group(1)
3978 id = re.search(r'var TralbumData = {(.*?)id: (?P<id>\d*?)$',
3979 webpage, re.MULTILINE|re.DOTALL).group('id')
3981 download_webpage = self._download_webpage(download_link, id,
3982 'Downloading free downloads page')
3983 # We get the dictionary of the track from some javascrip code
3984 info = re.search(r'items: (.*?),$',
3985 download_webpage, re.MULTILINE).group(1)
3986 info = json.loads(info)[0]
3987 # We pick mp3-320 for now, until format selection can be easily implemented.
3988 mp3_info = info[u'downloads'][u'mp3-320']
3989 # If we try to use this url it says the link has expired
3990 initial_url = mp3_info[u'url']
3991 re_url = r'(?P<server>http://(.*?)\.bandcamp\.com)/download/track\?enc=mp3-320&fsig=(?P<fsig>.*?)&id=(?P<id>.*?)&ts=(?P<ts>.*)$'
3992 m_url = re.match(re_url, initial_url)
3993 #We build the url we will use to get the final track url
3994 # This url is build in Bandcamp in the script download_bunde_*.js
3995 request_url = '%s/statdownload/track?enc=mp3-320&fsig=%s&id=%s&ts=%s&.rand=665028774616&.vrs=1' % (m_url.group('server'), m_url.group('fsig'), id, m_url.group('ts'))
3996 final_url_webpage = self._download_webpage(request_url, id, 'Requesting download url')
3997 # If we could correctly generate the .rand field the url would be
3998 #in the "download_url" key
3999 final_url = re.search(r'"retry_url":"(.*?)"', final_url_webpage).group(1)
4001 track_info = {'id':id,
4002 'title' : info[u'title'],
4005 'thumbnail' : info[u'thumb_url'],
4006 'uploader' : info[u'artist']
4011 class RedTubeIE(InfoExtractor):
4012 """Information Extractor for redtube"""
4013 _VALID_URL = r'(?:http://)?(?:www\.)?redtube\.com/(?P<id>[0-9]+)'
4015 def _real_extract(self,url):
4016 mobj = re.match(self._VALID_URL, url)
4018 raise ExtractorError(u'Invalid URL: %s' % url)
4020 video_id = mobj.group('id')
4021 video_extension = 'mp4'
4022 webpage = self._download_webpage(url, video_id)
4023 self.report_extraction(video_id)
# Media URL from the HTML5 <source> tag.
4024 mobj = re.search(r'<source src="'+'(.+)'+'" type="video/mp4">',webpage)
4027 raise ExtractorError(u'Unable to extract media URL')
4029 video_url = mobj.group(1)
4030 mobj = re.search('<h1 class="videoTitle slidePanelMovable">(.+)</h1>',webpage)
4032 raise ExtractorError(u'Unable to extract title')
4033 video_title = mobj.group(1)
4038 'ext': video_extension,
4039 'title': video_title,
4042 class InaIE(InfoExtractor):
4043 """Information Extractor for Ina.fr"""
# Metadata is fetched from the player's MRSS feed rather than the HTML page.
4044 _VALID_URL = r'(?:http://)?(?:www.)?ina\.fr/video/(?P<id>I[0-9]+)/.*'
4046 def _real_extract(self,url):
4047 mobj = re.match(self._VALID_URL, url)
4049 video_id = mobj.group('id')
4050 mrss_url='http://player.ina.fr/notices/%s.mrss' % video_id
4051 video_extension = 'mp4'
4052 webpage = self._download_webpage(mrss_url, video_id)
4054 mobj = re.search(r'<media:player url="(?P<mp4url>http://mp4.ina.fr/[^"]+\.mp4)', webpage)
4056 raise ExtractorError(u'Unable to extract media URL')
4057 video_url = mobj.group(1)
4059 mobj = re.search(r'<title><!\[CDATA\[(?P<titre>.*?)]]></title>', webpage)
4061 raise ExtractorError(u'Unable to extract title')
4062 video_title = mobj.group(1)
4067 'ext': video_extension,
4068 'title': video_title,
4071 def gen_extractors():
4072 """ Return a list of an instance of every supported extractor.
4073 The order does matter; the first extractor matched is the one handling the URL.
# NOTE(review): the registry list is almost entirely elided from this
# excerpt — only three of the instantiations are visible below.
4076 YoutubePlaylistIE(),
4101 StanfordOpenClassroomIE(),
4111 WorldStarHipHopIE(),
def get_info_extractor(ie_name):
    """Returns the info extractor class with the given ie_name"""
    # Extractor classes follow the "<Name>IE" naming convention, so the
    # class can be looked up directly in this module's namespace.
    class_name = ie_name + 'IE'
    return globals()[class_name]