2 # -*- coding: utf-8 -*-
4 from __future__ import absolute_import
15 import xml.etree.ElementTree
# NOTE(review): this file is a line-numbered extract; the leading integers are
# embedded original line numbers, and jumps in them mark missing lines.
# Text below is kept byte-identical. This is the base class every concrete
# extractor in this file subclasses.
23 class InfoExtractor(object):
24     """Information Extractor class.
26     Information extractors are the classes that, given a URL, extract
27     information about the video (or videos) the URL refers to. This
28     information includes the real video URL, the video title, author and
29     others. The information is stored in a dictionary which is then
30     passed to the FileDownloader. The FileDownloader processes this
31     information possibly downloading the video to the file system, among
32     other possible outcomes.
34     The dictionaries must include the following fields:
38     title: Video title, unescaped.
39     ext: Video filename extension.
41     The following fields are optional:
43     format: The video format, defaults to ext (used for --get-format)
44     thumbnail: Full URL to a video thumbnail image.
45     description: One-line video description.
46     uploader: Full name of the video uploader.
47     upload_date: Video upload date (YYYYMMDD).
48     uploader_id: Nickname or id of the video uploader.
49     location: Physical location of the video.
50     player_url: SWF Player URL (used for rtmpdump).
51     subtitles: The subtitle file contents.
52     urlhandle: [internal] The urlHandle to be used to download the file,
53                like returned by urllib.request.urlopen
55     The fields should all be Unicode strings.
57     Subclasses of this one should re-define the _real_initialize() and
58     _real_extract() methods and define a _VALID_URL regexp.
59     Probably, they should also be added to the list of extractors.
61     _real_extract() must return a *list* of information dictionaries as
64     Finally, the _WORKING attribute should be set to False for broken IEs
65     in order to warn the users and skip the tests.
# Lifecycle entry points of InfoExtractor. Fragmentary: the @classmethod
# decorators for suitable()/working() and the bodies of working()/initialize()
# are among the missing lines (numbering jumps 75->78, 80->84, 84->88).
72     def __init__(self, downloader=None):
73         """Constructor. Receives an optional downloader."""
75         self.set_downloader(downloader)
# NOTE(review): presumably a @classmethod in the original (it takes `cls`) --
# the decorator line (77) is not in this extract; confirm against upstream.
78     def suitable(cls, url):
79         """Receives a URL and returns True if suitable for this IE."""
80         return re.match(cls._VALID_URL, url) is not None
# Orphaned docstring of working() -- the `def` line is missing from the extract.
84         """Getter method for _WORKING."""
# Fragment of initialize(); presumably guarded by a "not ready yet" check on
# the missing lines -- TODO confirm.
88         """Initializes an instance (authentication, etc)."""
90         self._real_initialize()
93     def extract(self, url):
94         """Extracts URL information and returns it in list of dicts."""
96         return self._real_extract(url)
# Dependency injection point: every report_*/to_screen helper below routes
# its output through the FileDownloader stored here.
98     def set_downloader(self, downloader):
99         """Sets the downloader for this IE."""
100         self._downloader = downloader
# Template-method stubs (bodies -- presumably bare `pass` -- are on missing
# lines) plus the shared HTTP-fetch helpers. Fragmentary extract.
102     def _real_initialize(self):
103         """Real initialization process. Redefine in subclasses."""
106     def _real_extract(self, url):
107         """Real extraction process. Redefine in subclasses."""
# Fragment of the IE_NAME property: derives the name by stripping the
# trailing "IE" from the class name (e.g. YoutubeIE -> "Youtube").
112         return type(self).__name__[:-2]
114     def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
115         """ Returns the response handle """
# The `if note is None:` branch head (line 116) is missing from the extract.
117             self.report_download_webpage(video_id)
118         elif note is not False:
119             self.to_screen(u'%s: %s' % (video_id, note))
# The `try:` opening line (120) is missing; 121-125 are its try/except halves.
121             return compat_urllib_request.urlopen(url_or_request)
122         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
124                 errnote = u'Unable to download webpage'
# Re-raises network failures as ExtractorError, preserving the traceback.
125             raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2])
127     def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None):
128         """ Returns a tuple (page content as string, URL handle) """
129         urlh = self._request_webpage(url_or_request, video_id, note, errnote)
# Charset sniffed from the Content-Type header; the else branch that picks a
# default encoding when the header has no charset is on missing lines 132-135.
130         content_type = urlh.headers.get('Content-Type', '')
131         m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
133             encoding = m.group(1)
136         webpage_bytes = urlh.read()
# Optional debug dump of the raw page, base64-encoded so binary survives.
137         if self._downloader.params.get('dump_intermediate_pages', False):
139                 url = url_or_request.get_full_url()
140             except AttributeError:
142             self.to_screen(u'Dumping request to ' + url)
143             dump = base64.b64encode(webpage_bytes).decode('ascii')
144             self._downloader.to_screen(dump)
# 'replace' keeps extraction alive on pages with broken encodings.
145         content = webpage_bytes.decode(encoding, 'replace')
146         return (content, urlh)
def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
    """Download a page and return its content as a string.

    Thin convenience wrapper around _download_webpage_handle that
    discards the URL handle and keeps only the decoded page body.
    """
    content, _handle = self._download_webpage_handle(url_or_request, video_id, note, errnote)
    return content
# Screen-reporting helpers and the '_type'-tagging result constructors
# referenced by issue #608. The *_result methods are fragmentary: their
# `return` lines and parts of the dict literals fall on missing lines.
152     def to_screen(self, msg):
153         """Print msg to screen, prefixing it with '[ie_name]'"""
154         self._downloader.to_screen(u'[%s] %s' % (self.IE_NAME, msg))
156     def report_extraction(self, id_or_name):
157         """Report information extraction."""
158         self.to_screen(u'%s: Extracting information' % id_or_name)
160     def report_download_webpage(self, video_id):
161         """Report webpage download."""
162         self.to_screen(u'%s: Downloading webpage' % video_id)
164     def report_age_confirmation(self):
165         """Report attempt to confirm age."""
166         self.to_screen(u'Confirming age')
168     #Methods for following #608
169     #They set the correct value of the '_type' key
170     def video_result(self, video_info):
171         """Returns a video"""
172         video_info['_type'] = 'video'
174     def url_result(self, url, ie=None):
175         """Returns a url that points to a page that should be processed"""
176         #TODO: ie should be the class used for getting the info
177         video_info = {'_type': 'url',
181     def playlist_result(self, entries, playlist_id=None, playlist_title=None):
182         """Returns a playlist"""
183         video_info = {'_type': 'playlist',
# id/title are only attached when provided; the guarding `if` lines
# (185, 187) are missing from the extract.
186             video_info['id'] = playlist_id
188             video_info['title'] = playlist_title
# Base class for "<key>N:query"-style search extractors. Fragmentary: the
# @classmethod decorators (198, 202), the `if mobj is None:` guard (208), the
# empty-prefix branch head (213) and the `n = int(prefix)` parsing (217-219)
# are all on missing lines.
191 class SearchInfoExtractor(InfoExtractor):
193     Base class for paged search queries extractors.
194     They accept urls in the format _SEARCH_KEY(|all|[0-9]):{query}
195     Instances should define _SEARCH_KEY and _MAX_RESULTS.
199     def _make_valid_url(cls):
200         return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY
203     def suitable(cls, url):
204         return re.match(cls._make_valid_url(), url) is not None
206     def _real_extract(self, query):
207         mobj = re.match(self._make_valid_url(), query)
209             raise ExtractorError(u'Invalid search query "%s"' % query)
211         prefix = mobj.group('prefix')
212         query = mobj.group('query')
# Empty prefix means "first result only".
214             return self._get_n_results(query, 1)
215         elif prefix == 'all':
216             return self._get_n_results(query, self._MAX_RESULTS)
# Numeric prefix path: n was parsed from prefix on missing lines.
220                 raise ExtractorError(u'invalid download number %s for query "%s"' % (n, query))
221             elif n > self._MAX_RESULTS:
222                 self._downloader.report_warning(u'%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
223                 n = self._MAX_RESULTS
224             return self._get_n_results(query, n)
226 def _get_n_results(self, query, n):
227 """Get a specified number of results for a query"""
228 raise NotImplementedError("This method must be implemented by sublclasses")
# YoutubeIE header, URL pattern and reporting helpers. Fragmentary: the
# `_VALID_URL = r'''...` opening line, parts of the regex, and the bodies of
# the _video_extensions/_video_dimensions dicts are on missing lines.
231 class YoutubeIE(InfoExtractor):
232     """Information extractor for youtube.com."""
236                          (?:https?://)?                                       # http(s):// (optional)
237                          (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
238                             tube\.majestyc\.net/)                             # the various hostnames, with wildcard subdomains
239                          (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
240                          (?:                                                  # the various things that can precede the ID:
241                              (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
242                              |(?:                                             # or the v= param in all its forms
243                                  (?:watch(?:_popup)?(?:\.php)?)?              # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
244                                  (?:\?|\#!?)                                  # the params delimiter ? or # or #!
245                                  (?:.*?&)?                                    # any other preceding param (like /?s=tuff&v=xxxx)
248                              )?                                               # optional -> youtube.com/xxxx is OK
249                          )?                                                   # all until now is optional -> you can pass the naked ID
250                          ([0-9A-Za-z_-]+)                                     # here is it! the YouTube video ID
251                          (?(1).+)?                                            # if we found the ID, everything can follow
253     _LANG_URL = r'https://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
254     _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
255     _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
256     _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
257     _NETRC_MACHINE = 'youtube'
258     # Listed in order of quality
259     _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
260     _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
261     _video_extensions = {
267         '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
273     _video_dimensions = {
# NOTE(review): presumably @classmethod (takes `cls`); decorator line missing.
292     def suitable(cls, url):
293         """Receives a URL and returns True if suitable for this IE."""
# Defers playlist URLs to YoutubePlaylistIE so they are not treated as videos.
294         if YoutubePlaylistIE.suitable(url): return False
295         return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
297     def report_lang(self):
298         """Report attempt to set language."""
299         self.to_screen(u'Setting language')
301     def report_login(self):
302         """Report attempt to log in."""
303         self.to_screen(u'Logging in')
305     def report_video_webpage_download(self, video_id):
306         """Report attempt to download video webpage."""
307         self.to_screen(u'%s: Downloading video webpage' % video_id)
309     def report_video_info_webpage_download(self, video_id):
310         """Report attempt to download video info webpage."""
311         self.to_screen(u'%s: Downloading video info webpage' % video_id)
313     def report_video_subtitles_download(self, video_id):
314         """Report the check for available subtitles."""
315         self.to_screen(u'%s: Checking available subtitles' % video_id)
317     def report_video_subtitles_request(self, video_id, sub_lang, format):
318         """Report the download of subtitles for one language/format."""
319         self.to_screen(u'%s: Downloading video subtitles for %s.%s' % (video_id, sub_lang, format))
321     def report_video_subtitles_available(self, video_id, sub_lang_list):
322         """Report available subtitles."""
323         sub_lang = ",".join(list(sub_lang_list.keys()))
324         self.to_screen(u'%s: Available subtitles for video: %s' % (video_id, sub_lang))
326     def report_information_extraction(self, video_id):
327         """Report attempt to extract video information."""
328         self.to_screen(u'%s: Extracting video information' % video_id)
330     def report_unavailable_format(self, video_id, format):
331         """Report that the requested format is not available."""
332         self.to_screen(u'%s: Format %s not available' % (video_id, format))
334     def report_rtmp_download(self):
335         """Indicate the download will use the RTMP protocol."""
336         self.to_screen(u'RTMP download detected')
# Subtitle helpers. Error convention: on failure these return a *tuple*
# whose first element is an error message (instead of raising). Fragmentary:
# `try:` openers, the urlencode params body, `subtitles = []` and several
# return lines are on missing lines.
338     def _get_available_subtitles(self, video_id):
339         self.report_video_subtitles_download(video_id)
340         request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
342             sub_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
343         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
344             return (u'unable to download video subtitles: %s' % compat_str(err), None)
# Maps lang_code -> track name, scraped from the timedtext XML listing.
345         sub_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
346         sub_lang_list = dict((l[1], l[0]) for l in sub_lang_list)
347         if not sub_lang_list:
348             return (u'video doesn\'t have subtitles', None)
351     def _list_available_subtitles(self, video_id):
# NOTE(review): no check for the error-tuple return of
# _get_available_subtitles before handing it to the reporter, which calls
# .keys() on it -- looks like a latent bug; confirm against upstream.
352         sub_lang_list = self._get_available_subtitles(video_id)
353         self.report_video_subtitles_available(video_id, sub_lang_list)
355     def _request_subtitle(self, sub_lang, sub_name, video_id, format):
# Orphaned docstring fragment describing the returned tuple shape.
358         (error_message, sub_lang, sub)
360         self.report_video_subtitles_request(video_id, sub_lang, format)
# The urlencode'd params dict body (lines 362-366) is missing here.
361         params = compat_urllib_parse.urlencode({
367         url = 'http://www.youtube.com/api/timedtext?' + params
369             sub = compat_urllib_request.urlopen(url).read().decode('utf-8')
370         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
371             return (u'unable to download video subtitles: %s' % compat_str(err), None, None)
373             return (u'Did not fetch video subtitles', None, None)
374         return (None, sub_lang, sub)
376     def _extract_subtitle(self, video_id):
378         Return a list with a tuple:
379         [(error_message, sub_lang, sub)]
381         sub_lang_list = self._get_available_subtitles(video_id)
382         sub_format = self._downloader.params.get('subtitlesformat')
383         if isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles
384             return [(sub_lang_list[0], None, None)]
# Language choice: explicit option > English > first available.
385         if self._downloader.params.get('subtitleslang', False):
386             sub_lang = self._downloader.params.get('subtitleslang')
387         elif 'en' in sub_lang_list:
390             sub_lang = list(sub_lang_list.keys())[0]
391         if not sub_lang in sub_lang_list:
392             return [(u'no closed captions found in the specified language "%s"' % sub_lang, None, None)]
394         subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
397     def _extract_all_subtitles(self, video_id):
398         sub_lang_list = self._get_available_subtitles(video_id)
399         sub_format = self._downloader.params.get('subtitlesformat')
400         if isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles
401             return [(sub_lang_list[0], None, None)]
# `subtitles = []` (line 402) and the final return (406) are missing here.
403         for sub_lang in sub_lang_list:
404             subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
405             subtitles.append(subtitle)
408     def _print_formats(self, formats):
409         print('Available formats:')
# The `for x in formats:` loop head (line 410) is missing from the extract.
411             print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))
# Session setup: language cookie, optional Google account login, and the
# age-gate confirmation POST. Very fragmentary -- most `try:` openers, the
# netrc unpacking, the bulk of login_form_strs, and several guards are on
# missing lines; order-dependent, so code is kept byte-identical.
413     def _real_initialize(self):
414         if self._downloader is None:
419         downloader_params = self._downloader.params
421         # Attempt to use provided username and password or .netrc data
422         if downloader_params.get('username', None) is not None:
423             username = downloader_params['username']
424             password = downloader_params['password']
425         elif downloader_params.get('usenetrc', False):
427                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
432                     raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
433             except (IOError, netrc.NetrcParseError) as err:
434                 self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
# Force-set interface language so later regexes match English page text.
438         request = compat_urllib_request.Request(self._LANG_URL)
441             compat_urllib_request.urlopen(request).read()
442         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
443             self._downloader.report_warning(u'unable to set language: %s' % compat_str(err))
446         # No authentication to be performed
# Login: fetch the form page, scrape the GALX/dsh hidden fields, POST back.
450         request = compat_urllib_request.Request(self._LOGIN_URL)
452             login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
453         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
454             self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err))
459         match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page)
461             galx = match.group(1)
463         match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page)
# Partial login_form_strs dict -- most entries fall on missing lines.
469                 u'continue': u'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
473                 u'PersistentCookie': u'yes',
475                 u'bgresponse': u'js_disabled',
476                 u'checkConnection': u'',
477                 u'checkedDomains': u'youtube',
483                 u'signIn': u'Sign in',
485                 u'service': u'youtube',
489         # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
491         login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
492         login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
493         request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
496             login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
# The login form re-appearing in the response means the login was rejected.
497             if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
498                 self._downloader.report_warning(u'unable to log in: bad username or password')
500         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
501             self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
# Age-gate confirmation POST (age_form opening lines are missing here).
507             'action_confirm': 'Confirm',
509         request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
511             self.report_age_confirmation()
512             age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
513         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
514             raise ExtractorError(u'Unable to confirm age: %s' % compat_str(err))
516     def _extract_id(self, url):
517         mobj = re.match(self._VALID_URL, url, re.VERBOSE)
519             raise ExtractorError(u'Invalid URL: %s' % url)
# Group 2 of _VALID_URL is the 11-char-style video ID capture group.
520         video_id = mobj.group(2)
# Main YouTube extraction pipeline: resolve next_url redirects, fetch the
# watch page, poll get_video_info variants, then extract metadata, subtitles
# and the per-itag URL map. Very fragmentary and order-dependent, so the
# code is kept byte-identical; missing lines are flagged where relevant.
523     def _real_extract(self, url):
524         # Extract original video URL from URL with redirection, like age verification, using next_url parameter
525         mobj = re.search(self._NEXT_URL_RE, url)
527             url = 'https://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
528         video_id = self._extract_id(url)
531         self.report_video_webpage_download(video_id)
532         url = 'https://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
533         request = compat_urllib_request.Request(url)
535             video_webpage_bytes = compat_urllib_request.urlopen(request).read()
536         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
537             raise ExtractorError(u'Unable to download video webpage: %s' % compat_str(err))
539         video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')
541         # Attempt to extract SWF player URL
542         mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
# Un-escapes the JS-escaped URL (\\/ -> /).
544             player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
549         self.report_video_info_webpage_download(video_id)
# Tries several `el=` variants; stops at the first response with a token.
550         for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
551             video_info_url = ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
552                     % (video_id, el_type))
553             video_info_webpage = self._download_webpage(video_info_url, video_id,
555                     errnote='unable to download video info webpage')
556             video_info = compat_parse_qs(video_info_webpage)
557             if 'token' in video_info:
559         if 'token' not in video_info:
560             if 'reason' in video_info:
561                 raise ExtractorError(u'YouTube said: %s' % video_info['reason'][0])
563                 raise ExtractorError(u'"token" parameter not in video info for unknown reason')
565         # Check for "rental" videos
566         if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
567             raise ExtractorError(u'"rental" videos not supported')
569         # Start extracting information
570         self.report_information_extraction(video_id)
573         if 'author' not in video_info:
574             raise ExtractorError(u'Unable to extract uploader name')
575         video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])
578         video_uploader_id = None
579         mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
581             video_uploader_id = mobj.group(1)
583             self._downloader.report_warning(u'unable to extract uploader nickname')
586         if 'title' not in video_info:
587             raise ExtractorError(u'Unable to extract video title')
588         video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])
591         if 'thumbnail_url' not in video_info:
592             self._downloader.report_warning(u'unable to extract video thumbnail')
594         else:   # don't panic if we can't find it
595             video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])
# Upload date, scraped from the watch page and normalized to YYYYMMDD.
599         mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
601             upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
602             upload_date = unified_strdate(upload_date)
605         video_description = get_element_by_id("eow-description", video_webpage)
606         if video_description:
607             video_description = clean_html(video_description)
# Fallback: meta description tag, then empty string.
609             fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
611                 video_description = unescapeHTML(fd_mobj.group(1))
613                 video_description = u''
616         video_subtitles = None
618         if self._downloader.params.get('writesubtitles', False):
619             video_subtitles = self._extract_subtitle(video_id)
621                 (sub_error, sub_lang, sub) = video_subtitles[0]
623                     self._downloader.report_error(sub_error)
625         if self._downloader.params.get('allsubtitles', False):
626             video_subtitles = self._extract_all_subtitles(video_id)
627             for video_subtitle in video_subtitles:
628                 (sub_error, sub_lang, sub) = video_subtitle
630                     self._downloader.report_error(sub_error)
632         if self._downloader.params.get('listsubtitles', False):
633             sub_lang_list = self._list_available_subtitles(video_id)
636         if 'length_seconds' not in video_info:
637             self._downloader.report_warning(u'unable to extract video duration')
640             video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])
643         video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])
645         # Decide which formats to download
646         req_format = self._downloader.params.get('format', None)
648         if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
649             self.report_rtmp_download()
650             video_url_list = [(None, video_info['conn'][0])]
651         elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
# Builds itag -> signed URL map from the comma-separated stream map.
653             for url_data_str in video_info['url_encoded_fmt_stream_map'][0].split(','):
654                 url_data = compat_parse_qs(url_data_str)
655                 if 'itag' in url_data and 'url' in url_data:
656                     url = url_data['url'][0] + '&signature=' + url_data['sig'][0]
657                     if not 'ratebypass' in url: url += '&ratebypass=yes'
658                     url_map[url_data['itag'][0]] = url
660             format_limit = self._downloader.params.get('format_limit', None)
661             available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
662             if format_limit is not None and format_limit in available_formats:
663                 format_list = available_formats[available_formats.index(format_limit):]
665                 format_list = available_formats
666             existing_formats = [x for x in format_list if x in url_map]
667             if len(existing_formats) == 0:
668                 raise ExtractorError(u'no known formats available for video')
669             if self._downloader.params.get('listformats', None):
670                 self._print_formats(existing_formats)
672             if req_format is None or req_format == 'best':
673                 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
674             elif req_format == 'worst':
675                 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
676             elif req_format in ('-1', 'all'):
677                 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
679                 # Specific formats. We pick the first in a slash-delimeted sequence.
680                 # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
681                 req_formats = req_format.split('/')
682                 video_url_list = None
683                 for rf in req_formats:
685                         video_url_list = [(rf, url_map[rf])]
687                 if video_url_list is None:
688                     raise ExtractorError(u'requested format not available')
690             raise ExtractorError(u'no conn or url_encoded_fmt_stream_map information found in video info')
# One result dict per selected format.
693         for format_param, video_real_url in video_url_list:
695             video_extension = self._video_extensions.get(format_param, 'flv')
697             video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
698                                               self._video_dimensions.get(format_param, '???'))
702                 'url':      video_real_url,
703                 'uploader': video_uploader,
704                 'uploader_id': video_uploader_id,
705                 'upload_date':  upload_date,
706                 'title':    video_title,
707                 'ext':      video_extension,
708                 'format':   video_format,
709                 'thumbnail':    video_thumbnail,
710                 'description':  video_description,
711                 'player_url':   player_url,
712                 'subtitles':    video_subtitles,
713                 'duration':     video_duration
# Metacafe extractor: one-time family-filter disclaimer handshake at init,
# then webpage scraping in _real_extract. Fragmentary; code kept byte-identical.
718 class MetacafeIE(InfoExtractor):
719     """Information Extractor for metacafe.com."""
721     _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
722     _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
723     _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
724     IE_NAME = u'metacafe'
726     def report_disclaimer(self):
727         """Report disclaimer retrieval."""
728         self.to_screen(u'Retrieving disclaimer')
730     def _real_initialize(self):
731         # Retrieve disclaimer
732         request = compat_urllib_request.Request(self._DISCLAIMER)
734             self.report_disclaimer()
735             disclaimer = compat_urllib_request.urlopen(request).read()
736         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
737             raise ExtractorError(u'Unable to retrieve disclaimer: %s' % compat_str(err))
# Confirm-age POST; most of disclaimer_form (739-743) is on missing lines.
742             'submit': "Continue - I'm over 18",
744         request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
746             self.report_age_confirmation()
747             disclaimer = compat_urllib_request.urlopen(request).read()
748         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
749             raise ExtractorError(u'Unable to confirm age: %s' % compat_str(err))
751     def _real_extract(self, url):
752         # Extract id and simplified title from URL
753         mobj = re.match(self._VALID_URL, url)
755             raise ExtractorError(u'Invalid URL: %s' % url)
757         video_id = mobj.group(1)
759         # Check if video comes from YouTube
760         mobj2 = re.match(r'^yt-(.*)$', video_id)
761         if mobj2 is not None:
# Delegates "yt-" prefixed IDs straight to the YouTube extractor.
762             return [self.url_result('http://www.youtube.com/watch?v=%s' % mobj2.group(1), 'Youtube')]
764         # Retrieve video webpage to extract further information
765         webpage = self._download_webpage('http://www.metacafe.com/watch/%s/' % video_id, video_id)
767         # Extract URL, uploader and title from webpage
768         self.report_extraction(video_id)
769         mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
771             mediaURL = compat_urllib_parse.unquote(mobj.group(1))
772             video_extension = mediaURL[-3:]
774             # Extract gdaKey if available
775             mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
779                 gdaKey = mobj.group(1)
780                 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
# Fallback path: pull mediaURL/key out of the flashvars blob.
782             mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
784                 raise ExtractorError(u'Unable to extract media URL')
785             vardict = compat_parse_qs(mobj.group(1))
786             if 'mediaData' not in vardict:
787                 raise ExtractorError(u'Unable to extract media URL')
788             mobj = re.search(r'"mediaURL":"(?P<mediaURL>http.*?)",(.*?)"key":"(?P<key>.*?)"', vardict['mediaData'][0])
790                 raise ExtractorError(u'Unable to extract media URL')
791             mediaURL = mobj.group('mediaURL').replace('\\/', '/')
792             video_extension = mediaURL[-3:]
793             video_url = '%s?__gda__=%s' % (mediaURL, mobj.group('key'))
795         mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
797             raise ExtractorError(u'Unable to extract title')
# NOTE(review): .decode('utf-8') on a str here (and below) is Python 2
# idiom; it raises AttributeError under Python 3 -- confirm target runtime.
798         video_title = mobj.group(1).decode('utf-8')
800         mobj = re.search(r'submitter=(.*?);', webpage)
802             raise ExtractorError(u'Unable to extract uploader nickname')
803         video_uploader = mobj.group(1)
806             'id':       video_id.decode('utf-8'),
807             'url':      video_url.decode('utf-8'),
808             'uploader': video_uploader.decode('utf-8'),
810             'title':    video_title,
811             'ext':      video_extension.decode('utf-8'),
# Dailymotion extractor: scrapes flashvars for the best-quality stream URL.
# Fragmentary (guard `if mobj is None:` lines and the result-dict opener are
# missing); code kept byte-identical.
814 class DailymotionIE(InfoExtractor):
815     """Information Extractor for Dailymotion"""
817     _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
818     IE_NAME = u'dailymotion'
820     def _real_extract(self, url):
821         # Extract id and simplified title from URL
822         mobj = re.match(self._VALID_URL, url)
824             raise ExtractorError(u'Invalid URL: %s' % url)
826         video_id = mobj.group(1).split('_')[0].split('?')[0]
828         video_extension = 'mp4'
830         # Retrieve video webpage to extract further information
831         request = compat_urllib_request.Request(url)
# Cookie disables the family filter so age-restricted pages are served.
832         request.add_header('Cookie', 'family_filter=off')
833         webpage = self._download_webpage(request, video_id)
835         # Extract URL, uploader and title from webpage
836         self.report_extraction(video_id)
837         mobj = re.search(r'\s*var flashvars = (.*)', webpage)
839             raise ExtractorError(u'Unable to extract media URL')
840         flashvars = compat_urllib_parse.unquote(mobj.group(1))
# Quality keys probed best-first; the loop that sets max_quality is partly
# on missing lines.
842         for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
845                 self.to_screen(u'Using %s' % key)
848             raise ExtractorError(u'Unable to extract video URL')
850         mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
852             raise ExtractorError(u'Unable to extract video URL')
854         video_url = compat_urllib_parse.unquote(mobj.group(1)).replace('\\/', '/')
856         # TODO: support choosing qualities
858         mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
860             raise ExtractorError(u'Unable to extract title')
861         video_title = unescapeHTML(mobj.group('title'))
863         video_uploader = None
864         mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
866             # lookin for official user
867             mobj_official = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
868             if mobj_official is None:
869                 self._downloader.report_warning(u'unable to extract uploader nickname')
871                 video_uploader = mobj_official.group(1)
873             video_uploader = mobj.group(1)
875         video_upload_date = None
876         mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
# Rearranges DD-MM-YYYY into YYYYMMDD.
878             video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)
883             'uploader': video_uploader,
884             'upload_date':  video_upload_date,
885             'title':    video_title,
886             'ext':      video_extension,
# Photobucket extractor: prefers the embedded JSON media descriptor, falls
# back to scraping the video_src link tag. Fragmentary; code byte-identical.
890 class PhotobucketIE(InfoExtractor):
891     """Information extractor for photobucket.com."""
893     # TODO: the original _VALID_URL was:
894     # r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
895     # Check if it's necessary to keep the old extracion process
896     _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*(([\?\&]current=)|_)(?P<id>.*)\.(?P<ext>(flv)|(mp4))'
897     IE_NAME = u'photobucket'
899     def _real_extract(self, url):
900         # Extract id from URL
901         mobj = re.match(self._VALID_URL, url)
903             raise ExtractorError(u'Invalid URL: %s' % url)
905         video_id = mobj.group('id')
907         video_extension = mobj.group('ext')
909         # Retrieve video webpage to extract further information
910         webpage = self._download_webpage(url, video_id)
912         # Extract URL, uploader, and title from webpage
913         self.report_extraction(video_id)
914         # We try first by looking the javascript code:
915         mobj = re.search(r'Pb\.Data\.Shared\.put\(Pb\.Data\.Shared\.MEDIA, (?P<json>.*?)\);', webpage)
917             info = json.loads(mobj.group('json'))
# JSON path: result dict opener (918-919) is on missing lines.
920                 'url':      info[u'downloadUrl'],
921                 'uploader': info[u'username'],
922                 'upload_date':  datetime.date.fromtimestamp(info[u'creationDate']).strftime('%Y%m%d'),
923                 'title':    info[u'title'],
924                 'ext':      video_extension,
925                 'thumbnail': info[u'thumbUrl'],
928         # We try looking in other parts of the webpage
929         mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
931             raise ExtractorError(u'Unable to extract media URL')
932         mediaURL = compat_urllib_parse.unquote(mobj.group(1))
936         mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
938             raise ExtractorError(u'Unable to extract title')
# NOTE(review): .decode('utf-8') on str is Python 2 idiom; fails on
# Python 3 -- confirm target runtime before porting.
939         video_title = mobj.group(1).decode('utf-8')
941         video_uploader = mobj.group(2).decode('utf-8')
944             'id':       video_id.decode('utf-8'),
945             'url':      video_url.decode('utf-8'),
946             'uploader': video_uploader,
948             'title':    video_title,
949             'ext':      video_extension.decode('utf-8'),
class YahooIE(InfoExtractor):
    """Information extractor for screen.yahoo.com."""
    _VALID_URL = r'http://screen\.yahoo\.com/.*?-(?P<id>\d*?)\.html'

    def _real_extract(self, url):
        """Extract a single video from a screen.yahoo.com page.

        Two code paths exist: if the page exposes a YUI CONTENT_ID we
        query the YQL JSON API (newer method); otherwise we fall back to
        the older cosmos.bcst.yahoo.com mrss REST endpoints.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)
        m_id = re.search(r'YUI\.namespace\("Media"\)\.CONTENT_ID = "(?P<new_id>.+?)";', webpage)

        if m_id is None:
            # Old method: scrape the mrss REST API.
            # TODO: Check which url parameters are required
            info_url = 'http://cosmos.bcst.yahoo.com/rest/v2/pops;lmsoverride=1;outputformat=mrss;cb=974419660;id=%s;rd=news.yahoo.com;datacontext=mdb;lg=KCa2IihxG3qE60vQ7HtyUy' % video_id
            webpage = self._download_webpage(info_url, video_id, u'Downloading info webpage')
            # re.VERBOSE regex: literal whitespace must stay escaped.
            info_re = r'''<title><!\[CDATA\[(?P<title>.*?)\]\]></title>.*
                <description><!\[CDATA\[(?P<description>.*?)\]\]></description>.*
                <media:pubStart><!\[CDATA\[(?P<date>.*?)\ .*\]\]></media:pubStart>.*
                <media:content\ medium="image"\ url="(?P<thumb>.*?)"\ name="LARGETHUMB"'''
            self.report_extraction(video_id)
            m_info = re.search(info_re, webpage, re.VERBOSE | re.DOTALL)
            if m_info is None:
                raise ExtractorError(u'Unable to extract video info')
            video_title = m_info.group('title')
            video_description = m_info.group('description')
            video_thumb = m_info.group('thumb')
            video_date = m_info.group('date')
            # The feed reports dates like 05/13/2013; normalize to YYYYMMDD.
            video_date = datetime.datetime.strptime(video_date, '%m/%d/%Y').strftime('%Y%m%d')

            # TODO: Find a way to get mp4 videos
            rest_url = 'http://cosmos.bcst.yahoo.com/rest/v2/pops;element=stream;outputformat=mrss;id=%s;lmsoverride=1;bw=375;dynamicstream=1;cb=83521105;tech=flv,mp4;rd=news.yahoo.com;datacontext=mdb;lg=KCa2IihxG3qE60vQ7HtyUy' % video_id
            webpage = self._download_webpage(rest_url, video_id, u'Downloading video url webpage')
            m_rest = re.search(r'<media:content url="(?P<url>.*?)" path="(?P<path>.*?)"', webpage)
            # BUGFIX: test for a failed match *before* dereferencing it.
            # Previously the groups were read first, so a failed match
            # raised AttributeError instead of the intended ExtractorError.
            if m_rest is None:
                raise ExtractorError(u'Unable to extract video url')
            video_url = m_rest.group('url')
            video_path = m_rest.group('path')

        else:  # We have to use a different method if another id is defined
            long_id = m_id.group('new_id')
            info_url = 'http://video.query.yahoo.com/v1/public/yql?q=SELECT%20*%20FROM%20yahoo.media.video.streams%20WHERE%20id%3D%22' + long_id + '%22%20AND%20format%3D%22mp4%2Cflv%22%20AND%20protocol%3D%22rtmp%2Chttp%22%20AND%20plrs%3D%2286Gj0vCaSzV_Iuf6hNylf2%22%20AND%20acctid%3D%22389%22%20AND%20plidl%3D%22%22%20AND%20pspid%3D%22792700001%22%20AND%20offnetwork%3D%22false%22%20AND%20site%3D%22ivy%22%20AND%20lang%3D%22en-US%22%20AND%20region%3D%22US%22%20AND%20override%3D%22none%22%3B&env=prod&format=json&callback=YUI.Env.JSONP.yui_3_8_1_1_1368368376830_335'
            webpage = self._download_webpage(info_url, video_id, u'Downloading info json')
            # The response is JSONP; strip the callback wrapper.
            json_str = re.search(r'YUI.Env.JSONP.yui.*?\((.*?)\);', webpage).group(1)
            info = json.loads(json_str)
            res = info[u'query'][u'results'][u'mediaObj'][0]
            stream = res[u'streams'][0]
            video_path = stream[u'path']
            video_url = stream[u'host']
            meta = res[u'meta']
            video_title = meta[u'title']
            video_description = meta[u'description']
            video_thumb = meta[u'thumbnail']
            video_date = None  # I can't find it

        info_dict = {
            'id': video_id,
            'url': video_url,
            'play_path': video_path,
            'title': video_title,
            'description': video_description,
            'thumbnail': video_thumb,
            'upload_date': video_date,
            'ext': 'flv',
        }
        return info_dict
class VimeoIE(InfoExtractor):
    """Information extractor for vimeo.com."""

    # _VALID_URL matches Vimeo URLs
    _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|player)\.)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)'
    IE_NAME = u'vimeo'

    def _real_extract(self, url, new_video=True):
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')
        if not mobj.group('proto'):
            url = 'https://' + url
        if mobj.group('direct_link'):
            # Canonicalize play_redirect_hls links to the plain video page.
            url = 'https://vimeo.com/' + video_id

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url, None, std_headers)
        webpage = self._download_webpage(request, video_id)

        # Now we begin extracting as much information as we can from what we
        # retrieved. First we extract the information common to all extractors,
        # and latter we extract those that are Vimeo specific.
        self.report_extraction(video_id)

        # Extract the config JSON embedded in the page
        try:
            config = webpage.split(' = {config:')[1].split(',assets:')[0]
            config = json.loads(config)
        except:
            if re.search('The creator of this video has not given you permission to embed it on this domain.', webpage):
                raise ExtractorError(u'The author has restricted the access to this video, try with the "--referer" option')
            else:
                raise ExtractorError(u'Unable to extract info section')

        # Extract title
        video_title = config["video"]["title"]

        # Extract uploader and uploader_id
        video_uploader = config["video"]["owner"]["name"]
        video_uploader_id = config["video"]["owner"]["url"].split('/')[-1]

        # Extract video thumbnail
        video_thumbnail = config["video"]["thumbnail"]

        # Extract video description
        video_description = get_element_by_attribute("itemprop", "description", webpage)
        if video_description:
            video_description = clean_html(video_description)
        else:
            video_description = u''

        # Extract upload date
        video_upload_date = None
        mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
        if mobj is not None:
            video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)

        # Vimeo specific: extract request signature and timestamp
        sig = config['request']['signature']
        timestamp = config['request']['timestamp']

        # Vimeo specific: extract video codec and quality information
        # First consider quality, then codecs, then take everything
        # TODO bind to format param
        codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
        files = {'hd': [], 'sd': [], 'other': []}
        for codec_name, codec_extension in codecs:
            if codec_name in config["video"]["files"]:
                if 'hd' in config["video"]["files"][codec_name]:
                    files['hd'].append((codec_name, codec_extension, 'hd'))
                elif 'sd' in config["video"]["files"][codec_name]:
                    files['sd'].append((codec_name, codec_extension, 'sd'))
                else:
                    files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))

        for quality in ('hd', 'sd', 'other'):
            if len(files[quality]) > 0:
                video_quality = files[quality][0][2]
                video_codec = files[quality][0][0]
                video_extension = files[quality][0][1]
                self.to_screen(u'%s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
                break
        else:
            raise ExtractorError(u'No known codec found')

        video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
                    % (video_id, sig, timestamp, video_quality, video_codec.upper())

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': video_uploader,
            'uploader_id': video_uploader_id,
            'upload_date': video_upload_date,
            'title': video_title,
            'ext': video_extension,
            'thumbnail': video_thumbnail,
            'description': video_description,
        }]
class ArteTvIE(InfoExtractor):
    """arte.tv information extractor."""

    _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
    _LIVE_URL = r'index-[0-9]+\.html$'

    IE_NAME = u'arte.tv'

    def fetch_webpage(self, url):
        """Download *url* and return the raw page bytes."""
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(url)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to retrieve video webpage: %s' % compat_str(err))
        except ValueError as err:
            raise ExtractorError(u'Invalid URL: %s' % url)
        return webpage

    def grep_webpage(self, url, regex, regexFlags, matchTuples):
        """Fetch *url*, search it with *regex*, and map the requested
        groups into a dict.

        *matchTuples* is a list of (group_index, key, error_message);
        a missing group raises ExtractorError(error_message).
        """
        page = self.fetch_webpage(url)
        mobj = re.search(regex, page, regexFlags)
        info = {}

        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        for (i, key, err) in matchTuples:
            if mobj.group(i) is None:
                raise ExtractorError(err)
            else:
                info[key] = mobj.group(i)

        return info

    def extractLiveStream(self, url):
        # NOTE(review): this path only resolves the rtmp url; it builds
        # video_url but never returns it — kept as in the original.
        video_lang = url.split('/')[-4]
        info = self.grep_webpage(
            url,
            r'src="(.*?/videothek_js.*?\.js)',
            0,
            [
                (1, 'url', u'Invalid URL: %s' % url)
            ]
        )
        http_host = url.split('/')[2]
        next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
        info = self.grep_webpage(
            next_url,
            r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
                '(http://.*?\.swf).*?' +
                '(rtmp://.*?)\'',
            re.DOTALL,
            [
                (1, 'path', u'could not extract video path: %s' % url),
                (2, 'player', u'could not extract video player: %s' % url),
                (3, 'url', u'could not extract video url: %s' % url)
            ]
        )
        video_url = u'%s/%s' % (info.get('url'), info.get('path'))

    def extractPlus7Stream(self, url):
        """Follow the chain of XML references for an arte+7 page and
        return the info dict of the final video."""
        video_lang = url.split('/')[-3]
        info = self.grep_webpage(
            url,
            r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
            0,
            [
                (1, 'url', u'Invalid URL: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))
        info = self.grep_webpage(
            next_url,
            r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
            0,
            [
                (1, 'url', u'Could not find <video> tag: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))

        info = self.grep_webpage(
            next_url,
            r'<video id="(.*?)".*?>.*?' +
                '<name>(.*?)</name>.*?' +
                '<dateVideo>(.*?)</dateVideo>.*?' +
                '<url quality="hd">(.*?)</url>',
            re.DOTALL,
            [
                (1, 'id', u'could not extract video id: %s' % url),
                (2, 'title', u'could not extract video title: %s' % url),
                (3, 'date', u'could not extract video date: %s' % url),
                (4, 'url', u'could not extract video url: %s' % url)
            ]
        )

        return {
            'id': info.get('id'),
            'url': compat_urllib_parse.unquote(info.get('url')),
            'uploader': u'arte.tv',
            'upload_date': unified_strdate(info.get('date')),
            'title': info.get('title').decode('utf-8'),
            'ext': u'mp4',
        }

    def _real_extract(self, url):
        video_id = url.split('/')[-1]
        self.report_extraction(video_id)

        if re.search(self._LIVE_URL, video_id) is not None:
            self.extractLiveStream(url)
            return
        else:
            info = self.extractPlus7Stream(url)

        return [info]
class GenericIE(InfoExtractor):
    """Generic last-resort information extractor."""

    _VALID_URL = r'.*'
    IE_NAME = u'generic'

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        if not self._downloader.params.get('test', False):
            self._downloader.report_warning(u'Falling back on generic information extractor.')
        super(GenericIE, self).report_download_webpage(video_id)

    def report_following_redirect(self, new_url):
        """Report information extraction."""
        self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)

    def _test_redirect(self, url):
        """Check if it is a redirect, like url shorteners, in case return the new url."""
        class HeadRequest(compat_urllib_request.Request):
            def get_method(self):
                return "HEAD"

        class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
            """
            Subclass the HTTPRedirectHandler to make it use our
            HeadRequest also on the redirected URL
            """
            def redirect_request(self, req, fp, code, msg, headers, newurl):
                if code in (301, 302, 303, 307):
                    newurl = newurl.replace(' ', '%20')
                    newheaders = dict((k, v) for k, v in req.headers.items()
                                      if k.lower() not in ("content-length", "content-type"))
                    return HeadRequest(newurl,
                                       headers=newheaders,
                                       origin_req_host=req.get_origin_req_host(),
                                       unverifiable=True)
                else:
                    raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)

        class HTTPMethodFallback(compat_urllib_request.BaseHandler):
            """
            Fallback to GET if HEAD is not allowed (405 HTTP error)
            """
            def http_error_405(self, req, fp, code, msg, headers):
                fp.read()
                fp.close()

                newheaders = dict((k, v) for k, v in req.headers.items()
                                  if k.lower() not in ("content-length", "content-type"))
                return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
                                        headers=newheaders,
                                        origin_req_host=req.get_origin_req_host(),
                                        unverifiable=True))

        # Build our opener from scratch so only the handlers we list run.
        opener = compat_urllib_request.OpenerDirector()
        for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
                        HTTPMethodFallback, HEADRedirectHandler,
                        compat_urllib_request.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
            opener.add_handler(handler())

        response = opener.open(HeadRequest(url))
        if response is None:
            raise ExtractorError(u'Invalid URL protocol')
        new_url = response.geturl()

        # No redirect happened: nothing to follow.
        if url == new_url:
            return False

        self.report_following_redirect(new_url)
        return new_url

    def _real_extract(self, url):
        new_url = self._test_redirect(url)
        if new_url: return [self.url_result(new_url)]

        video_id = url.split('/')[-1]
        try:
            webpage = self._download_webpage(url, video_id)
        except ValueError as err:
            # since this is the last-resort InfoExtractor, if
            # this error is thrown, it'll be thrown here
            raise ExtractorError(u'Invalid URL: %s' % url)

        self.report_extraction(video_id)
        # Start with something easy: JW Player in SWFObject
        mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
        if mobj is None:
            # Broaden the search a little bit
            mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
        if mobj is None:
            # Broaden the search a little bit: JWPlayer JS loader
            mobj = re.search(r'[^A-Za-z0-9]?file:\s*["\'](http[^\'"&]*)', webpage)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # It's possible that one of the regexes
        # matched, but returned an empty group:
        if mobj.group(1) is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_url = compat_urllib_parse.unquote(mobj.group(1))
        video_id = os.path.basename(video_url)

        # here's a fun little line of code for you:
        video_extension = os.path.splitext(video_id)[1][1:]
        video_id = os.path.splitext(video_id)[0]

        # it's tempting to parse this further, but you would
        # have to take into account all the variations like
        #   Video Title - Site Name
        #   Site Name | Video Title
        #   Video Title - Tagline | Site Name
        # and so on and so forth; it's just not practical
        mobj = re.search(r'<title>(.*)</title>', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract title')
        video_title = mobj.group(1)

        # video uploader is domain name
        mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
        if mobj is None:
            raise ExtractorError(u'Unable to extract title')
        video_uploader = mobj.group(1)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': video_uploader,
            'upload_date': None,
            'title': video_title,
            'ext': video_extension,
        }]
class YoutubeSearchIE(SearchInfoExtractor):
    """Information Extractor for YouTube search queries."""
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _MAX_RESULTS = 1000
    IE_NAME = u'youtube:search'
    _SEARCH_KEY = 'ytsearch'

    def report_download_page(self, query, pagenum):
        """Report attempt to download search page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""

        video_ids = []
        pagenum = 0
        limit = n  # refined below once the API reports totalItems

        while (50 * pagenum) < limit:
            self.report_download_page(query, pagenum + 1)
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50 * pagenum) + 1)
            request = compat_urllib_request.Request(result_url)
            try:
                data = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to download API page: %s' % compat_str(err))
            api_response = json.loads(data)['data']

            if not 'items' in api_response:
                raise ExtractorError(u'[youtube] No video results')

            new_ids = list(video['id'] for video in api_response['items'])
            video_ids += new_ids

            # Don't request more pages than the API can deliver.
            limit = min(n, api_response['totalItems'])
            pagenum += 1

        if len(video_ids) > n:
            video_ids = video_ids[:n]
        videos = [self.url_result('http://www.youtube.com/watch?v=%s' % id, 'Youtube') for id in video_ids]
        return self.playlist_result(videos, query)
class GoogleSearchIE(SearchInfoExtractor):
    """Information Extractor for Google Video search queries."""
    _MORE_PAGES_INDICATOR = r'id="pnnext" class="pn"'
    _MAX_RESULTS = 1000
    IE_NAME = u'video.google:search'
    _SEARCH_KEY = 'gvsearch'

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""

        res = {
            '_type': 'playlist',
            'id': query,
            'entries': [],
        }

        for pagenum in itertools.count(1):
            result_url = u'http://www.google.com/search?tbm=vid&q=%s&start=%s&hl=en' % (compat_urllib_parse.quote_plus(query), pagenum * 10)
            webpage = self._download_webpage(result_url, u'gvsearch:' + query,
                                             note='Downloading result page ' + str(pagenum))

            # Each result is an <h3 class="r"> link on the results page.
            for mobj in re.finditer(r'<h3 class="r"><a href="([^"]+)"', webpage):
                e = {
                    '_type': 'url',
                    'url': mobj.group(1)
                }
                res['entries'].append(e)

            # Stop once we have enough entries or there is no "next" link.
            if (pagenum * 10 > n) or not re.search(self._MORE_PAGES_INDICATOR, webpage):
                return res
class YahooSearchIE(SearchInfoExtractor):
    """Information Extractor for Yahoo! Video search queries."""

    _MAX_RESULTS = 1000
    IE_NAME = u'screen.yahoo:search'
    _SEARCH_KEY = 'yvsearch'

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""

        res = {
            '_type': 'playlist',
            'id': query,
            'entries': [],
        }
        for pagenum in itertools.count(0):
            result_url = u'http://video.search.yahoo.com/search/?p=%s&fr=screen&o=js&gs=0&b=%d' % (compat_urllib_parse.quote_plus(query), pagenum * 30)
            webpage = self._download_webpage(result_url, query,
                                             note='Downloading results page ' + str(pagenum + 1))
            info = json.loads(webpage)
            # 'm' holds pagination metadata (first/last/total indexes).
            m = info[u'm']
            results = info[u'results']

            for (i, r) in enumerate(results):
                if (pagenum * 30) + i >= n:
                    break
                mobj = re.search(r'(?P<url>screen\.yahoo\.com/.*?-\d*?\.html)"', r)
                e = self.url_result('http://' + mobj.group('url'), 'Yahoo')
                res['entries'].append(e)
            # Stop when we've reached n results or consumed the last page.
            if (pagenum * 30 + i >= n) or (m[u'last'] >= (m[u'total'] - 1)):
                break

        return res
class YoutubePlaylistIE(InfoExtractor):
    """Information Extractor for YouTube playlists."""

    _VALID_URL = r"""(?:
                        (?:https?://)?
                        (?:\w+\.)?
                        youtube\.com/
                        (?:
                           (?:course|view_play_list|my_playlists|artist|playlist|watch)
                           \? (?:.*?&)*? (?:p|a|list)=
                        |  p/
                        )
                        ((?:PL|EC|UU)?[0-9A-Za-z-_]{10,})
                        .*
                     |
                        ((?:PL|EC|UU)[0-9A-Za-z-_]{10,})
                     )"""
    _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json'
    _MAX_RESULTS = 50
    IE_NAME = u'youtube:playlist'

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is written with re.VERBOSE, so the default
        # InfoExtractor.suitable (no flags) cannot be used.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # Download playlist videos from API
        playlist_id = mobj.group(1) or mobj.group(2)
        page_num = 1
        videos = []

        while True:
            url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, self._MAX_RESULTS * (page_num - 1) + 1)
            page = self._download_webpage(url, playlist_id, u'Downloading page #%s' % page_num)

            try:
                response = json.loads(page)
            except ValueError as err:
                raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))

            if 'feed' not in response:
                raise ExtractorError(u'Got a malformed response from YouTube API')
            playlist_title = response['feed']['title']['$t']
            if 'entry' not in response['feed']:
                # Number of videos is a multiple of self._MAX_RESULTS
                break

            # Keep (position, url) pairs so we can sort by position below.
            videos += [(entry['yt$position']['$t'], entry['content']['src'])
                       for entry in response['feed']['entry']
                       if 'content' in entry]

            if len(response['feed']['entry']) < self._MAX_RESULTS:
                break
            page_num += 1

        videos = [v[1] for v in sorted(videos)]

        url_results = [self.url_result(url, 'Youtube') for url in videos]
        return [self.playlist_result(url_results, playlist_id, playlist_title)]
class YoutubeChannelIE(InfoExtractor):
    """Information Extractor for YouTube channels."""

    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
    _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
    _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
    _MORE_PAGES_URL = 'http://www.youtube.com/channel_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
    IE_NAME = u'youtube:channel'

    def extract_videos_from_page(self, page):
        """Return the list of distinct video ids linked from *page*."""
        ids_in_page = []
        for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
            if mobj.group(1) not in ids_in_page:
                ids_in_page.append(mobj.group(1))
        return ids_in_page

    def _real_extract(self, url):
        # Extract channel id
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # Download channel page
        channel_id = mobj.group(1)
        video_ids = []
        pagenum = 1

        url = self._TEMPLATE_URL % (channel_id, pagenum)
        page = self._download_webpage(url, channel_id,
                                      u'Downloading page #%s' % pagenum)

        # Extract video identifiers
        ids_in_page = self.extract_videos_from_page(page)
        video_ids.extend(ids_in_page)

        # Download any subsequent channel pages using the json-based channel_ajax query
        if self._MORE_PAGES_INDICATOR in page:
            while True:
                pagenum = pagenum + 1

                url = self._MORE_PAGES_URL % (pagenum, channel_id)
                page = self._download_webpage(url, channel_id,
                                              u'Downloading page #%s' % pagenum)

                page = json.loads(page)

                ids_in_page = self.extract_videos_from_page(page['content_html'])
                video_ids.extend(ids_in_page)

                # The ajax response tells us whether a further page exists.
                if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']:
                    break

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        urls = ['http://www.youtube.com/watch?v=%s' % id for id in video_ids]
        url_entries = [self.url_result(url, 'Youtube') for url in urls]
        return [self.playlist_result(url_entries, channel_id)]
class YoutubeUserIE(InfoExtractor):
    """Information Extractor for YouTube users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
    _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
    IE_NAME = u'youtube:user'

    def _real_extract(self, url):
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        username = mobj.group(1)

        # Download video ids using YouTube Data API. Result size per
        # query is limited (currently to 50 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        video_ids = []
        pagenum = 0

        while True:
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1

            gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
            page = self._download_webpage(gdata_url, username,
                                          u'Downloading video ids from %d to %d' % (start_index, start_index + self._GDATA_PAGE_SIZE))

            # Extract video identifiers
            ids_in_page = []

            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(mobj.group(1))

            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.

            if len(ids_in_page) < self._GDATA_PAGE_SIZE:
                break

            pagenum += 1

        urls = ['http://www.youtube.com/watch?v=%s' % video_id for video_id in video_ids]
        url_results = [self.url_result(url, 'Youtube') for url in urls]
        return [self.playlist_result(url_results, playlist_title = username)]
class BlipTVUserIE(InfoExtractor):
    """Information Extractor for blip.tv users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
    _PAGE_SIZE = 12
    IE_NAME = u'blip.tv:user'

    def _real_extract(self, url):
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        username = mobj.group(1)

        page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'

        page = self._download_webpage(url, username, u'Downloading user page')
        # The numeric user id the ajax endpoint expects is embedded in
        # the mobile page.
        mobj = re.search(r'data-users-id="([^"]+)"', page)
        page_base = page_base % mobj.group(1)

        # Download video ids using BlipTV Ajax calls. Result size per
        # query is limited (currently to 12 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        video_ids = []
        pagenum = 1

        while True:
            url = page_base + "&page=" + str(pagenum)
            page = self._download_webpage(url, username,
                                          u'Downloading video ids from page %d' % pagenum)

            # Extract video identifiers
            ids_in_page = []

            for mobj in re.finditer(r'href="/([^"]+)"', page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(unescapeHTML(mobj.group(1)))

            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.

            if len(ids_in_page) < self._PAGE_SIZE:
                break

            pagenum += 1

        urls = [u'http://blip.tv/%s' % video_id for video_id in video_ids]
        url_entries = [self.url_result(url, 'BlipTV') for url in urls]
        return [self.playlist_result(url_entries, playlist_title = username)]
class DepositFilesIE(InfoExtractor):
    """Information extractor for depositfiles.com"""

    _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'

    def _real_extract(self, url):
        file_id = url.split('/')[-1]
        # Rebuild url in english locale
        url = 'http://depositfiles.com/en/files/' + file_id

        # Retrieve file webpage with 'Free download' button pressed
        free_download_indication = {'gateway_result': '1'}
        request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
        try:
            self.report_download_webpage(file_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to retrieve file webpage: %s' % compat_str(err))

        # Search for the real file URL
        mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
        if (mobj is None) or (mobj.group(1) is None):
            # Try to figure out reason of the error.
            mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
            if (mobj is not None) and (mobj.group(1) is not None):
                restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
                raise ExtractorError(u'%s' % restriction_message)
            else:
                raise ExtractorError(u'Unable to extract download URL from: %s' % url)

        file_url = mobj.group(1)
        file_extension = os.path.splitext(file_url)[1][1:]

        # Search for file title
        mobj = re.search(r'<b title="(.*?)">', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract title')
        file_title = mobj.group(1).decode('utf-8')

        return [{
            'id': file_id.decode('utf-8'),
            'url': file_url.decode('utf-8'),
            'uploader': None,
            'upload_date': None,
            'title': file_title,
            'ext': file_extension.decode('utf-8'),
        }]
class FacebookIE(InfoExtractor):
    """Information Extractor for Facebook"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
    _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
    _NETRC_MACHINE = 'facebook'
    IE_NAME = u'facebook'

    def report_login(self):
        """Report attempt to log in."""
        self.to_screen(u'Logging in')

    def _real_initialize(self):
        """Log in with --username/--password or .netrc credentials, if any.

        Login failures only emit warnings; extraction proceeds anonymously.
        """
        if self._downloader is None:
            return

        useremail = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            useremail = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    useremail = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
                return

        if useremail is None:
            return

        # Log in
        login_form = {
            'email': useremail,
            'pass': password,
            'login': 'Log+In'
        }
        request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read()
            # A login form in the response means authentication failed.
            if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
                self._downloader.report_warning(u'unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('ID')

        url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
        webpage = self._download_webpage(url, video_id)

        # The player parameters sit between these two script fragments.
        BEFORE = '{swf.addParam(param[0], param[1]);});\n'
        AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
        m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
        if not m:
            raise ExtractorError(u'Cannot parse data')
        data = dict(json.loads(m.group(1)))
        params_raw = compat_urllib_parse.unquote(data['params'])
        params = json.loads(params_raw)
        video_data = params['video_data'][0]
        # Prefer the HD stream, fall back to SD.
        video_url = video_data.get('hd_src')
        if not video_url:
            video_url = video_data['sd_src']
        if not video_url:
            raise ExtractorError(u'Cannot find video URL')
        video_duration = int(video_data['video_duration'])
        thumbnail = video_data['thumbnail_src']

        m = re.search('<h2 class="uiHeaderTitle">([^<]+)</h2>', webpage)
        if not m:
            raise ExtractorError(u'Cannot find title in webpage')
        video_title = unescapeHTML(m.group(1))

        info = {
            'id': video_id,
            'title': video_title,
            'url': video_url,
            'ext': 'mp4',
            'duration': video_duration,
            'thumbnail': thumbnail,
        }
        return [info]
class BlipTVIE(InfoExtractor):
    """Information extractor for blip.tv"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
    _URL_EXT = r'^.*\.([a-z0-9]+)$'
    IE_NAME = u'blip.tv'

    def report_direct_download(self, title):
        """Report information extraction."""
        self.to_screen(u'%s: Direct download detected' % title)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # /play/ URLs redirect to a page whose fragment carries the real
        # file reference; resolve it and re-run extraction on that URL.
        urlp = compat_urllib_parse_urlparse(url)
        if urlp.path.startswith('/play/'):
            request = compat_urllib_request.Request(url)
            response = compat_urllib_request.urlopen(request)
            redirecturl = response.geturl()
            rurlp = compat_urllib_parse_urlparse(redirecturl)
            file_id = compat_parse_qs(rurlp.fragment)['file'][0].rpartition('/')[2]
            url = 'http://blip.tv/a/a-' + file_id
            return self._real_extract(url)

        if '?' in url:
            cchar = '&'
        else:
            cchar = '?'
        json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
        request = compat_urllib_request.Request(json_url)
        request.add_header('User-Agent', 'iTunes/10.6.1')
        self.report_extraction(mobj.group(1))
        info = None
        try:
            urlh = compat_urllib_request.urlopen(request)
            if urlh.headers.get('Content-Type', '').startswith('video/'):  # Direct download
                basename = url.split('/')[-1]
                title, ext = os.path.splitext(basename)
                title = title.decode('UTF-8')
                ext = ext.replace('.', '')
                self.report_direct_download(title)
                info = {
                    'id': title,
                    'url': url,
                    'uploader': None,
                    'upload_date': None,
                    'title': title,
                    'ext': ext,
                    'urlhandle': urlh
                }
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
        if info is None:  # Regular URL
            try:
                json_code_bytes = urlh.read()
                json_code = json_code_bytes.decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to read video info webpage: %s' % compat_str(err))

            try:
                json_data = json.loads(json_code)
                if 'Post' in json_data:
                    data = json_data['Post']
                else:
                    data = json_data

                # blip.tv timestamps look like '05-13-13 10:30AM'.
                upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
                video_url = data['media']['url']
                umobj = re.match(self._URL_EXT, video_url)
                if umobj is None:
                    raise ValueError('Can not determine filename extension')
                ext = umobj.group(1)

                info = {
                    'id': data['item_id'],
                    'url': video_url,
                    'uploader': data['display_name'],
                    'upload_date': upload_date,
                    'title': data['title'],
                    'ext': ext,
                    'format': data['media']['mimeType'],
                    'thumbnail': data['thumbnailUrl'],
                    'description': data['description'],
                    'player_url': data['embedUrl'],
                    'user_agent': 'iTunes/10.6.1',
                }
            except (ValueError, KeyError) as err:
                raise ExtractorError(u'Unable to parse video information: %s' % repr(err))

        return [info]
1974 class MyVideoIE(InfoExtractor):
1975 """Information Extractor for myvideo.de."""
1977 _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
1978 IE_NAME = u'myvideo'
1980 def _real_extract(self,url):
1981 mobj = re.match(self._VALID_URL, url)
1983 raise ExtractorError(u'Invalid URL: %s' % url)
1985 video_id = mobj.group(1)
1988 webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
1989 webpage = self._download_webpage(webpage_url, video_id)
1991 self.report_extraction(video_id)
1992 mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/.*?\.jpg\'',
1995 raise ExtractorError(u'Unable to extract media URL')
1996 video_url = mobj.group(1) + ('/%s.flv' % video_id)
1998 mobj = re.search('<title>([^<]+)</title>', webpage)
2000 raise ExtractorError(u'Unable to extract title')
2002 video_title = mobj.group(1)
2008 'upload_date': None,
2009 'title': video_title,
2013 class ComedyCentralIE(InfoExtractor):
2014 """Information extractor for The Daily Show and Colbert Report """
2016 # urls can be abbreviations like :thedailyshow or :colbert
2017 # urls for episodes like:
2018 # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
2019 # or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
2020 # or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
2021 _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
2022 |(https?://)?(www\.)?
2023 (?P<showname>thedailyshow|colbertnation)\.com/
2024 (full-episodes/(?P<episode>.*)|
2026 (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
2027 |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
2030 _available_formats = ['3500', '2200', '1700', '1200', '750', '400']
2032 _video_extensions = {
2040 _video_dimensions = {
def suitable(cls, url):
    """Tell whether this extractor can handle *url*.

    The pattern is matched with re.VERBOSE because _VALID_URL is
    written as a multi-line, commented regular expression.
    """
    pattern_match = re.match(cls._VALID_URL, url, re.VERBOSE)
    return pattern_match is not None
2054 def _print_formats(self, formats):
2055 print('Available formats:')
2057 print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))
2060 def _real_extract(self, url):
2061 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2063 raise ExtractorError(u'Invalid URL: %s' % url)
2065 if mobj.group('shortname'):
2066 if mobj.group('shortname') in ('tds', 'thedailyshow'):
2067 url = u'http://www.thedailyshow.com/full-episodes/'
2069 url = u'http://www.colbertnation.com/full-episodes/'
2070 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2071 assert mobj is not None
2073 if mobj.group('clip'):
2074 if mobj.group('showname') == 'thedailyshow':
2075 epTitle = mobj.group('tdstitle')
2077 epTitle = mobj.group('cntitle')
2080 dlNewest = not mobj.group('episode')
2082 epTitle = mobj.group('showname')
2084 epTitle = mobj.group('episode')
2086 self.report_extraction(epTitle)
2087 webpage,htmlHandle = self._download_webpage_handle(url, epTitle)
2089 url = htmlHandle.geturl()
2090 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2092 raise ExtractorError(u'Invalid redirected URL: ' + url)
2093 if mobj.group('episode') == '':
2094 raise ExtractorError(u'Redirected URL is still not specific: ' + url)
2095 epTitle = mobj.group('episode')
2097 mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)
2099 if len(mMovieParams) == 0:
2100 # The Colbert Report embeds the information in a without
2101 # a URL prefix; so extract the alternate reference
2102 # and then add the URL prefix manually.
2104 altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
2105 if len(altMovieParams) == 0:
2106 raise ExtractorError(u'unable to find Flash URL in webpage ' + url)
2108 mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]
2110 uri = mMovieParams[0][1]
2111 indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
2112 indexXml = self._download_webpage(indexUrl, epTitle,
2113 u'Downloading show index',
2114 u'unable to download episode index')
2118 idoc = xml.etree.ElementTree.fromstring(indexXml)
2119 itemEls = idoc.findall('.//item')
2120 for partNum,itemEl in enumerate(itemEls):
2121 mediaId = itemEl.findall('./guid')[0].text
2122 shortMediaId = mediaId.split(':')[-1]
2123 showId = mediaId.split(':')[-2].replace('.com', '')
2124 officialTitle = itemEl.findall('./title')[0].text
2125 officialDate = unified_strdate(itemEl.findall('./pubDate')[0].text)
2127 configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
2128 compat_urllib_parse.urlencode({'uri': mediaId}))
2129 configXml = self._download_webpage(configUrl, epTitle,
2130 u'Downloading configuration for %s' % shortMediaId)
2132 cdoc = xml.etree.ElementTree.fromstring(configXml)
2134 for rendition in cdoc.findall('.//rendition'):
2135 finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
2139 self._downloader.report_error(u'unable to download ' + mediaId + ': No videos found')
2142 if self._downloader.params.get('listformats', None):
2143 self._print_formats([i[0] for i in turls])
2146 # For now, just pick the highest bitrate
2147 format,rtmp_video_url = turls[-1]
2149 # Get the format arg from the arg stream
2150 req_format = self._downloader.params.get('format', None)
2152 # Select format if we can find one
2155 format, rtmp_video_url = f, v
2158 m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
2160 raise ExtractorError(u'Cannot transform RTMP url')
2161 base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
2162 video_url = base + m.group('finalid')
2164 effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
2169 'upload_date': officialDate,
2174 'description': officialTitle,
2176 results.append(info)
2181 class EscapistIE(InfoExtractor):
2182 """Information extractor for The Escapist """
2184 _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
2185 IE_NAME = u'escapist'
2187 def _real_extract(self, url):
2188 mobj = re.match(self._VALID_URL, url)
2190 raise ExtractorError(u'Invalid URL: %s' % url)
2191 showName = mobj.group('showname')
2192 videoId = mobj.group('episode')
2194 self.report_extraction(showName)
2195 webPage = self._download_webpage(url, showName)
2197 descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
2198 description = unescapeHTML(descMatch.group(1))
2199 imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
2200 imgUrl = unescapeHTML(imgMatch.group(1))
2201 playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
2202 playerUrl = unescapeHTML(playerUrlMatch.group(1))
2203 configUrlMatch = re.search('config=(.*)$', playerUrl)
2204 configUrl = compat_urllib_parse.unquote(configUrlMatch.group(1))
2206 configJSON = self._download_webpage(configUrl, showName,
2207 u'Downloading configuration',
2208 u'unable to download configuration')
2210 # Technically, it's JavaScript, not JSON
2211 configJSON = configJSON.replace("'", '"')
2214 config = json.loads(configJSON)
2215 except (ValueError,) as err:
2216 raise ExtractorError(u'Invalid JSON in configuration file: ' + compat_str(err))
2218 playlist = config['playlist']
2219 videoUrl = playlist[1]['url']
2224 'uploader': showName,
2225 'upload_date': None,
2228 'thumbnail': imgUrl,
2229 'description': description,
2230 'player_url': playerUrl,
2235 class CollegeHumorIE(InfoExtractor):
2236 """Information extractor for collegehumor.com"""
2239 _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
2240 IE_NAME = u'collegehumor'
def report_manifest(self, video_id):
    """Announce that the XML manifest for *video_id* is being fetched."""
    notice = u'%s: Downloading XML manifest' % video_id
    self.to_screen(notice)
2246 def _real_extract(self, url):
2247 mobj = re.match(self._VALID_URL, url)
2249 raise ExtractorError(u'Invalid URL: %s' % url)
2250 video_id = mobj.group('videoid')
2255 'upload_date': None,
2258 self.report_extraction(video_id)
2259 xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
2261 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
2262 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2263 raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
2265 mdoc = xml.etree.ElementTree.fromstring(metaXml)
2267 videoNode = mdoc.findall('./video')[0]
2268 info['description'] = videoNode.findall('./description')[0].text
2269 info['title'] = videoNode.findall('./caption')[0].text
2270 info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
2271 manifest_url = videoNode.findall('./file')[0].text
2273 raise ExtractorError(u'Invalid metadata XML file')
2275 manifest_url += '?hdcore=2.10.3'
2276 self.report_manifest(video_id)
2278 manifestXml = compat_urllib_request.urlopen(manifest_url).read()
2279 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2280 raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
2282 adoc = xml.etree.ElementTree.fromstring(manifestXml)
2284 media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
2285 node_id = media_node.attrib['url']
2286 video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
2287 except IndexError as err:
2288 raise ExtractorError(u'Invalid manifest file')
2290 url_pr = compat_urllib_parse_urlparse(manifest_url)
2291 url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'
2298 class XVideosIE(InfoExtractor):
2299 """Information extractor for xvideos.com"""
2301 _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
2302 IE_NAME = u'xvideos'
2304 def _real_extract(self, url):
2305 mobj = re.match(self._VALID_URL, url)
2307 raise ExtractorError(u'Invalid URL: %s' % url)
2308 video_id = mobj.group(1)
2310 webpage = self._download_webpage(url, video_id)
2312 self.report_extraction(video_id)
2316 mobj = re.search(r'flv_url=(.+?)&', webpage)
2318 raise ExtractorError(u'Unable to extract video url')
2319 video_url = compat_urllib_parse.unquote(mobj.group(1))
2323 mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
2325 raise ExtractorError(u'Unable to extract video title')
2326 video_title = mobj.group(1)
2329 # Extract video thumbnail
2330 mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
2332 raise ExtractorError(u'Unable to extract video thumbnail')
2333 video_thumbnail = mobj.group(0)
2339 'upload_date': None,
2340 'title': video_title,
2342 'thumbnail': video_thumbnail,
2343 'description': None,
2349 class SoundcloudIE(InfoExtractor):
2350 """Information extractor for soundcloud.com
2351 To access the media, the uid of the song and a stream token
2352 must be extracted from the page source and the script must make
2353 a request to media.soundcloud.com/crossdomain.xml. Then
2354 the media can be grabbed by requesting from an url composed
2355 of the stream token and uid
2358 _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
2359 IE_NAME = u'soundcloud'
def report_resolve(self, video_id):
    """Announce that the track permalink is being resolved to an id."""
    notice = u'%s: Resolving id' % video_id
    self.to_screen(notice)
2365 def _real_extract(self, url):
2366 mobj = re.match(self._VALID_URL, url)
2368 raise ExtractorError(u'Invalid URL: %s' % url)
2370 # extract uploader (which is in the url)
2371 uploader = mobj.group(1)
2372 # extract simple title (uploader + slug of song title)
2373 slug_title = mobj.group(2)
2374 simple_title = uploader + u'-' + slug_title
2375 full_title = '%s/%s' % (uploader, slug_title)
2377 self.report_resolve(full_title)
2379 url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
2380 resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2381 info_json = self._download_webpage(resolv_url, full_title, u'Downloading info JSON')
2383 info = json.loads(info_json)
2384 video_id = info['id']
2385 self.report_extraction(full_title)
2387 streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2388 stream_json = self._download_webpage(streams_url, full_title,
2389 u'Downloading stream definitions',
2390 u'unable to download stream definitions')
2392 streams = json.loads(stream_json)
2393 mediaURL = streams['http_mp3_128_url']
2394 upload_date = unified_strdate(info['created_at'])
2399 'uploader': info['user']['username'],
2400 'upload_date': upload_date,
2401 'title': info['title'],
2403 'description': info['description'],
2406 class SoundcloudSetIE(InfoExtractor):
2407 """Information extractor for soundcloud.com sets
2408 To access the media, the uid of the song and a stream token
2409 must be extracted from the page source and the script must make
2410 a request to media.soundcloud.com/crossdomain.xml. Then
2411 the media can be grabbed by requesting from an url composed
2412 of the stream token and uid
2415 _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/sets/([\w\d-]+)'
2416 IE_NAME = u'soundcloud:set'
def report_resolve(self, video_id):
    """Announce resolution of the set permalink to its numeric id."""
    status_line = u'%s: Resolving id' % video_id
    self.to_screen(status_line)
2422 def _real_extract(self, url):
2423 mobj = re.match(self._VALID_URL, url)
2425 raise ExtractorError(u'Invalid URL: %s' % url)
2427 # extract uploader (which is in the url)
2428 uploader = mobj.group(1)
2429 # extract simple title (uploader + slug of song title)
2430 slug_title = mobj.group(2)
2431 simple_title = uploader + u'-' + slug_title
2432 full_title = '%s/sets/%s' % (uploader, slug_title)
2434 self.report_resolve(full_title)
2436 url = 'http://soundcloud.com/%s/sets/%s' % (uploader, slug_title)
2437 resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2438 info_json = self._download_webpage(resolv_url, full_title)
2441 info = json.loads(info_json)
2442 if 'errors' in info:
2443 for err in info['errors']:
2444 self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err['error_message']))
2447 self.report_extraction(full_title)
2448 for track in info['tracks']:
2449 video_id = track['id']
2451 streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2452 stream_json = self._download_webpage(streams_url, video_id, u'Downloading track info JSON')
2454 self.report_extraction(video_id)
2455 streams = json.loads(stream_json)
2456 mediaURL = streams['http_mp3_128_url']
2461 'uploader': track['user']['username'],
2462 'upload_date': unified_strdate(track['created_at']),
2463 'title': track['title'],
2465 'description': track['description'],
2470 class InfoQIE(InfoExtractor):
2471 """Information extractor for infoq.com"""
2472 _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
2474 def _real_extract(self, url):
2475 mobj = re.match(self._VALID_URL, url)
2477 raise ExtractorError(u'Invalid URL: %s' % url)
2479 webpage = self._download_webpage(url, video_id=url)
2480 self.report_extraction(url)
2483 mobj = re.search(r"jsclassref ?= ?'([^']*)'", webpage)
2485 raise ExtractorError(u'Unable to extract video url')
2486 real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
2487 video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id
2490 mobj = re.search(r'contentTitle = "(.*?)";', webpage)
2492 raise ExtractorError(u'Unable to extract video title')
2493 video_title = mobj.group(1)
2495 # Extract description
2496 video_description = u'No description available.'
2497 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
2498 if mobj is not None:
2499 video_description = mobj.group(1)
2501 video_filename = video_url.split('/')[-1]
2502 video_id, extension = video_filename.split('.')
2508 'upload_date': None,
2509 'title': video_title,
2510 'ext': extension, # Extension is always(?) mp4, but seems to be flv
2512 'description': video_description,
2517 class MixcloudIE(InfoExtractor):
2518 """Information extractor for www.mixcloud.com"""
2520 _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
2521 _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
2522 IE_NAME = u'mixcloud'
def report_download_json(self, file_id):
    """Announce the metadata JSON download.

    NOTE(review): *file_id* is accepted for interface symmetry with
    the other report_* helpers but is not echoed in the message,
    matching the original behaviour.
    """
    self.to_screen(u'Downloading json')
2528 def get_urls(self, jsonData, fmt, bitrate='best'):
2529 """Get urls from 'audio_formats' section in json"""
2532 bitrate_list = jsonData[fmt]
2533 if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
2534 bitrate = max(bitrate_list) # select highest
2536 url_list = jsonData[fmt][bitrate]
2537 except TypeError: # we have no bitrate info.
2538 url_list = jsonData[fmt]
2541 def check_urls(self, url_list):
2542 """Returns 1st active url from list"""
2543 for url in url_list:
2545 compat_urllib_request.urlopen(url)
2547 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2552 def _print_formats(self, formats):
2553 print('Available formats:')
2554 for fmt in formats.keys():
2555 for b in formats[fmt]:
2557 ext = formats[fmt][b][0]
2558 print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
2559 except TypeError: # we have no bitrate info
2560 ext = formats[fmt][0]
2561 print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
2564 def _real_extract(self, url):
2565 mobj = re.match(self._VALID_URL, url)
2567 raise ExtractorError(u'Invalid URL: %s' % url)
2568 # extract uploader & filename from url
2569 uploader = mobj.group(1).decode('utf-8')
2570 file_id = uploader + "-" + mobj.group(2).decode('utf-8')
2572 # construct API request
2573 file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
2574 # retrieve .json file with links to files
2575 request = compat_urllib_request.Request(file_url)
2577 self.report_download_json(file_url)
2578 jsonData = compat_urllib_request.urlopen(request).read()
2579 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2580 raise ExtractorError(u'Unable to retrieve file: %s' % compat_str(err))
2583 json_data = json.loads(jsonData)
2584 player_url = json_data['player_swf_url']
2585 formats = dict(json_data['audio_formats'])
2587 req_format = self._downloader.params.get('format', None)
2590 if self._downloader.params.get('listformats', None):
2591 self._print_formats(formats)
2594 if req_format is None or req_format == 'best':
2595 for format_param in formats.keys():
2596 url_list = self.get_urls(formats, format_param)
2598 file_url = self.check_urls(url_list)
2599 if file_url is not None:
2602 if req_format not in formats:
2603 raise ExtractorError(u'Format is not available')
2605 url_list = self.get_urls(formats, req_format)
2606 file_url = self.check_urls(url_list)
2607 format_param = req_format
2610 'id': file_id.decode('utf-8'),
2611 'url': file_url.decode('utf-8'),
2612 'uploader': uploader.decode('utf-8'),
2613 'upload_date': None,
2614 'title': json_data['name'],
2615 'ext': file_url.split('.')[-1].decode('utf-8'),
2616 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
2617 'thumbnail': json_data['thumbnail_url'],
2618 'description': json_data['description'],
2619 'player_url': player_url.decode('utf-8'),
2622 class StanfordOpenClassroomIE(InfoExtractor):
2623 """Information extractor for Stanford's Open ClassRoom"""
2625 _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
2626 IE_NAME = u'stanfordoc'
2628 def _real_extract(self, url):
2629 mobj = re.match(self._VALID_URL, url)
2631 raise ExtractorError(u'Invalid URL: %s' % url)
2633 if mobj.group('course') and mobj.group('video'): # A specific video
2634 course = mobj.group('course')
2635 video = mobj.group('video')
2637 'id': course + '_' + video,
2639 'upload_date': None,
2642 self.report_extraction(info['id'])
2643 baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
2644 xmlUrl = baseUrl + video + '.xml'
2646 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
2647 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2648 raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
2649 mdoc = xml.etree.ElementTree.fromstring(metaXml)
2651 info['title'] = mdoc.findall('./title')[0].text
2652 info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
2654 raise ExtractorError(u'Invalid metadata XML file')
2655 info['ext'] = info['url'].rpartition('.')[2]
2657 elif mobj.group('course'): # A course page
2658 course = mobj.group('course')
2663 'upload_date': None,
2666 coursepage = self._download_webpage(url, info['id'],
2667 note='Downloading course info page',
2668 errnote='Unable to download course info page')
2670 m = re.search('<h1>([^<]+)</h1>', coursepage)
2672 info['title'] = unescapeHTML(m.group(1))
2674 info['title'] = info['id']
2676 m = re.search('<description>([^<]+)</description>', coursepage)
2678 info['description'] = unescapeHTML(m.group(1))
2680 links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
2683 'type': 'reference',
2684 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
2688 for entry in info['list']:
2689 assert entry['type'] == 'reference'
2690 results += self.extract(entry['url'])
2694 'id': 'Stanford OpenClassroom',
2697 'upload_date': None,
2700 self.report_download_webpage(info['id'])
2701 rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
2703 rootpage = compat_urllib_request.urlopen(rootURL).read()
2704 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2705 raise ExtractorError(u'Unable to download course info page: ' + compat_str(err))
2707 info['title'] = info['id']
2709 links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
2712 'type': 'reference',
2713 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
2718 for entry in info['list']:
2719 assert entry['type'] == 'reference'
2720 results += self.extract(entry['url'])
2723 class MTVIE(InfoExtractor):
2724 """Information extractor for MTV.com"""
2726 _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
2729 def _real_extract(self, url):
2730 mobj = re.match(self._VALID_URL, url)
2732 raise ExtractorError(u'Invalid URL: %s' % url)
2733 if not mobj.group('proto'):
2734 url = 'http://' + url
2735 video_id = mobj.group('videoid')
2737 webpage = self._download_webpage(url, video_id)
2739 mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
2741 raise ExtractorError(u'Unable to extract song name')
2742 song_name = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
2743 mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
2745 raise ExtractorError(u'Unable to extract performer')
2746 performer = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
2747 video_title = performer + ' - ' + song_name
2749 mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
2751 raise ExtractorError(u'Unable to mtvn_uri')
2752 mtvn_uri = mobj.group(1)
2754 mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
2756 raise ExtractorError(u'Unable to extract content id')
2757 content_id = mobj.group(1)
2759 videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
2760 self.report_extraction(video_id)
2761 request = compat_urllib_request.Request(videogen_url)
2763 metadataXml = compat_urllib_request.urlopen(request).read()
2764 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2765 raise ExtractorError(u'Unable to download video metadata: %s' % compat_str(err))
2767 mdoc = xml.etree.ElementTree.fromstring(metadataXml)
2768 renditions = mdoc.findall('.//rendition')
2770 # For now, always pick the highest quality.
2771 rendition = renditions[-1]
2774 _,_,ext = rendition.attrib['type'].partition('/')
2775 format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
2776 video_url = rendition.find('./src').text
2778 raise ExtractorError('Invalid rendition field.')
2783 'uploader': performer,
2784 'upload_date': None,
2785 'title': video_title,
2793 class YoukuIE(InfoExtractor):
2794 _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'
2797 nowTime = int(time.time() * 1000)
2798 random1 = random.randint(1000,1998)
2799 random2 = random.randint(1000,9999)
2801 return "%d%d%d" %(nowTime,random1,random2)
2803 def _get_file_ID_mix_string(self, seed):
2805 source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
2807 for i in range(len(source)):
2808 seed = (seed * 211 + 30031 ) % 65536
2809 index = math.floor(seed / 65536 * len(source) )
2810 mixed.append(source[int(index)])
2811 source.remove(source[int(index)])
2812 #return ''.join(mixed)
2815 def _get_file_id(self, fileId, seed):
2816 mixed = self._get_file_ID_mix_string(seed)
2817 ids = fileId.split('*')
2821 realId.append(mixed[int(ch)])
2822 return ''.join(realId)
2824 def _real_extract(self, url):
2825 mobj = re.match(self._VALID_URL, url)
2827 raise ExtractorError(u'Invalid URL: %s' % url)
2828 video_id = mobj.group('ID')
2830 info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id
2832 jsondata = self._download_webpage(info_url, video_id)
2834 self.report_extraction(video_id)
2836 config = json.loads(jsondata)
2838 video_title = config['data'][0]['title']
2839 seed = config['data'][0]['seed']
2841 format = self._downloader.params.get('format', None)
2842 supported_format = list(config['data'][0]['streamfileids'].keys())
2844 if format is None or format == 'best':
2845 if 'hd2' in supported_format:
2850 elif format == 'worst':
2858 fileid = config['data'][0]['streamfileids'][format]
2859 keys = [s['k'] for s in config['data'][0]['segs'][format]]
2860 except (UnicodeDecodeError, ValueError, KeyError):
2861 raise ExtractorError(u'Unable to extract info section')
2864 sid = self._gen_sid()
2865 fileid = self._get_file_id(fileid, seed)
2867 #column 8,9 of fileid represent the segment number
2868 #fileid[7:9] should be changed
2869 for index, key in enumerate(keys):
2871 temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
2872 download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)
2875 'id': '%s_part%02d' % (video_id, index),
2876 'url': download_url,
2878 'upload_date': None,
2879 'title': video_title,
2882 files_info.append(info)
2887 class XNXXIE(InfoExtractor):
2888 """Information extractor for xnxx.com"""
2890 _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
2892 VIDEO_URL_RE = r'flv_url=(.*?)&'
2893 VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
2894 VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&'
2896 def _real_extract(self, url):
2897 mobj = re.match(self._VALID_URL, url)
2899 raise ExtractorError(u'Invalid URL: %s' % url)
2900 video_id = mobj.group(1)
2902 # Get webpage content
2903 webpage = self._download_webpage(url, video_id)
2905 result = re.search(self.VIDEO_URL_RE, webpage)
2907 raise ExtractorError(u'Unable to extract video url')
2908 video_url = compat_urllib_parse.unquote(result.group(1))
2910 result = re.search(self.VIDEO_TITLE_RE, webpage)
2912 raise ExtractorError(u'Unable to extract video title')
2913 video_title = result.group(1)
2915 result = re.search(self.VIDEO_THUMB_RE, webpage)
2917 raise ExtractorError(u'Unable to extract video thumbnail')
2918 video_thumbnail = result.group(1)
2924 'upload_date': None,
2925 'title': video_title,
2927 'thumbnail': video_thumbnail,
2928 'description': None,
2932 class GooglePlusIE(InfoExtractor):
2933 """Information extractor for plus.google.com."""
2935 _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
2936 IE_NAME = u'plus.google'
def report_extract_entry(self, url):
    """Announce which Google+ post entry is being downloaded."""
    notice = u'Downloading entry: %s' % url
    self.to_screen(notice)
def report_date(self, upload_date):
    """Announce the upload date extracted for the entry."""
    notice = u'Entry date: %s' % upload_date
    self.to_screen(notice)
def report_uploader(self, uploader):
    """Announce the uploader name extracted for the entry."""
    notice = u'Uploader: %s' % uploader
    self.to_screen(notice)
def report_title(self, video_title):
    """Announce the title extracted for the entry."""
    notice = u'Title: %s' % video_title
    self.to_screen(notice)
def report_extract_vid_page(self, video_page):
    """Announce extraction of the actual video page URL."""
    notice = u'Extracting video page: %s' % video_page
    self.to_screen(notice)
2958 def _real_extract(self, url):
2959 # Extract id from URL
2960 mobj = re.match(self._VALID_URL, url)
2962 raise ExtractorError(u'Invalid URL: %s' % url)
2964 post_url = mobj.group(0)
2965 video_id = mobj.group(1)
2967 video_extension = 'flv'
2969 # Step 1, Retrieve post webpage to extract further information
2970 self.report_extract_entry(post_url)
2971 webpage = self._download_webpage(post_url, video_id, u'Downloading entry webpage')
2973 # Extract update date
2975 pattern = 'title="Timestamp">(.*?)</a>'
2976 mobj = re.search(pattern, webpage)
2978 upload_date = mobj.group(1)
2979 # Convert timestring to a format suitable for filename
2980 upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
2981 upload_date = upload_date.strftime('%Y%m%d')
2982 self.report_date(upload_date)
2986 pattern = r'rel\="author".*?>(.*?)</a>'
2987 mobj = re.search(pattern, webpage)
2989 uploader = mobj.group(1)
2990 self.report_uploader(uploader)
2993 # Get the first line for title
2995 pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
2996 mobj = re.search(pattern, webpage)
2998 video_title = mobj.group(1)
2999 self.report_title(video_title)
3001 # Step 2, Stimulate clicking the image box to launch video
3002 pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
3003 mobj = re.search(pattern, webpage)
3005 raise ExtractorError(u'Unable to extract video page URL')
3007 video_page = mobj.group(1)
3008 webpage = self._download_webpage(video_page, video_id, u'Downloading video page')
3009 self.report_extract_vid_page(video_page)
3012 # Extract video links on video page
3013 """Extract video links of all sizes"""
3014 pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
3015 mobj = re.findall(pattern, webpage)
3017 raise ExtractorError(u'Unable to extract video links')
3019 # Sort in resolution
3020 links = sorted(mobj)
3022 # Choose the lowest of the sort, i.e. highest resolution
3023 video_url = links[-1]
3024 # Only get the url. The resolution part in the tuple has no use anymore
3025 video_url = video_url[-1]
3026 # Treat escaped \u0026 style hex
3028 video_url = video_url.decode("unicode_escape")
3029 except AttributeError: # Python 3
3030 video_url = bytes(video_url, 'ascii').decode('unicode-escape')
3036 'uploader': uploader,
3037 'upload_date': upload_date,
3038 'title': video_title,
3039 'ext': video_extension,
class NBAIE(InfoExtractor):
    """Information extractor for nba.com video pages.

    The mp4 URL is derived directly from the page path; metadata (title,
    date, description) is scraped from the page HTML.
    """
    _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*)(\?.*)?$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group(1)
        # Strip a trailing '/index.html' so the id maps onto the CDN path.
        if video_id.endswith('/index.html'):
            video_id = video_id[:-len('/index.html')]

        webpage = self._download_webpage(url, video_id)

        # The downloadable mp4 lives at a predictable CDN location.
        video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'

        def _findProp(rexp, default=None):
            # First group of rexp in the page, HTML-unescaped; default when
            # the pattern does not match.
            m = re.search(rexp, webpage)
            if m:
                return unescapeHTML(m.group(1))
            else:
                return default

        shortened_video_id = video_id.rpartition('/')[2]
        title = _findProp(r'<meta property="og:title" content="(.*?)"', shortened_video_id).replace('NBA.com: ', '')
        info = {
            'id': shortened_video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            # Fixed: was 'uploader_date', a typo — the documented field name
            # consumed downstream is 'upload_date'.
            'upload_date': _findProp(r'<b>Date:</b> (.*?)</div>'),
            'description': _findProp(r'<div class="description">(.*?)</h1>'),
        }
        return [info]
# Extractor for justin.tv / twitch.tv: channel archives (paged JSON API),
# single broadcasts ("/b/<id>"), and chapters ("/c/<id>", resolved via an
# XML broadcast listing plus the kraken JSON API).
# NOTE(review): this chunk is mangled — lines carry embedded original line
# numbers and several lines are elided (e.g. the `if ... is None:` guards
# before the bare `raise` statements, dict/loop openings). Restore the
# missing lines before running.
3077 class JustinTVIE(InfoExtractor):
3078 """Information extractor for justin.tv and twitch.tv"""
3079 # TODO: One broadcast may be split into multiple videos. The key
3080 # 'broadcast_id' is the same for all parts, and 'broadcast_part'
3081 # starts at 1 and increases. Can we treat all parts as one video?
3083 _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
3085 (?P<channelid>[^/]+)|
3086 (?:(?:[^/]+)/b/(?P<videoid>[^/]+))|
3087 (?:(?:[^/]+)/c/(?P<chapterid>[^/]+))
3091 _JUSTIN_PAGE_LIMIT = 100
3092 IE_NAME = u'justin.tv'
3094 def report_download_page(self, channel, offset):
3095 """Report attempt to download a single page of videos."""
3096 self.to_screen(u'%s: Downloading video information from %d to %d' %
3097 (channel, offset, offset + self._JUSTIN_PAGE_LIMIT))
3099 # Return count of items, list of *valid* items
3100 def _parse_page(self, url, video_id):
3101 webpage = self._download_webpage(url, video_id,
3102 u'Downloading video info JSON',
3103 u'unable to download video info JSON')
3105 response = json.loads(webpage)
# On API errors the endpoint returns a dict with an 'error' key instead
# of the expected list of clips.
3106 if type(response) != list:
3107 error_text = response.get('error', 'unknown error')
3108 raise ExtractorError(u'Justin.tv API: %s' % error_text)
3110 for clip in response:
3111 video_url = clip['video_file_url']
3113 video_extension = os.path.splitext(video_url)[1][1:]
# start_time is ISO-ish; keep YYYYMMDD by dropping the dashes.
3114 video_date = re.sub('-', '', clip['start_time'][:10])
3115 video_uploader_id = clip.get('user_id', clip.get('channel_id'))
3116 video_id = clip['id']
3117 video_title = clip.get('title', video_id)
3121 'title': video_title,
3122 'uploader': clip.get('channel_name', video_uploader_id),
3123 'uploader_id': video_uploader_id,
3124 'upload_date': video_date,
3125 'ext': video_extension,
3127 return (len(response), info)
3129 def _real_extract(self, url):
3130 mobj = re.match(self._VALID_URL, url)
3132 raise ExtractorError(u'invalid URL: %s' % url)
3134 api_base = 'http://api.justin.tv'
3136 if mobj.group('channelid'):
3138 video_id = mobj.group('channelid')
3139 api = api_base + '/channel/archives/%s.json' % video_id
3140 elif mobj.group('chapterid'):
3141 chapter_id = mobj.group('chapterid')
# The chapter page embeds the archive id the chapter belongs to.
3143 webpage = self._download_webpage(url, chapter_id)
3144 m = re.search(r'PP\.archive_id = "([0-9]+)";', webpage)
3146 raise ExtractorError(u'Cannot find archive of a chapter')
3147 archive_id = m.group(1)
3149 api = api_base + '/broadcast/by_chapter/%s.xml' % chapter_id
3150 chapter_info_xml = self._download_webpage(api, chapter_id,
3151 note=u'Downloading chapter information',
3152 errnote=u'Chapter information download failed')
3153 doc = xml.etree.ElementTree.fromstring(chapter_info_xml)
# Locate the <archive> element matching the archive id found above;
# `a` is reused after the loop (the elided lines presumably break out).
3154 for a in doc.findall('.//archive'):
3155 if archive_id == a.find('./id').text:
3158 raise ExtractorError(u'Could not find chapter in chapter information')
3160 video_url = a.find('./video_file_url').text
3161 video_ext = video_url.rpartition('.')[2] or u'flv'
3163 chapter_api_url = u'https://api.twitch.tv/kraken/videos/c' + chapter_id
3164 chapter_info_json = self._download_webpage(chapter_api_url, u'c' + chapter_id,
3165 note='Downloading chapter metadata',
3166 errnote='Download of chapter metadata failed')
3167 chapter_info = json.loads(chapter_info_json)
3169 bracket_start = int(doc.find('.//bracket_start').text)
3170 bracket_end = int(doc.find('.//bracket_end').text)
3172 # TODO determine start (and probably fix up file)
3173 # youtube-dl -v http://www.twitch.tv/firmbelief/c/1757457
3174 #video_url += u'?start=' + TODO:start_timestamp
3175 # bracket_start is 13290, but we want 51670615
3176 self._downloader.report_warning(u'Chapter detected, but we can just download the whole file. '
3177 u'Chapter starts at %s and ends at %s' % (formatSeconds(bracket_start), formatSeconds(bracket_end)))
3180 'id': u'c' + chapter_id,
3183 'title': chapter_info['title'],
3184 'thumbnail': chapter_info['preview'],
3185 'description': chapter_info['description'],
3186 'uploader': chapter_info['channel']['display_name'],
3187 'uploader_id': chapter_info['channel']['name'],
3191 video_id = mobj.group('videoid')
3192 api = api_base + '/broadcast/by_archive/%s.json' % video_id
3194 self.report_extraction(video_id)
# Page through the archive API until a short page signals the end
# (loop header and offset bookkeeping elided in this chunk).
3198 limit = self._JUSTIN_PAGE_LIMIT
3201 self.report_download_page(video_id, offset)
3202 page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
3203 page_count, page_info = self._parse_page(page_url, video_id)
3204 info.extend(page_info)
3205 if not paged or page_count != limit:
# Extractor for funnyordie.com: scrapes the <video>/<source> tag for the
# media URL, then falls back from the player <h1> to the page <title> for
# the title, and reads og:description.
# NOTE(review): mangled chunk — embedded line numbers, and the `if ... is
# None:` guards and the final info-dict lines are elided.
3210 class FunnyOrDieIE(InfoExtractor):
3211 _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'
3213 def _real_extract(self, url):
3214 mobj = re.match(self._VALID_URL, url)
3216 raise ExtractorError(u'invalid URL: %s' % url)
3218 video_id = mobj.group('id')
3219 webpage = self._download_webpage(url, video_id)
3221 m = re.search(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"', webpage, re.DOTALL)
3223 raise ExtractorError(u'Unable to find video information')
3224 video_url = unescapeHTML(m.group('url'))
# Prefer the player page heading; fall back to the document title.
3226 m = re.search(r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>", webpage, flags=re.DOTALL)
3228 m = re.search(r'<title>(?P<title>[^<]+?)</title>', webpage)
3230 raise ExtractorError(u'Cannot find video title')
3231 title = clean_html(m.group('title'))
3233 m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
3235 desc = unescapeHTML(m.group('desc'))
3244 'description': desc,
# Extractor for Steam store video/app pages. Bypasses the age gate via a
# pre-filled agecheck URL, then zips together three finditer scans
# (movie URLs, titles, thumbnails) and returns a playlist result.
# NOTE(review): mangled chunk — embedded line numbers; the verbose-regex
# body of _VALID_URL (including the 'gameID' group referenced below) and
# the per-video info-dict lines are elided.
3248 class SteamIE(InfoExtractor):
3249 _VALID_URL = r"""http://store\.steampowered\.com/
3251 (?P<urltype>video|app)/ #If the page is only for videos or for a game
3253 (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
# suitable() is overridden because _VALID_URL needs re.VERBOSE.
3257 def suitable(cls, url):
3258 """Receives a URL and returns True if suitable for this IE."""
3259 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
3261 def _real_extract(self, url):
3262 m = re.match(self._VALID_URL, url, re.VERBOSE)
3263 gameID = m.group('gameID')
3264 videourl = 'http://store.steampowered.com/agecheck/video/%s/?snr=1_agecheck_agecheck__age-gate&ageDay=1&ageMonth=January&ageYear=1970' % gameID
3265 self.report_age_confirmation()
3266 webpage = self._download_webpage(videourl, gameID)
3267 game_title = re.search(r'<h2 class="pageheader">(?P<game_title>.*?)</h2>', webpage).group('game_title')
3269 urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
3270 mweb = re.finditer(urlRE, webpage)
3271 namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
3272 titles = re.finditer(namesRE, webpage)
3273 thumbsRE = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'
3274 thumbs = re.finditer(thumbsRE, webpage)
# zip() pairs the three scans positionally — assumes the page lists
# URLs, titles and thumbs in the same order. TODO confirm.
3276 for vid,vtitle,thumb in zip(mweb,titles,thumbs):
3277 video_id = vid.group('videoID')
3278 title = vtitle.group('videoName')
3279 video_url = vid.group('videoURL')
3280 video_thumb = thumb.group('thumbnail')
3282 raise ExtractorError(u'Cannot find video url for %s' % video_id)
3287 'title': unescapeHTML(title),
3288 'thumbnail': video_thumb
3291 return [self.playlist_result(videos, gameID, game_title)]
# Extractor for ustream.tv recorded videos: the media URL is derived from
# the numeric id; title and uploader are scraped from the page.
# NOTE(review): mangled chunk — embedded line numbers; the final info-dict
# lines are elided.
3293 class UstreamIE(InfoExtractor):
3294 _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
3295 IE_NAME = u'ustream'
3297 def _real_extract(self, url):
3298 m = re.match(self._VALID_URL, url)
3299 video_id = m.group('videoID')
3300 video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
3301 webpage = self._download_webpage(url, video_id)
# No None-guards here: a page without these attributes would raise
# AttributeError on .group(). NOTE(review): consider guarding.
3302 m = re.search(r'data-title="(?P<title>.+)"',webpage)
3303 title = m.group('title')
3304 m = re.search(r'<a class="state" data-content-type="channel" data-content-id="(?P<uploader>\d+)"',webpage)
3305 uploader = m.group('uploader')
3311 'uploader': uploader
# Extractor for worldstarhiphop.com / worldstarcandy.com: pulls the file
# URL out of a flash `so.addVariable("file", ...)` call, then the title
# from <title>, with a candy-specific title override.
# NOTE(review): mangled chunk — embedded line numbers; the mp4/flv ext
# branches and the final info-dict lines are elided.
3315 class WorldStarHipHopIE(InfoExtractor):
3316 _VALID_URL = r'https?://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
3317 IE_NAME = u'WorldStarHipHop'
3319 def _real_extract(self, url):
3320 _src_url = r'so\.addVariable\("file","(.*?)"\)'
3322 m = re.match(self._VALID_URL, url)
3323 video_id = m.group('id')
3325 webpage_src = self._download_webpage(url, video_id)
3327 mobj = re.search(_src_url, webpage_src)
3329 if mobj is not None:
3330 video_url = mobj.group(1)
3331 if 'mp4' in video_url:
3336 raise ExtractorError(u'Cannot find video url for %s' % video_id)
3338 mobj = re.search(r"<title>(.*)</title>", webpage_src)
3341 raise ExtractorError(u'Cannot determine title')
3342 title = mobj.group(1)
3344 mobj = re.search(r'rel="image_src" href="(.*)" />', webpage_src)
3345 # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video.
3346 if mobj is not None:
3347 thumbnail = mobj.group(1)
# No image_src: candy pages — take the title from the candytitles span.
3349 _title = r"""candytitles.*>(.*)</span>"""
3350 mobj = re.search(_title, webpage_src)
3351 if mobj is not None:
3352 title = mobj.group(1)
3359 'thumbnail' : thumbnail,
# Extractor for rbmaradio.com shows: metadata is a JSON blob assigned to
# `gon.show` inside an inline <script>; the audio URL comes from its
# 'akamai_url' with a fixed 256k cbr parameter appended.
# NOTE(review): mangled chunk — embedded line numbers; the `try:` opener
# before json.loads and the final info-dict lines are elided.
3364 class RBMARadioIE(InfoExtractor):
3365 _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'
3367 def _real_extract(self, url):
3368 m = re.match(self._VALID_URL, url)
3369 video_id = m.group('videoID')
3371 webpage = self._download_webpage(url, video_id)
3372 m = re.search(r'<script>window.gon = {.*?};gon\.show=(.+?);</script>', webpage)
3374 raise ExtractorError(u'Cannot find metadata')
3375 json_data = m.group(1)
3378 data = json.loads(json_data)
3379 except ValueError as e:
3380 raise ExtractorError(u'Invalid JSON: ' + str(e))
3382 video_url = data['akamai_url'] + '&cbr=256'
3383 url_parts = compat_urllib_parse_urlparse(video_url)
# Extension taken from the URL path, not the query string.
3384 video_ext = url_parts.path.rpartition('.')[2]
3389 'title': data['title'],
3390 'description': data.get('teaser_text'),
3391 'location': data.get('country_of_origin'),
3392 'uploader': data.get('host', {}).get('name'),
3393 'uploader_id': data.get('host', {}).get('slug'),
3394 'thumbnail': data.get('image', {}).get('large_url_2x'),
3395 'duration': data.get('duration'),
# Extractor for youporn.com with multi-format support: scrapes the
# download list, derives size/bitrate from each link's URL path, and
# honors --list-formats / -f via the downloader params.
# NOTE(review): mangled chunk — embedded line numbers; guards, the link
# loop header, the per-format info dicts, and the format-sort lines are
# elided.
3400 class YouPornIE(InfoExtractor):
3401 """Information extractor for youporn.com."""
3402 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'
3404 def _print_formats(self, formats):
3405 """Print all available formats"""
3406 print(u'Available formats:')
3407 print(u'ext\t\tformat')
3408 print(u'---------------------------------')
3409 for format in formats:
3410 print(u'%s\t\t%s' % (format['ext'], format['format']))
# Returns the single format dict matching req_format (loop/return elided).
3412 def _specific(self, req_format, formats):
3414 if(x["format"]==req_format):
3418 def _real_extract(self, url):
3419 mobj = re.match(self._VALID_URL, url)
3421 raise ExtractorError(u'Invalid URL: %s' % url)
3423 video_id = mobj.group('videoid')
# The age gate is bypassed with a cookie rather than a form post.
3425 req = compat_urllib_request.Request(url)
3426 req.add_header('Cookie', 'age_verified=1')
3427 webpage = self._download_webpage(req, video_id)
3429 # Get the video title
3430 result = re.search(r'<h1.*?>(?P<title>.*)</h1>', webpage)
3432 raise ExtractorError(u'Unable to extract video title')
3433 video_title = result.group('title').strip()
3435 # Get the video date
3436 result = re.search(r'Date:</label>(?P<date>.*) </li>', webpage)
3438 self._downloader.report_warning(u'unable to extract video date')
3441 upload_date = unified_strdate(result.group('date').strip())
3443 # Get the video uploader
3444 result = re.search(r'Submitted:</label>(?P<uploader>.*)</li>', webpage)
3446 self._downloader.report_warning(u'unable to extract uploader')
3447 video_uploader = None
3449 video_uploader = result.group('uploader').strip()
3450 video_uploader = clean_html( video_uploader )
3452 # Get all of the formats available
3453 DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
3454 result = re.search(DOWNLOAD_LIST_RE, webpage)
3456 raise ExtractorError(u'Unable to extract download list')
3457 download_list_html = result.group('download_list').strip()
3459 # Get all of the links from the page
3460 LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
3461 links = re.findall(LINK_RE, download_list_html)
3462 if(len(links) == 0):
3463 raise ExtractorError(u'ERROR: no known formats available for video')
3465 self.to_screen(u'Links found: %d' % len(links))
3470 # A link looks like this:
3471 # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
3472 # A path looks like this:
3473 # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
3474 video_url = unescapeHTML( link )
3475 path = compat_urllib_parse_urlparse( video_url ).path
3476 extension = os.path.splitext( path )[1][1:]
# Path segment 4 encodes "<size>_<bitrate>_<id>"; keep size + bitrate.
3477 format = path.split('/')[4].split('_')[:2]
3480 format = "-".join( format )
3481 title = u'%s-%s-%s' % (video_title, size, bitrate)
3486 'uploader': video_uploader,
3487 'upload_date': upload_date,
3492 'description': None,
3496 if self._downloader.params.get('listformats', None):
3497 self._print_formats(formats)
# Format selection: 'best' (default), 'worst', 'all'/-1, or a specific
# named format via _specific().
3500 req_format = self._downloader.params.get('format', None)
3501 self.to_screen(u'Format: %s' % req_format)
3503 if req_format is None or req_format == 'best':
3505 elif req_format == 'worst':
3506 return [formats[-1]]
3507 elif req_format in ('-1', 'all'):
3510 format = self._specific( req_format, formats )
3512 raise ExtractorError(u'Requested format not available')
# Extractor for pornotube.com: video URL from a flash `url:` parameter,
# upload date from the "Added ... by" line; title comes from the URL.
# NOTE(review): mangled chunk — embedded line numbers; `if result is
# None:` guards and parts of the final info dict are elided.
3517 class PornotubeIE(InfoExtractor):
3518 """Information extractor for pornotube.com."""
3519 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'
3521 def _real_extract(self, url):
3522 mobj = re.match(self._VALID_URL, url)
3524 raise ExtractorError(u'Invalid URL: %s' % url)
3526 video_id = mobj.group('videoid')
3527 video_title = mobj.group('title')
3529 # Get webpage content
3530 webpage = self._download_webpage(url, video_id)
3533 VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
3534 result = re.search(VIDEO_URL_RE, webpage)
3536 raise ExtractorError(u'Unable to extract video url')
3537 video_url = compat_urllib_parse.unquote(result.group('url'))
3539 #Get the uploaded date
3540 VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
3541 result = re.search(VIDEO_UPLOADED_RE, webpage)
# NOTE(review): this error message says "title" but the failure is the
# upload date — misleading; consider fixing the message.
3543 raise ExtractorError(u'Unable to extract video title')
3544 upload_date = unified_strdate(result.group('date'))
3546 info = {'id': video_id,
3549 'upload_date': upload_date,
3550 'title': video_title,
# Extractor for youjizz.com: finds the embed page from the watch page,
# then reads the file URL out of the embed page's flash variables.
# NOTE(review): mangled chunk — embedded line numbers; `if result is
# None:` guards and parts of the final info dict are elided.
3556 class YouJizzIE(InfoExtractor):
3557 """Information extractor for youjizz.com."""
3558 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'
3560 def _real_extract(self, url):
3561 mobj = re.match(self._VALID_URL, url)
3563 raise ExtractorError(u'Invalid URL: %s' % url)
3565 video_id = mobj.group('videoid')
3567 # Get webpage content
3568 webpage = self._download_webpage(url, video_id)
3570 # Get the video title
3571 result = re.search(r'<title>(?P<title>.*)</title>', webpage)
3573 raise ExtractorError(u'ERROR: unable to extract video title')
3574 video_title = result.group('title').strip()
3576 # Get the embed page
3577 result = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
3579 raise ExtractorError(u'ERROR: unable to extract embed page')
# video_id is rebound to the numeric id from the embed URL.
3581 embed_page_url = result.group(0).strip()
3582 video_id = result.group('videoid')
3584 webpage = self._download_webpage(embed_page_url, video_id)
3587 result = re.search(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);', webpage)
3589 raise ExtractorError(u'ERROR: unable to extract video url')
3590 video_url = result.group('source')
3592 info = {'id': video_id,
3594 'title': video_title,
3597 'player_url': embed_page_url}
# Extractor for 8tracks.com mixes: reads the PAGE.mix JSON from the page,
# then walks the play/next API with a random session id, collecting one
# entry per track until at_last_track.
# NOTE(review): mangled chunk — embedded line numbers; the assignment of
# `mix_id` (used below) and the per-track dict opener/`res.append` lines
# are elided.
3601 class EightTracksIE(InfoExtractor):
3603 _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'
3605 def _real_extract(self, url):
3606 mobj = re.match(self._VALID_URL, url)
3608 raise ExtractorError(u'Invalid URL: %s' % url)
3609 playlist_id = mobj.group('id')
3611 webpage = self._download_webpage(url, playlist_id)
3613 m = re.search(r"PAGE.mix = (.*?);\n", webpage, flags=re.DOTALL)
3615 raise ExtractorError(u'Cannot find trax information')
3616 json_like = m.group(1)
3617 data = json.loads(json_like)
# The play API needs a session token; a random int suffices.
3619 session = str(random.randint(0, 1000000000))
3621 track_count = data['tracks_count']
3622 first_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
3623 next_url = first_url
3625 for i in itertools.count():
3626 api_json = self._download_webpage(next_url, playlist_id,
3627 note=u'Downloading song information %s/%s' % (str(i+1), track_count),
3628 errnote=u'Failed to download song information')
3629 api_data = json.loads(api_json)
3630 track_data = api_data[u'set']['track']
3632 'id': track_data['id'],
3633 'url': track_data['track_file_stream_url'],
3634 'title': track_data['performer'] + u' - ' + track_data['name'],
3635 'raw_title': track_data['name'],
3636 'uploader_id': data['user']['login'],
3640 if api_data['set']['at_last_track']:
3642 next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
# Extractor for keek.com: media and thumbnail URLs derive from the id;
# title from og:title, uploader from the user-name block.
# NOTE(review): mangled chunk — embedded line numbers; the final info-dict
# lines are elided.
3645 class KeekIE(InfoExtractor):
3646 _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
3649 def _real_extract(self, url):
3650 m = re.match(self._VALID_URL, url)
3651 video_id = m.group('videoID')
3652 video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
3653 thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id
3654 webpage = self._download_webpage(url, video_id)
# No None-guards: missing matches would raise AttributeError here.
3655 m = re.search(r'<meta property="og:title" content="(?P<title>.*?)"', webpage)
3656 title = unescapeHTML(m.group('title'))
3657 m = re.search(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>', webpage)
3658 uploader = clean_html(m.group('uploader'))
3664 'thumbnail': thumbnail,
3665 'uploader': uploader
# Extractor for ted.com talks and playlists. Playlists are resolved into
# url_result entries pointing back at this IE; a single talk's download
# URL is built from the talk's mediaSlug.
# NOTE(review): mangled chunk — embedded line numbers; parts of the
# verbose _VALID_URL, the `else:` branch of _real_extract, the video_RE
# opener, and the final talk-info dict are elided.
3669 class TEDIE(InfoExtractor):
3670 _VALID_URL=r'''http://www\.ted\.com/
3672 ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
3674 ((?P<type_talk>talks)) # We have a simple talk
3676 (/lang/(.*?))? # The url may contain the language
3677 /(?P<name>\w+) # Here goes the name and then ".html"
# Overridden because _VALID_URL needs re.VERBOSE.
3681 def suitable(cls, url):
3682 """Receives a URL and returns True if suitable for this IE."""
3683 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
3685 def _real_extract(self, url):
3686 m=re.match(self._VALID_URL, url, re.VERBOSE)
3687 if m.group('type_talk'):
3688 return [self._talk_info(url)]
3690 playlist_id=m.group('playlist_id')
3691 name=m.group('name')
3692 self.to_screen(u'Getting info of playlist %s: "%s"' % (playlist_id,name))
3693 return [self._playlist_videos_info(url,name,playlist_id)]
3695 def _talk_video_link(self,mediaSlug):
3696 '''Returns the video link for that mediaSlug'''
3697 return 'http://download.ted.com/talks/%s.mp4' % mediaSlug
3699 def _playlist_videos_info(self,url,name,playlist_id=0):
3700 '''Returns the videos of the playlist'''
3702 <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
3703 ([.\s]*?)data-playlist_item_id="(\d+)"
3704 ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
3706 video_name_RE=r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
3707 webpage=self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
3708 m_videos=re.finditer(video_RE,webpage,re.VERBOSE)
3709 m_names=re.finditer(video_name_RE,webpage)
3711 playlist_RE = r'div class="headline">(\s*?)<h1>(\s*?)<span>(?P<playlist_title>.*?)</span>'
3712 m_playlist = re.search(playlist_RE, webpage)
3713 playlist_title = m_playlist.group('playlist_title')
# Each playlist item becomes a deferred url_result handled by TED again.
3715 playlist_entries = []
3716 for m_video, m_name in zip(m_videos,m_names):
3717 video_id=m_video.group('video_id')
3718 talk_url='http://www.ted.com%s' % m_name.group('talk_url')
3719 playlist_entries.append(self.url_result(talk_url, 'TED'))
3720 return self.playlist_result(playlist_entries, playlist_id = playlist_id, playlist_title = playlist_title)
3722 def _talk_info(self, url, video_id=0):
3723 """Return the video for the talk in the url"""
3724 m=re.match(self._VALID_URL, url,re.VERBOSE)
3725 videoName=m.group('name')
3726 webpage=self._download_webpage(url, video_id, 'Downloading \"%s\" page' % videoName)
3727 # If the url includes the language we get the title translated
3728 title_RE=r'<span id="altHeadline" >(?P<title>.*)</span>'
3729 title=re.search(title_RE, webpage).group('title')
3730 info_RE=r'''<script\ type="text/javascript">var\ talkDetails\ =(.*?)
3731 "id":(?P<videoID>[\d]+).*?
3732 "mediaSlug":"(?P<mediaSlug>[\w\d]+?)"'''
3733 thumb_RE=r'</span>[\s.]*</div>[\s.]*<img src="(?P<thumbnail>.*?)"'
3734 thumb_match=re.search(thumb_RE,webpage)
3735 info_match=re.search(info_RE,webpage,re.VERBOSE)
3736 video_id=info_match.group('videoID')
3737 mediaSlug=info_match.group('mediaSlug')
3738 video_url=self._talk_video_link(mediaSlug)
3744 'thumbnail': thumb_match.group('thumbnail')
# Extractor for myspass.de: the video id is the last URL path element
# (or the second-to-last if the URL ends with a slash); all metadata
# comes from an XML endpoint.
# NOTE(review): mangled chunk — embedded line numbers; the digit-check on
# video_id, default assignments for format/description/thumbnail, and the
# start of the final info dict are elided.
3748 class MySpassIE(InfoExtractor):
3749 _VALID_URL = r'http://www.myspass.de/.*'
3751 def _real_extract(self, url):
3752 META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'
3754 # video id is the last path element of the URL
3755 # usually there is a trailing slash, so also try the second but last
3756 url_path = compat_urllib_parse_urlparse(url).path
3757 url_parent_path, video_id = os.path.split(url_path)
3759 _, video_id = os.path.split(url_parent_path)
3762 metadata_url = META_DATA_URL_TEMPLATE % video_id
3763 metadata_text = self._download_webpage(metadata_url, video_id)
3764 metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))
3766 # extract values from metadata
# url_flv and title are mandatory; format/description/thumbnail optional.
3767 url_flv_el = metadata.find('url_flv')
3768 if url_flv_el is None:
3769 raise ExtractorError(u'Unable to extract download url')
3770 video_url = url_flv_el.text
3771 extension = os.path.splitext(video_url)[1][1:]
3772 title_el = metadata.find('title')
3773 if title_el is None:
3774 raise ExtractorError(u'Unable to extract title')
3775 title = title_el.text
3776 format_id_el = metadata.find('format_id')
3777 if format_id_el is None:
3780 format = format_id_el.text
3781 description_el = metadata.find('description')
3782 if description_el is not None:
3783 description = description_el.text
3786 imagePreview_el = metadata.find('imagePreview')
3787 if imagePreview_el is not None:
3788 thumbnail = imagePreview_el.text
3797 'thumbnail': thumbnail,
3798 'description': description
# Extractor for spiegel.de videos: title from the page, stream data from
# a per-video XML document; the last entry in the XML is taken as the
# best/default variant.
# NOTE(review): mangled chunk — embedded line numbers; the `if m is None:`
# guard and parts of the final info dict are elided.
3802 class SpiegelIE(InfoExtractor):
3803 _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'
3805 def _real_extract(self, url):
3806 m = re.match(self._VALID_URL, url)
3807 video_id = m.group('videoID')
3809 webpage = self._download_webpage(url, video_id)
3810 m = re.search(r'<div class="spVideoTitle">(.*?)</div>', webpage)
3812 raise ExtractorError(u'Cannot find title')
3813 video_title = unescapeHTML(m.group(1))
3815 xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
3816 xml_code = self._download_webpage(xml_url, video_id,
3817 note=u'Downloading XML', errnote=u'Failed to download XML')
3819 idoc = xml.etree.ElementTree.fromstring(xml_code)
# idoc[-1]: last <type> element — presumably the highest quality. TODO confirm.
3820 last_type = idoc[-1]
3821 filename = last_type.findall('./filename')[0].text
3822 duration = float(last_type.findall('./duration')[0].text)
3824 video_url = 'http://video2.spiegel.de/flash/' + filename
3825 video_ext = filename.rpartition('.')[2]
3830 'title': video_title,
3831 'duration': duration,
# Extractor for liveleak.com: file URL from the player config, title from
# og:title (with the site prefix stripped), optional description and
# uploader scraped best-effort.
# NOTE(review): mangled chunk — embedded line numbers; the `if m is None:`
# guards, `else: desc/uploader = None` branches and the final info dict
# are elided.
3835 class LiveLeakIE(InfoExtractor):
3837 _VALID_URL = r'^(?:http?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
3838 IE_NAME = u'liveleak'
3840 def _real_extract(self, url):
3841 mobj = re.match(self._VALID_URL, url)
3843 raise ExtractorError(u'Invalid URL: %s' % url)
3845 video_id = mobj.group('video_id')
3847 webpage = self._download_webpage(url, video_id)
3849 m = re.search(r'file: "(.*?)",', webpage)
3851 raise ExtractorError(u'Unable to find video url')
3852 video_url = m.group(1)
3854 m = re.search(r'<meta property="og:title" content="(?P<title>.*?)"', webpage)
3856 raise ExtractorError(u'Cannot find video title')
3857 title = unescapeHTML(m.group('title')).replace('LiveLeak.com -', '').strip()
3859 m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
3861 desc = unescapeHTML(m.group('desc'))
3865 m = re.search(r'By:.*?(\w+)</a>', webpage)
3867 uploader = clean_html(m.group(1))
3876 'description': desc,
3877 'uploader': uploader
# Extractor for ARD Mediathek / daserste.de: collects all
# mediaCollection.addMediaStream(...) entries and picks media_type 0 at
# the highest quality; the result is either an RTMP stream (url +
# play_path) or a direct mp4 download.
# NOTE(review): mangled chunk — embedded line numbers; the numid/else
# branching, the `if not streams:` guard before the fsk assert, and the
# `else:` before the HTTP branch are elided.
3882 class ARDIE(InfoExtractor):
3883 _VALID_URL = r'^(?:https?://)?(?:(?:www\.)?ardmediathek\.de|mediathek\.daserste\.de)/(?:.*/)(?P<video_id>[^/\?]+)(?:\?.*)?'
3884 _TITLE = r'<h1(?: class="boxTopHeadline")?>(?P<title>.*)</h1>'
3885 _MEDIA_STREAM = r'mediaCollection\.addMediaStream\((?P<media_type>\d+), (?P<quality>\d+), "(?P<rtmp_url>[^"]*)", "(?P<video_url>[^"]*)", "[^"]*"\)'
3887 def _real_extract(self, url):
3888 # determine video id from url
3889 m = re.match(self._VALID_URL, url)
# Prefer a numeric documentId query parameter over the path segment.
3891 numid = re.search(r'documentId=([0-9]+)', url)
3893 video_id = numid.group(1)
3895 video_id = m.group('video_id')
3897 # determine title and media streams from webpage
3898 html = self._download_webpage(url, video_id)
3899 title = re.search(self._TITLE, html).group('title')
3900 streams = [m.groupdict() for m in re.finditer(self._MEDIA_STREAM, html)]
# No streams + "fsk" marker: the video is age-restricted until 8 pm.
3902 assert '"fsk"' in html
3903 raise ExtractorError(u'This video is only available after 8:00 pm')
3905 # choose default media type and highest quality for now
3906 stream = max([s for s in streams if int(s["media_type"]) == 0],
3907 key=lambda s: int(s["quality"]))
3909 # there's two possibilities: RTMP stream or HTTP download
3910 info = {'id': video_id, 'title': title, 'ext': 'mp4'}
3911 if stream['rtmp_url']:
3912 self.to_screen(u'RTMP download detected')
3913 assert stream['video_url'].startswith('mp4:')
3914 info["url"] = stream["rtmp_url"]
3915 info["play_path"] = stream['video_url']
3917 assert stream["video_url"].endswith('.mp4')
3918 info["url"] = stream["video_url"]
# Extractor for Tumblr video posts: rebuilds the canonical post URL from
# blog name + id, then reads the escaped video_file src, first poster
# thumbnail, and the page <title>.
# NOTE(review): mangled chunk — embedded line numbers; the `if video is
# None:` guard/return and parts of the final dict are elided. Also note
# the message "No video founded" (sic) — runtime string, left as-is here.
3921 class TumblrIE(InfoExtractor):
3922 _VALID_URL = r'http://(?P<blog_name>.*?)\.tumblr\.com/((post)|(video))/(?P<id>\d*)/(.*?)'
3924 def _real_extract(self, url):
3925 m_url = re.match(self._VALID_URL, url)
3926 video_id = m_url.group('id')
3927 blog = m_url.group('blog_name')
3929 url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id)
3930 webpage = self._download_webpage(url, video_id)
# The embed markup is JS-escaped in the page, hence the \x22 quotes.
3932 re_video = r'src=\\x22(?P<video_url>http://%s\.tumblr\.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P<ext>.*?)\\x22' % (blog, video_id)
3933 video = re.search(re_video, webpage)
3935 self.to_screen("No video founded")
3937 video_url = video.group('video_url')
3938 ext = video.group('ext')
3940 re_thumb = r'posters(.*?)\[\\x22(?P<thumb>.*?)\\x22' # We pick the first poster
3941 thumb = re.search(re_thumb, webpage).group('thumb').replace('\\', '')
3943 # The only place where you can get a title, it's not complete,
3944 # but searching in other places doesn't work for all videos
3945 re_title = r'<title>(?P<title>.*?)</title>'
3946 title = unescapeHTML(re.search(re_title, webpage, re.DOTALL).group('title'))
3948 return [{'id': video_id,
# Extractor for free Bandcamp tracks: follows the freeDownloadPage link,
# parses the track's JS `items:` dict, and rebuilds the statdownload URL
# (the url embedded in the page is already expired).
# NOTE(review): mangled chunk — embedded line numbers; parts of the final
# track_info dict are elided. `id` shadows the builtin and "founded"
# (sic) is a runtime string — both left as-is here.
3955 class BandcampIE(InfoExtractor):
3956 _VALID_URL = r'http://.*?\.bandcamp\.com/track/(?P<title>.*)'
3958 def _real_extract(self, url):
3959 mobj = re.match(self._VALID_URL, url)
3960 title = mobj.group('title')
3961 webpage = self._download_webpage(url, title)
3962 # We get the link to the free download page
3963 m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage)
3964 if m_download is None:
3965 raise ExtractorError(u'No free songs founded')
3967 download_link = m_download.group(1)
3968 id = re.search(r'var TralbumData = {(.*?)id: (?P<id>\d*?)$',
3969 webpage, re.MULTILINE|re.DOTALL).group('id')
3971 download_webpage = self._download_webpage(download_link, id,
3972 'Downloading free downloads page')
3973 # We get the dictionary of the track from some javascrip code
3974 info = re.search(r'items: (.*?),$',
3975 download_webpage, re.MULTILINE).group(1)
3976 info = json.loads(info)[0]
3977 # We pick mp3-320 for now, until format selection can be easily implemented.
3978 mp3_info = info[u'downloads'][u'mp3-320']
3979 # If we try to use this url it says the link has expired
3980 initial_url = mp3_info[u'url']
3981 re_url = r'(?P<server>http://(.*?)\.bandcamp\.com)/download/track\?enc=mp3-320&fsig=(?P<fsig>.*?)&id=(?P<id>.*?)&ts=(?P<ts>.*)$'
3982 m_url = re.match(re_url, initial_url)
3983 #We build the url we will use to get the final track url
3984 # This url is build in Bandcamp in the script download_bunde_*.js
3985 request_url = '%s/statdownload/track?enc=mp3-320&fsig=%s&id=%s&ts=%s&.rand=665028774616&.vrs=1' % (m_url.group('server'), m_url.group('fsig'), id, m_url.group('ts'))
3986 final_url_webpage = self._download_webpage(request_url, id, 'Requesting download url')
3987 # If we could correctly generate the .rand field the url would be
3988 #in the "download_url" key
3989 final_url = re.search(r'"retry_url":"(.*?)"', final_url_webpage).group(1)
3991 track_info = {'id':id,
3992 'title' : info[u'title'],
3995 'thumbnail' : info[u'thumb_url'],
3996 'uploader' : info[u'artist']
# Extractor for redtube.com: mp4 URL from the <source> tag, title from
# the h1 heading; extension is fixed to mp4.
# NOTE(review): mangled chunk — embedded line numbers; the `if mobj is
# None:` guards and the start of the return dict are elided.
4001 class RedTubeIE(InfoExtractor):
4002 """Information Extractor for redtube"""
4003 _VALID_URL = r'(?:http://)?(?:www\.)?redtube\.com/(?P<id>[0-9]+)'
4005 def _real_extract(self,url):
4006 mobj = re.match(self._VALID_URL, url)
4008 raise ExtractorError(u'Invalid URL: %s' % url)
4010 video_id = mobj.group('id')
4011 video_extension = 'mp4'
4012 webpage = self._download_webpage(url, video_id)
4013 self.report_extraction(video_id)
4014 mobj = re.search(r'<source src="'+'(.+)'+'" type="video/mp4">',webpage)
4017 raise ExtractorError(u'Unable to extract media URL')
4019 video_url = mobj.group(1)
4020 mobj = re.search('<h1 class="videoTitle slidePanelMovable">(.+)</h1>',webpage)
4022 raise ExtractorError(u'Unable to extract title')
4023 video_title = mobj.group(1)
4028 'ext': video_extension,
4029 'title': video_title,
# Extractor for ina.fr: fetches the video's MRSS feed and reads the
# media:player URL and CDATA title from it.
# NOTE(review): mangled chunk — embedded line numbers; the `if mobj is
# None:` guards and the start of the return dict are elided.
4032 class InaIE(InfoExtractor):
4033 """Information Extractor for Ina.fr"""
4034 _VALID_URL = r'(?:http://)?(?:www.)?ina\.fr/video/(?P<id>I[0-9]+)/.*'
4036 def _real_extract(self,url):
4037 mobj = re.match(self._VALID_URL, url)
4039 video_id = mobj.group('id')
# Metadata comes from the player's MRSS feed, not the HTML page.
4040 mrss_url='http://player.ina.fr/notices/%s.mrss' % video_id
4041 video_extension = 'mp4'
4042 webpage = self._download_webpage(mrss_url, video_id)
4044 mobj = re.search(r'<media:player url="(?P<mp4url>http://mp4.ina.fr/[^"]+\.mp4)', webpage)
4046 raise ExtractorError(u'Unable to extract media URL')
4047 video_url = mobj.group(1)
4049 mobj = re.search(r'<title><!\[CDATA\[(?P<titre>.*?)]]></title>', webpage)
4051 raise ExtractorError(u'Unable to extract title')
4052 video_title = mobj.group(1)
4057 'ext': video_extension,
4058 'title': video_title,
# Factory for the ordered list of all extractor instances.
# NOTE(review): mangled chunk — embedded line numbers; nearly the whole
# return list is elided here (only three entries are visible).
4061 def gen_extractors():
4062 """ Return a list of an instance of every supported extractor.
4063 The order does matter; the first extractor matched is the one handling the URL.
4066 YoutubePlaylistIE(),
4091 StanfordOpenClassroomIE(),
4101 WorldStarHipHopIE(),
def get_info_extractor(ie_name):
    """Return the info extractor class whose name is ``<ie_name>IE``.

    Looks the class up in this module's global namespace; raises KeyError
    if no such extractor class is defined.
    """
    class_name = '%sIE' % ie_name
    return globals()[class_name]