2 # -*- coding: utf-8 -*-
4 from __future__ import absolute_import
15 import xml.etree.ElementTree
# NOTE(review): numbered excerpt — the left-hand numbers are the original file's
# line numbers; gaps in that numbering mean source lines are elided from this view.
# Code below is kept byte-identical; only comments are added.
26 class InfoExtractor(object):
27 """Information Extractor class.
29 Information extractors are the classes that, given a URL, extract
30 information about the video (or videos) the URL refers to. This
31 information includes the real video URL, the video title, author and
32 others. The information is stored in a dictionary which is then
33 passed to the FileDownloader. The FileDownloader processes this
34 information possibly downloading the video to the file system, among
35 other possible outcomes.
37 The dictionaries must include the following fields:
41 title: Video title, unescaped.
42 ext: Video filename extension.
44 The following fields are optional:
46 format: The video format, defaults to ext (used for --get-format)
47 thumbnail: Full URL to a video thumbnail image.
48 description: One-line video description.
49 uploader: Full name of the video uploader.
50 upload_date: Video upload date (YYYYMMDD).
51 uploader_id: Nickname or id of the video uploader.
52 location: Physical location of the video.
53 player_url: SWF Player URL (used for rtmpdump).
54 subtitles: The subtitle file contents.
55 urlhandle: [internal] The urlHandle to be used to download the file,
56 like returned by urllib.request.urlopen
58 The fields should all be Unicode strings.
60 Subclasses of this one should re-define the _real_initialize() and
61 _real_extract() methods and define a _VALID_URL regexp.
62 Probably, they should also be added to the list of extractors.
64 _real_extract() must return a *list* of information dictionaries as
67 Finally, the _WORKING attribute should be set to False for broken IEs
68 in order to warn the users and skip the tests.
# Constructor: only records the (optional) FileDownloader on the instance.
75 def __init__(self, downloader=None):
76 """Constructor. Receives an optional downloader."""
78 self.set_downloader(downloader)
# NOTE(review): presumably decorated @classmethod (decorator line 80 elided) — confirm.
81 def suitable(cls, url):
82 """Receives a URL and returns True if suitable for this IE."""
83 return re.match(cls._VALID_URL, url) is not None
# working(): getter for the _WORKING flag ('def' line 86 elided in this excerpt).
87 """Getter method for _WORKING."""
# initialize(): 'def' line and the once-only guard around _real_initialize() elided.
91 """Initializes an instance (authentication, etc)."""
93 self._real_initialize()
96 def extract(self, url):
97 """Extracts URL information and returns it in list of dicts."""
# (line 98, a call to self.initialize(), elided here)
99 return self._real_extract(url)
101 def set_downloader(self, downloader):
102 """Sets the downloader for this IE."""
103 self._downloader = downloader
# Template methods: subclasses override these two ('pass' bodies elided).
105 def _real_initialize(self):
106 """Real initialization process. Redefine in subclasses."""
109 def _real_extract(self, url):
110 """Real extraction process. Redefine in subclasses."""
# IE_NAME property body: strips the trailing 'IE' from the class name
# (the 'def'/property lines 112-114 are elided in this excerpt).
115 return type(self).__name__[:-2]
117 def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
118 """ Returns the response handle """
# NOTE(review): the 'if note is None:' branch that pairs with this elif is elided.
120 self.report_download_webpage(video_id)
121 elif note is not False:
122 self.to_screen(u'%s: %s' % (video_id, note))
# The enclosing 'try:' line (123) is elided here.
124 return compat_urllib_request.urlopen(url_or_request)
125 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
# Default error note when the caller supplied none; chains the network error.
127 errnote = u'Unable to download webpage'
128 raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2])
130 def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None):
131 """ Returns a tuple (page content as string, URL handle) """
132 urlh = self._request_webpage(url_or_request, video_id, note, errnote)
# Try to honour the charset declared in the Content-Type header;
# the fallback-to-utf-8 branch (elided) runs when the regex does not match.
133 content_type = urlh.headers.get('Content-Type', '')
134 m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
136 encoding = m.group(1)
139 webpage_bytes = urlh.read()
140 if self._downloader.params.get('dump_intermediate_pages', False):
# get_full_url() exists on Request objects; plain string URLs raise
# AttributeError and are handled by the (elided) fallback assignment.
142 url = url_or_request.get_full_url()
143 except AttributeError:
145 self.to_screen(u'Dumping request to ' + url)
# base64 keeps binary-safe output when dumping the raw page to the screen.
146 dump = base64.b64encode(webpage_bytes).decode('ascii')
147 self._downloader.to_screen(dump)
# 'replace' avoids raising on pages whose declared charset is wrong.
148 content = webpage_bytes.decode(encoding, 'replace')
149 return (content, urlh)
151 def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
152 """ Returns the data of the page as a string """
153 return self._download_webpage_handle(url_or_request, video_id, note, errnote)[0]
155 def to_screen(self, msg):
156 """Print msg to screen, prefixing it with '[ie_name]'"""
157 self._downloader.to_screen(u'[%s] %s' % (self.IE_NAME, msg))
158 # Thin reporting helpers so subclasses log with a uniform prefix.
159 def report_extraction(self, id_or_name):
160 """Report information extraction."""
161 self.to_screen(u'%s: Extracting information' % id_or_name)
163 def report_download_webpage(self, video_id):
164 """Report webpage download."""
165 self.to_screen(u'%s: Downloading webpage' % video_id)
167 def report_age_confirmation(self):
168 """Report attempt to confirm age."""
169 self.to_screen(u'Confirming age')
171 #Methods for following #608
172 #They set the correct value of the '_type' key
173 def video_result(self, video_info):
174 """Returns a video"""
175 video_info['_type'] = 'video'
# (the 'return video_info' line is elided here)
177 def url_result(self, url, ie=None):
178 """Returns a url that points to a page that should be processed"""
179 #TODO: ie should be the class used for getting the info
180 video_info = {'_type': 'url',
# (remaining dict entries and the return statement are elided)
184 def playlist_result(self, entries, playlist_id=None, playlist_title=None):
185 """Returns a playlist"""
186 video_info = {'_type': 'playlist',
# id/title are only set when provided (the guarding 'if' lines are elided).
189 video_info['id'] = playlist_id
191 video_info['title'] = playlist_title
194 def _search_regex(self, pattern, string, name, default=None, fatal=True, flags=0):
# (docstring opening triple-quote line 195 elided)
196 Perform a regex search on the given string, using a single or a list of
197 patterns returning the first matching group.
198 In case of failure return a default value or raise a WARNING or a
199 ExtractorError, depending on fatal, specifying the field name.
# Accept either one pattern or an iterable of patterns; the loop header
# for the list case (line 204) is elided.
201 if isinstance(pattern, (str, compat_str, compiled_regex_type)):
202 mobj = re.search(pattern, string, flags)
205 mobj = re.search(p, string, flags)
# Colorize the field name on capable terminals (ANSI blue); plain on Windows.
208 if sys.stderr.isatty() and os.name != 'nt':
209 _name = u'\033[0;34m%s\033[0m' % name
214 # return the first matching group
215 return next(g for g in mobj.groups() if g is not None)
216 elif default is not None:
# (the 'return default' and the 'elif fatal:' lines are elided)
219 raise ExtractorError(u'Unable to extract %s' % _name)
221 self._downloader.report_warning(u'unable to extract %s; '
222 u'please report this issue on GitHub.' % _name)
225 def _html_search_regex(self, pattern, string, name, default=None, fatal=True, flags=0):
# (docstring opening line 226 elided)
227 Like _search_regex, but strips HTML tags and unescapes entities.
229 res = self._search_regex(pattern, string, name, default, fatal, flags)
# (the 'if res:' guard line 230 is elided; None falls through to the elided else)
231 return clean_html(res).strip()
# NOTE(review): numbered excerpt — gaps in the left-hand line numbers mean
# source lines are elided from this view. Code kept byte-identical.
235 class SearchInfoExtractor(InfoExtractor):
# (docstring opening line 236 elided)
237 Base class for paged search queries extractors.
238 They accept urls in the format _SEARCH_KEY(|all|[0-9]):{query}
239 Instances should define _SEARCH_KEY and _MAX_RESULTS.
# NOTE(review): presumably decorated @classmethod (decorator line 242 elided) — confirm.
243 def _make_valid_url(cls):
# Matches '<key>:', '<key>N:' (N >= 1) or '<key>all:' followed by the query text.
244 return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY
# NOTE(review): presumably decorated @classmethod (decorator line 246 elided) — confirm.
247 def suitable(cls, url):
248 return re.match(cls._make_valid_url(), url) is not None
250 def _real_extract(self, query):
251 mobj = re.match(self._make_valid_url(), query)
# (the 'if mobj is None:' guard line 252 is elided)
253 raise ExtractorError(u'Invalid search query "%s"' % query)
255 prefix = mobj.group('prefix')
256 query = mobj.group('query')
# Empty prefix -> single best result; 'all' -> cap at _MAX_RESULTS;
# otherwise (elided 'else' branch) parse the numeric prefix into n.
258 return self._get_n_results(query, 1)
259 elif prefix == 'all':
260 return self._get_n_results(query, self._MAX_RESULTS)
# (int(prefix) conversion and 'if n <= 0:' guard lines are elided)
264 raise ExtractorError(u'invalid download number %s for query "%s"' % (n, query))
265 elif n > self._MAX_RESULTS:
266 self._downloader.report_warning(u'%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
267 n = self._MAX_RESULTS
268 return self._get_n_results(query, n)
270 def _get_n_results(self, query, n):
271 """Get a specified number of results for a query"""
# Abstract hook; subclasses implement the actual paged fetching.
# (typo 'sublclasses' is in the runtime string — left untouched.)
272 raise NotImplementedError("This method must be implemented by sublclasses")
# NOTE(review): numbered excerpt — gaps in the left-hand line numbers mean
# source lines are elided from this view (try/except scaffolding, dict
# literals, guards). Code kept byte-identical; only comments are added.
275 class YoutubeIE(InfoExtractor):
276 """Information extractor for youtube.com."""
# _VALID_URL verbose-regex body; its opening assignment line
# (_VALID_URL = r"""^( ...) is elided from this excerpt.
280 (?:https?://)? # http(s):// (optional)
281 (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
282 tube\.majestyc\.net/) # the various hostnames, with wildcard subdomains
283 (?:.*?\#/)? # handle anchor (#/) redirect urls
284 (?: # the various things that can precede the ID:
285 (?:(?:v|embed|e)/) # v/ or embed/ or e/
286 |(?: # or the v= param in all its forms
287 (?:watch(?:_popup)?(?:\.php)?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
288 (?:\?|\#!?) # the params delimiter ? or # or #!
289 (?:.*?&)? # any other preceding param (like /?s=tuff&v=xxxx)
292 )? # optional -> youtube.com/xxxx is OK
293 )? # all until now is optional -> you can pass the naked ID
294 ([0-9A-Za-z_-]+) # here is it! the YouTube video ID
295 (?(1).+)? # if we found the ID, everything can follow
# Endpoint constants used by _real_initialize() / _real_extract().
297 _LANG_URL = r'https://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
298 _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
299 _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
300 _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
301 _NETRC_MACHINE = 'youtube'
302 # Listed in order of quality
303 _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
304 _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
305 _video_extensions = {
# (most itag -> extension entries elided from this excerpt)
311 '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
# itag -> "WxH" display-dimension map (entries elided).
317 _video_dimensions = {
# NOTE(review): presumably decorated @classmethod (decorator line elided) — confirm.
336 def suitable(cls, url):
337 """Receives a URL and returns True if suitable for this IE."""
# Playlist URLs would also match _VALID_URL; defer them to YoutubePlaylistIE.
338 if YoutubePlaylistIE.suitable(url): return False
339 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
# --- reporting helpers (uniform '[youtube] ...' log lines) ---
341 def report_lang(self):
342 """Report attempt to set language."""
343 self.to_screen(u'Setting language')
345 def report_login(self):
346 """Report attempt to log in."""
347 self.to_screen(u'Logging in')
349 def report_video_webpage_download(self, video_id):
350 """Report attempt to download video webpage."""
351 self.to_screen(u'%s: Downloading video webpage' % video_id)
353 def report_video_info_webpage_download(self, video_id):
354 """Report attempt to download video info webpage."""
355 self.to_screen(u'%s: Downloading video info webpage' % video_id)
357 def report_video_subtitles_download(self, video_id):
358 """Report attempt to download video info webpage."""
359 self.to_screen(u'%s: Checking available subtitles' % video_id)
361 def report_video_subtitles_request(self, video_id, sub_lang, format):
362 """Report attempt to download video info webpage."""
363 self.to_screen(u'%s: Downloading video subtitles for %s.%s' % (video_id, sub_lang, format))
365 def report_video_subtitles_available(self, video_id, sub_lang_list):
366 """Report available subtitles."""
367 sub_lang = ",".join(list(sub_lang_list.keys()))
368 self.to_screen(u'%s: Available subtitles for video: %s' % (video_id, sub_lang))
370 def report_information_extraction(self, video_id):
371 """Report attempt to extract video information."""
372 self.to_screen(u'%s: Extracting video information' % video_id)
374 def report_unavailable_format(self, video_id, format):
375 """Report extracted video URL."""
376 self.to_screen(u'%s: Format %s not available' % (video_id, format))
378 def report_rtmp_download(self):
379 """Indicate the download will use the RTMP protocol."""
380 self.to_screen(u'RTMP download detected')
382 def _get_available_subtitles(self, video_id):
# Returns {lang_code: lang_name} on success, or an (error_message, None)
# tuple on failure — callers distinguish the two via isinstance(..., tuple).
383 self.report_video_subtitles_download(video_id)
384 request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
# (the enclosing 'try:' line 385 is elided)
386 sub_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
387 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
388 return (u'unable to download video subtitles: %s' % compat_str(err), None)
389 sub_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
390 sub_lang_list = dict((l[1], l[0]) for l in sub_lang_list)
391 if not sub_lang_list:
392 return (u'video doesn\'t have subtitles', None)
# (the success 'return sub_lang_list' line is elided)
395 def _list_available_subtitles(self, video_id):
396 sub_lang_list = self._get_available_subtitles(video_id)
397 self.report_video_subtitles_available(video_id, sub_lang_list)
399 def _request_subtitle(self, sub_lang, sub_name, video_id, format):
# Returns a (error_message, sub_lang, sub) tuple; docstring lines elided.
402 (error_message, sub_lang, sub)
404 self.report_video_subtitles_request(video_id, sub_lang, format)
405 params = compat_urllib_parse.urlencode({
# (query-parameter entries lang/name/v/fmt are elided)
411 url = 'http://www.youtube.com/api/timedtext?' + params
# (the enclosing 'try:' line 412 is elided)
413 sub = compat_urllib_request.urlopen(url).read().decode('utf-8')
414 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
415 return (u'unable to download video subtitles: %s' % compat_str(err), None, None)
# (the 'if not sub:' guard line 416 is elided)
417 return (u'Did not fetch video subtitles', None, None)
418 return (None, sub_lang, sub)
420 def _request_automatic_caption(self, video_id, webpage):
421 """We need the webpage for getting the captions url, pass it as an
422 argument to speed up the process."""
423 sub_lang = self._downloader.params.get('subtitleslang')
424 sub_format = self._downloader.params.get('subtitlesformat')
425 self.to_screen(u'%s: Looking for automatic captions' % video_id)
# The player configuration JSON is embedded inline in the watch page.
426 mobj = re.search(r';ytplayer.config = ({.*?});', webpage)
427 err_msg = u'Couldn\'t find automatic captions for "%s"' % sub_lang
# (the 'if mobj is None:' guard line 428 is elided)
429 return [(err_msg, None, None)]
430 player_config = json.loads(mobj.group(1))
# (an enclosing 'try:' line 431 is elided; KeyError falls to line 446)
432 args = player_config[u'args']
433 caption_url = args[u'ttsurl']
434 timestamp = args[u'timestamp']
435 params = compat_urllib_parse.urlencode({
# (caption query-parameter entries elided)
442 subtitles_url = caption_url + '&' + params
443 sub = self._download_webpage(subtitles_url, video_id, u'Downloading automatic captions')
444 return [(None, sub_lang, sub)]
# (the matching 'except KeyError:' line 445 is elided)
446 return [(err_msg, None, None)]
448 def _extract_subtitle(self, video_id):
# Docstring body (opening triple-quote elided):
450 Return a list with a tuple:
451 [(error_message, sub_lang, sub)]
453 sub_lang_list = self._get_available_subtitles(video_id)
454 sub_format = self._downloader.params.get('subtitlesformat')
455 if isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles
456 return [(sub_lang_list[0], None, None)]
# Language preference: explicit --sub-lang > 'en' > first available.
457 if self._downloader.params.get('subtitleslang', False):
458 sub_lang = self._downloader.params.get('subtitleslang')
459 elif 'en' in sub_lang_list:
# (the "sub_lang = 'en'" line and the trailing 'else:' are elided)
462 sub_lang = list(sub_lang_list.keys())[0]
463 if not sub_lang in sub_lang_list:
464 return [(u'no closed captions found in the specified language "%s"' % sub_lang, None, None)]
466 subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
# (the 'return [subtitle]' line is elided)
469 def _extract_all_subtitles(self, video_id):
470 sub_lang_list = self._get_available_subtitles(video_id)
471 sub_format = self._downloader.params.get('subtitlesformat')
472 if isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles
473 return [(sub_lang_list[0], None, None)]
# (the 'subtitles = []' initialization line 474 is elided)
475 for sub_lang in sub_lang_list:
476 subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
477 subtitles.append(subtitle)
# (the 'return subtitles' line is elided)
480 def _print_formats(self, formats):
481 print('Available formats:')
# (the 'for x in formats:' loop header line 482 is elided)
483 print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))
485 def _real_initialize(self):
# Sets language, then attempts login (username/password or .netrc) and
# age confirmation. Large parts of the scaffolding are elided below.
486 if self._downloader is None:
# (the early 'return' line is elided)
491 downloader_params = self._downloader.params
493 # Attempt to use provided username and password or .netrc data
494 if downloader_params.get('username', None) is not None:
495 username = downloader_params['username']
496 password = downloader_params['password']
497 elif downloader_params.get('usenetrc', False):
# (the enclosing 'try:' line 498 is elided)
499 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
# (username/password unpacking and the 'else:' raising branch are elided)
504 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
505 except (IOError, netrc.NetrcParseError) as err:
506 self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
# (the 'return' line after the warning is elided)
510 request = compat_urllib_request.Request(self._LANG_URL)
# (the 'try:' line and self.report_lang() call are elided)
513 compat_urllib_request.urlopen(request).read()
514 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
515 self._downloader.report_warning(u'unable to set language: %s' % compat_str(err))
# (the 'return' line 516 is elided)
518 # No authentication to be performed
# (the 'if username is None: return' guard lines are elided)
522 request = compat_urllib_request.Request(self._LOGIN_URL)
# (the 'try:' line 523 is elided)
524 login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
525 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
526 self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err))
# (the 'return' line is elided)
# GALX and dsh are anti-forgery tokens scraped from the login form.
531 match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page)
# (the 'if match:' guard line 532 is elided)
533 galx = match.group(1)
535 match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page)
# Login POST form; many field entries are elided from this excerpt.
541 u'continue': u'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
545 u'PersistentCookie': u'yes',
547 u'bgresponse': u'js_disabled',
548 u'checkConnection': u'',
549 u'checkedDomains': u'youtube',
555 u'signIn': u'Sign in',
557 u'service': u'youtube',
561 # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
# (rest of that comment, line 562, is elided)
563 login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
564 login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
565 request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
# (the 'try:' line and self.report_login() call are elided)
568 login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
# A re-rendered login form in the response means authentication failed.
569 if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
570 self._downloader.report_warning(u'unable to log in: bad username or password')
# (the 'return' line 571 is elided)
572 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
573 self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
# Age-confirmation POST (form-dict opening lines elided).
579 'action_confirm': 'Confirm',
581 request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
# (the 'try:' line 582 is elided)
583 self.report_age_confirmation()
584 age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
585 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
586 raise ExtractorError(u'Unable to confirm age: %s' % compat_str(err))
588 def _extract_id(self, url):
# Pull the 11-char-style video ID out of any supported YouTube URL shape.
589 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
# (the 'if mobj is None:' guard line 590 is elided)
591 raise ExtractorError(u'Invalid URL: %s' % url)
592 video_id = mobj.group(2)
# (the 'return video_id' line is elided)
595 def _real_extract(self, url):
596 # Extract original video URL from URL with redirection, like age verification, using next_url parameter
597 mobj = re.search(self._NEXT_URL_RE, url)
# (the 'if mobj:' guard line 598 is elided)
599 url = 'https://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
600 video_id = self._extract_id(url)
# Get video webpage
603 self.report_video_webpage_download(video_id)
604 url = 'https://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
605 request = compat_urllib_request.Request(url)
# (the 'try:' line 606 is elided)
607 video_webpage_bytes = compat_urllib_request.urlopen(request).read()
608 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
609 raise ExtractorError(u'Unable to download video webpage: %s' % compat_str(err))
611 video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')
613 # Attempt to extract SWF player URL
614 mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
# Un-escape the JS-escaped URL; the 'else: player_url = None' branch is elided.
616 player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
# Get video info: try several 'el' variants until one yields a token.
621 self.report_video_info_webpage_download(video_id)
622 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
623 video_info_url = ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
624 % (video_id, el_type))
625 video_info_webpage = self._download_webpage(video_info_url, video_id,
# (the 'note=False,' argument line 626 is elided)
627 errnote='unable to download video info webpage')
628 video_info = compat_parse_qs(video_info_webpage)
629 if 'token' in video_info:
# (the 'break' line 630 is elided)
631 if 'token' not in video_info:
632 if 'reason' in video_info:
633 raise ExtractorError(u'YouTube said: %s' % video_info['reason'][0])
# (the 'else:' line 634 is elided)
635 raise ExtractorError(u'"token" parameter not in video info for unknown reason')
637 # Check for "rental" videos
638 if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
639 raise ExtractorError(u'"rental" videos not supported')
641 # Start extracting information
642 self.report_information_extraction(video_id)
# uploader
645 if 'author' not in video_info:
646 raise ExtractorError(u'Unable to extract uploader name')
647 video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])
# uploader_id: best-effort scrape from the watch page; warn if absent.
650 video_uploader_id = None
651 mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
# (the 'if mobj is not None:' guard line 652 is elided)
653 video_uploader_id = mobj.group(1)
# (the 'else:' line 654 is elided)
655 self._downloader.report_warning(u'unable to extract uploader nickname')
# title
658 if 'title' not in video_info:
659 raise ExtractorError(u'Unable to extract video title')
660 video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])
# thumbnail image
663 if 'thumbnail_url' not in video_info:
664 self._downloader.report_warning(u'unable to extract video thumbnail')
# (the "video_thumbnail = ''" fallback line 665 is elided)
666 else: # don't panic if we can't find it
667 video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])
# upload date (normalized to YYYYMMDD by unified_strdate)
671 mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
# (the 'upload_date = None' default and 'if mobj is not None:' guard are elided)
673 upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
674 upload_date = unified_strdate(upload_date)
# description
677 video_description = get_element_by_id("eow-description", video_webpage)
678 if video_description:
679 video_description = clean_html(video_description)
# Fallback: the <meta name="description"> tag ('else:' line 680 elided).
681 fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
# (the 'if fd_mobj:' guard line 682 is elided)
683 video_description = unescapeHTML(fd_mobj.group(1))
# (the 'else:' line 684 is elided)
685 video_description = u''
# subtitles
688 video_subtitles = None
690 if self._downloader.params.get('writesubtitles', False):
691 video_subtitles = self._extract_subtitle(video_id)
# (the 'if video_subtitles:' guard line 692 is elided)
693 (sub_error, sub_lang, sub) = video_subtitles[0]
# (the 'if sub_error:' guard line 694 is elided)
695 # We try with the automatic captions
696 video_subtitles = self._request_automatic_caption(video_id, video_webpage)
697 (sub_error_auto, sub_lang, sub) = video_subtitles[0]
# (the 'if sub is not None: pass / else:' scaffolding lines are elided)
701 # We report the original error
702 self._downloader.report_error(sub_error)
704 if self._downloader.params.get('allsubtitles', False):
705 video_subtitles = self._extract_all_subtitles(video_id)
706 for video_subtitle in video_subtitles:
707 (sub_error, sub_lang, sub) = video_subtitle
# (the 'if sub_error:' guard line 708 is elided)
709 self._downloader.report_error(sub_error)
711 if self._downloader.params.get('listsubtitles', False):
712 sub_lang_list = self._list_available_subtitles(video_id)
# (the 'return' line 713 is elided)
715 if 'length_seconds' not in video_info:
716 self._downloader.report_warning(u'unable to extract video duration')
# (the 'video_duration = ""' fallback and 'else:' lines are elided)
719 video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])
# token
722 video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])
724 # Decide which formats to download
725 req_format = self._downloader.params.get('format', None)
727 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
728 self.report_rtmp_download()
729 video_url_list = [(None, video_info['conn'][0])]
730 elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
# (the 'url_map = {}' initialization line 731 is elided)
732 for url_data_str in video_info['url_encoded_fmt_stream_map'][0].split(','):
733 url_data = compat_parse_qs(url_data_str)
734 if 'itag' in url_data and 'url' in url_data:
735 url = url_data['url'][0]
736 if 'sig' in url_data:
737 url += '&signature=' + url_data['sig'][0]
738 if 'ratebypass' not in url:
739 url += '&ratebypass=yes'
740 url_map[url_data['itag'][0]] = url
742 format_limit = self._downloader.params.get('format_limit', None)
743 available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
744 if format_limit is not None and format_limit in available_formats:
745 format_list = available_formats[available_formats.index(format_limit):]
# (the 'else:' line 746 is elided)
747 format_list = available_formats
748 existing_formats = [x for x in format_list if x in url_map]
749 if len(existing_formats) == 0:
750 raise ExtractorError(u'no known formats available for video')
751 if self._downloader.params.get('listformats', None):
752 self._print_formats(existing_formats)
# (the 'return' line 753 is elided)
754 if req_format is None or req_format == 'best':
755 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
756 elif req_format == 'worst':
757 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
758 elif req_format in ('-1', 'all'):
759 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
# (the trailing 'else:' line 760 is elided)
761 # Specific formats. We pick the first in a slash-delimeted sequence.
762 # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
763 req_formats = req_format.split('/')
764 video_url_list = None
765 for rf in req_formats:
# (the 'if rf in url_map:' guard line 766 is elided)
767 video_url_list = [(rf, url_map[rf])]
# (the 'break' line 768 is elided)
769 if video_url_list is None:
770 raise ExtractorError(u'requested format not available')
# (the 'else:' line 771 that pairs with the elif chain is elided)
772 raise ExtractorError(u'no conn or url_encoded_fmt_stream_map information found in video info')
# Build one result dict per selected format ('results = []' line elided).
775 for format_param, video_real_url in video_url_list:
# Extension
777 video_extension = self._video_extensions.get(format_param, 'flv')
779 video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
780 self._video_dimensions.get(format_param, '???'))
# (the 'results.append({' opening and 'id' entry lines are elided)
784 'url': video_real_url,
785 'uploader': video_uploader,
786 'uploader_id': video_uploader_id,
787 'upload_date': upload_date,
788 'title': video_title,
789 'ext': video_extension,
790 'format': video_format,
791 'thumbnail': video_thumbnail,
792 'description': video_description,
793 'player_url': player_url,
794 'subtitles': video_subtitles,
795 'duration': video_duration
# (the closing '})' and 'return results' lines are elided)
# NOTE(review): numbered excerpt — gaps in the left-hand line numbers mean
# source lines are elided from this view. Code kept byte-identical.
800 class MetacafeIE(InfoExtractor):
801 """Information Extractor for metacafe.com."""
803 _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
804 _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
805 _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
806 IE_NAME = u'metacafe'
808 def report_disclaimer(self):
809 """Report disclaimer retrieval."""
810 self.to_screen(u'Retrieving disclaimer')
812 def _real_initialize(self):
# Fetch the disclaimer page, then POST the family-filter/age form so that
# subsequent video pages are served unfiltered.
813 # Retrieve disclaimer
814 request = compat_urllib_request.Request(self._DISCLAIMER)
# (the 'try:' line 815 is elided)
816 self.report_disclaimer()
817 disclaimer = compat_urllib_request.urlopen(request).read()
818 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
819 raise ExtractorError(u'Unable to retrieve disclaimer: %s' % compat_str(err))
# Age-confirmation form; its opening dict lines are elided.
824 'submit': "Continue - I'm over 18",
826 request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
# (the 'try:' line 827 is elided)
828 self.report_age_confirmation()
829 disclaimer = compat_urllib_request.urlopen(request).read()
830 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
831 raise ExtractorError(u'Unable to confirm age: %s' % compat_str(err))
833 def _real_extract(self, url):
834 # Extract id and simplified title from URL
835 mobj = re.match(self._VALID_URL, url)
# (the 'if mobj is None:' guard line 836 is elided)
837 raise ExtractorError(u'Invalid URL: %s' % url)
839 video_id = mobj.group(1)
841 # Check if video comes from YouTube
842 mobj2 = re.match(r'^yt-(.*)$', video_id)
843 if mobj2 is not None:
# Delegate 'yt-<id>' entries to the YouTube extractor.
844 return [self.url_result('http://www.youtube.com/watch?v=%s' % mobj2.group(1), 'Youtube')]
846 # Retrieve video webpage to extract further information
847 webpage = self._download_webpage('http://www.metacafe.com/watch/%s/' % video_id, video_id)
849 # Extract URL, uploader and title from webpage
850 self.report_extraction(video_id)
851 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
# Primary path: direct mediaURL param; fallback (below) parses flashvars.
# (the 'if mobj is not None:' guard line 852 is elided)
853 mediaURL = compat_urllib_parse.unquote(mobj.group(1))
854 video_extension = mediaURL[-3:]
856 # Extract gdaKey if available
857 mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
# (the 'if mobj is None: video_url = mediaURL / else:' lines are elided)
861 gdaKey = mobj.group(1)
862 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
# Fallback path: dig the media URL + key out of the flashvars blob.
864 mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
# (the 'if mobj is None:' guard line 865 is elided)
866 raise ExtractorError(u'Unable to extract media URL')
867 vardict = compat_parse_qs(mobj.group(1))
868 if 'mediaData' not in vardict:
869 raise ExtractorError(u'Unable to extract media URL')
870 mobj = re.search(r'"mediaURL":"(?P<mediaURL>http.*?)",(.*?)"key":"(?P<key>.*?)"', vardict['mediaData'][0])
# (the 'if mobj is None:' guard line 871 is elided)
872 raise ExtractorError(u'Unable to extract media URL')
873 mediaURL = mobj.group('mediaURL').replace('\\/', '/')
874 video_extension = mediaURL[-3:]
875 video_url = '%s?__gda__=%s' % (mediaURL, mobj.group('key'))
877 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
# (the 'if mobj is None:' guard line 878 is elided)
879 raise ExtractorError(u'Unable to extract title')
# NOTE(review): .decode() on a str here is Python-2-only behavior — confirm
# against the project's compat layer before relying on this under Python 3.
880 video_title = mobj.group(1).decode('utf-8')
882 mobj = re.search(r'submitter=(.*?);', webpage)
# (the 'if mobj is None:' guard line 883 is elided)
884 raise ExtractorError(u'Unable to extract uploader nickname')
885 video_uploader = mobj.group(1)
# Result list ('return [{' opening lines 887 elided).
888 'id': video_id.decode('utf-8'),
889 'url': video_url.decode('utf-8'),
890 'uploader': video_uploader.decode('utf-8'),
# (the "'upload_date': None," entry line 891 is elided)
892 'title': video_title,
893 'ext': video_extension.decode('utf-8'),
# (the closing '}]' line is elided)
# NOTE(review): numbered excerpt — gaps in the left-hand line numbers mean
# source lines are elided from this view. Code kept byte-identical.
896 class DailymotionIE(InfoExtractor):
897 """Information Extractor for Dailymotion"""
899 _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
900 IE_NAME = u'dailymotion'
902 def _real_extract(self, url):
903 # Extract id and simplified title from URL
904 mobj = re.match(self._VALID_URL, url)
# (the 'if mobj is None:' guard line 905 is elided)
906 raise ExtractorError(u'Invalid URL: %s' % url)
# The path segment is '<id>_<slug>'; keep only the id, drop any query part.
908 video_id = mobj.group(1).split('_')[0].split('?')[0]
910 video_extension = 'mp4'
912 # Retrieve video webpage to extract further information
913 request = compat_urllib_request.Request(url)
# Cookie disables Dailymotion's family filter for this request.
914 request.add_header('Cookie', 'family_filter=off')
915 webpage = self._download_webpage(request, video_id)
917 # Extract URL, uploader and title from webpage
918 self.report_extraction(video_id)
919 mobj = re.search(r'\s*var flashvars = (.*)', webpage)
# (the 'if mobj is None:' guard line 920 is elided)
921 raise ExtractorError(u'Unable to extract media URL')
922 flashvars = compat_urllib_parse.unquote(mobj.group(1))
# Pick the best available quality, highest first
# (the 'max_quality' loop scaffolding lines are partially elided).
924 for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
# (the 'if key in flashvars: max_quality = key' lines are elided)
927 self.to_screen(u'Using %s' % key)
# (the 'break' and trailing 'else:' lines are elided)
930 raise ExtractorError(u'Unable to extract video URL')
932 mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
# (the 'if mobj is None:' guard line 933 is elided)
934 raise ExtractorError(u'Unable to extract video URL')
# Un-escape the JSON-escaped slashes in the media URL.
936 video_url = compat_urllib_parse.unquote(mobj.group(1)).replace('\\/', '/')
938 # TODO: support choosing qualities
940 mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
# (the 'if mobj is None:' guard line 941 is elided)
942 raise ExtractorError(u'Unable to extract title')
943 video_title = unescapeHTML(mobj.group('title'))
945 video_uploader = None
946 video_uploader = self._search_regex([r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>',
947 # Looking for official user
948 r'<(?:span|a) .*?rel="author".*?>([^<]+?)</'],
949 webpage, 'video uploader')
951 video_upload_date = None
952 mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
# DD-MM-YYYY on the page -> YYYYMMDD ('if mobj is not None:' guard elided).
954 video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)
# Result list ('return [{' opening and id/url entries elided).
959 'uploader': video_uploader,
960 'upload_date': video_upload_date,
961 'title': video_title,
962 'ext': video_extension,
# (the closing '}]' line is elided)
# NOTE(review): numbered excerpt — gaps in the left-hand line numbers mean
# source lines are elided from this view. Code kept byte-identical.
966 class PhotobucketIE(InfoExtractor):
967 """Information extractor for photobucket.com."""
969 # TODO: the original _VALID_URL was:
970 # r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
971 # Check if it's necessary to keep the old extracion process
972 _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*(([\?\&]current=)|_)(?P<id>.*)\.(?P<ext>(flv)|(mp4))'
973 IE_NAME = u'photobucket'
975 def _real_extract(self, url):
976 # Extract id from URL
977 mobj = re.match(self._VALID_URL, url)
# (the 'if mobj is None:' guard line 978 is elided)
979 raise ExtractorError(u'Invalid URL: %s' % url)
981 video_id = mobj.group('id')
983 video_extension = mobj.group('ext')
985 # Retrieve video webpage to extract further information
986 webpage = self._download_webpage(url, video_id)
988 # Extract URL, uploader, and title from webpage
989 self.report_extraction(video_id)
990 # We try first by looking the javascript code:
991 mobj = re.search(r'Pb\.Data\.Shared\.put\(Pb\.Data\.Shared\.MEDIA, (?P<json>.*?)\);', webpage)
# (the 'if mobj is not None:' guard line 992 is elided)
993 info = json.loads(mobj.group('json'))
# Result dict built from the page's embedded JSON
# (the 'return [{' opening and 'id' entry lines are elided).
996 'url': info[u'downloadUrl'],
997 'uploader': info[u'username'],
998 'upload_date': datetime.date.fromtimestamp(info[u'creationDate']).strftime('%Y%m%d'),
999 'title': info[u'title'],
1000 'ext': video_extension,
1001 'thumbnail': info[u'thumbUrl'],
# (the closing '}]' line is elided)
1004 # We try looking in other parts of the webpage
1005 video_url = self._search_regex(r'<link rel="video_src" href=".*\?file=([^"]+)" />',
1006 webpage, u'video URL')
1008 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
# (the 'if mobj is None:' guard line 1009 is elided)
1010 raise ExtractorError(u'Unable to extract title')
# NOTE(review): .decode() on a str is Python-2-only — confirm against the
# project's compat layer before relying on this under Python 3.
1011 video_title = mobj.group(1).decode('utf-8')
1012 video_uploader = mobj.group(2).decode('utf-8')
# Fallback result dict ('return [{' opening line 1014 elided).
1015 'id': video_id.decode('utf-8'),
1016 'url': video_url.decode('utf-8'),
1017 'uploader': video_uploader,
1018 'upload_date': None,
1019 'title': video_title,
1020 'ext': video_extension.decode('utf-8'),
# (the closing '}]' line is elided)
class YahooIE(InfoExtractor):
    """Information extractor for screen.yahoo.com."""
    _VALID_URL = r'http://screen\.yahoo\.com/.*?-(?P<id>\d*?)\.html'

    # NOTE(review): excerpt appears elided — "if ...:" guards and the final
    # "return [{ ... }]" wrapper look missing; comments cover visible code only.
    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): an "if mobj is None:" guard presumably preceded this raise
        raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)
        # The page may expose an alternative content id via YUI Media
        m_id = re.search(r'YUI\.namespace\("Media"\)\.CONTENT_ID = "(?P<new_id>.+?)";', webpage)

        # Branch 1 (presumably "if m_id is None:"): query the cosmos REST API
        # with the id taken from the URL itself.
        # TODO: Check which url parameters are required
        info_url = 'http://cosmos.bcst.yahoo.com/rest/v2/pops;lmsoverride=1;outputformat=mrss;cb=974419660;id=%s;rd=news.yahoo.com;datacontext=mdb;lg=KCa2IihxG3qE60vQ7HtyUy' % video_id
        webpage = self._download_webpage(info_url, video_id, u'Downloading info webpage')
        # NOTE(review): this raw-string regex literal is not closed within the
        # visible lines — the closing quotes appear elided.
        info_re = r'''<title><!\[CDATA\[(?P<title>.*?)\]\]></title>.*
                    <description><!\[CDATA\[(?P<description>.*?)\]\]></description>.*
                    <media:pubStart><!\[CDATA\[(?P<date>.*?)\ .*\]\]></media:pubStart>.*
                    <media:content\ medium="image"\ url="(?P<thumb>.*?)"\ name="LARGETHUMB"
        self.report_extraction(video_id)
        m_info = re.search(info_re, webpage, re.VERBOSE|re.DOTALL)
        # NOTE(review): an "if m_info is None:" guard presumably preceded this raise
        raise ExtractorError(u'Unable to extract video info')
        video_title = m_info.group('title')
        video_description = m_info.group('description')
        video_thumb = m_info.group('thumb')
        video_date = m_info.group('date')
        # Feed date is MM/DD/YYYY; normalize to the YYYYMMDD convention
        video_date = datetime.datetime.strptime(video_date, '%m/%d/%Y').strftime('%Y%m%d')

        # TODO: Find a way to get mp4 videos
        rest_url = 'http://cosmos.bcst.yahoo.com/rest/v2/pops;element=stream;outputformat=mrss;id=%s;lmsoverride=1;bw=375;dynamicstream=1;cb=83521105;tech=flv,mp4;rd=news.yahoo.com;datacontext=mdb;lg=KCa2IihxG3qE60vQ7HtyUy' % video_id
        webpage = self._download_webpage(rest_url, video_id, u'Downloading video url webpage')
        m_rest = re.search(r'<media:content url="(?P<url>.*?)" path="(?P<path>.*?)"', webpage)
        video_url = m_rest.group('url')
        video_path = m_rest.group('path')
        # NOTE(review): an "if m_rest is None:" guard presumably preceded this raise
        raise ExtractorError(u'Unable to extract video url')

        else: # We have to use a different method if another id is defined
            long_id = m_id.group('new_id')
            # YQL query against yahoo.media.video.streams for the long id
            info_url = 'http://video.query.yahoo.com/v1/public/yql?q=SELECT%20*%20FROM%20yahoo.media.video.streams%20WHERE%20id%3D%22' + long_id + '%22%20AND%20format%3D%22mp4%2Cflv%22%20AND%20protocol%3D%22rtmp%2Chttp%22%20AND%20plrs%3D%2286Gj0vCaSzV_Iuf6hNylf2%22%20AND%20acctid%3D%22389%22%20AND%20plidl%3D%22%22%20AND%20pspid%3D%22792700001%22%20AND%20offnetwork%3D%22false%22%20AND%20site%3D%22ivy%22%20AND%20lang%3D%22en-US%22%20AND%20region%3D%22US%22%20AND%20override%3D%22none%22%3B&env=prod&format=json&callback=YUI.Env.JSONP.yui_3_8_1_1_1368368376830_335'
            webpage = self._download_webpage(info_url, video_id, u'Downloading info json')
            # Strip the JSONP callback wrapper before parsing
            json_str = re.search(r'YUI.Env.JSONP.yui.*?\((.*?)\);', webpage).group(1)
            info = json.loads(json_str)
            res = info[u'query'][u'results'][u'mediaObj'][0]
            stream = res[u'streams'][0]
            video_path = stream[u'path']
            video_url = stream[u'host']
            # NOTE(review): "meta = res[u'meta']" (or similar) appears elided here
            video_title = meta[u'title']
            video_description = meta[u'description']
            video_thumb = meta[u'thumbnail']
            video_date = None # I can't find it

        # NOTE(review): the "return [{ ... }]" wrapper around these entries appears elided
        'play_path': video_path,
        'title':video_title,
        'description': video_description,
        'thumbnail': video_thumb,
        'upload_date': video_date,
class VimeoIE(InfoExtractor):
    """Information extractor for vimeo.com."""

    # _VALID_URL matches Vimeo URLs
    _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|player)\.)?vimeo(?P<pro>pro)?\.com/(?:(?:(?:groups|album)/[^/]+)|(?:.*?)/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)'

    # NOTE(review): excerpt appears elided — several guard/try lines and the
    # final "return [{" wrapper look missing; comments cover visible code only.
    def _real_extract(self, url, new_video=True):
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): an "if mobj is None:" guard presumably preceded this raise
        raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')
        # Normalize scheme-less and pro/direct-link URLs to a canonical form
        if not mobj.group('proto'):
            url = 'https://' + url
        if mobj.group('direct_link') or mobj.group('pro'):
            url = 'https://vimeo.com/' + video_id

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url, None, std_headers)
        webpage = self._download_webpage(request, video_id)

        # Now we begin extracting as much information as we can from what we
        # retrieved. First we extract the information common to all extractors,
        # and latter we extract those that are Vimeo specific.
        self.report_extraction(video_id)

        # Extract the config JSON
        # NOTE(review): a "try:" header appears elided before this split
        config = webpage.split(' = {config:')[1].split(',assets:')[0]
        config = json.loads(config)
        # NOTE(review): the matching "except" header appears elided around here
        if re.search('The creator of this video has not given you permission to embed it on this domain.', webpage):
            raise ExtractorError(u'The author has restricted the access to this video, try with the "--referer" option')
        raise ExtractorError(u'Unable to extract info section')

        # Extract title
        video_title = config["video"]["title"]

        # Extract uploader and uploader_id
        video_uploader = config["video"]["owner"]["name"]
        video_uploader_id = config["video"]["owner"]["url"].split('/')[-1] if config["video"]["owner"]["url"] else None

        # Extract video thumbnail
        video_thumbnail = config["video"]["thumbnail"]

        # Extract video description
        video_description = get_element_by_attribute("itemprop", "description", webpage)
        if video_description: video_description = clean_html(video_description)
        else: video_description = u''

        # Extract upload date
        video_upload_date = None
        mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
        if mobj is not None:
            video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)

        # Vimeo specific: extract request signature and timestamp
        sig = config['request']['signature']
        timestamp = config['request']['timestamp']

        # Vimeo specific: extract video codec and quality information
        # First consider quality, then codecs, then take everything
        # TODO bind to format param
        codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
        files = { 'hd': [], 'sd': [], 'other': []}
        for codec_name, codec_extension in codecs:
            if codec_name in config["video"]["files"]:
                if 'hd' in config["video"]["files"][codec_name]:
                    files['hd'].append((codec_name, codec_extension, 'hd'))
                elif 'sd' in config["video"]["files"][codec_name]:
                    files['sd'].append((codec_name, codec_extension, 'sd'))
                # NOTE(review): the "else:" header for this branch appears elided
                files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))

        # Pick the first available file, preferring hd, then sd, then anything
        for quality in ('hd', 'sd', 'other'):
            if len(files[quality]) > 0:
                video_quality = files[quality][0][2]
                video_codec = files[quality][0][0]
                video_extension = files[quality][0][1]
                self.to_screen(u'%s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
        # NOTE(review): a "break" and an "else:" (for-else) appear elided before this raise
        raise ExtractorError(u'No known codec found')

        video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
                    %(video_id, sig, timestamp, video_quality, video_codec.upper())

        # NOTE(review): the "return [{ ... }]" wrapper around these entries appears elided
        'uploader': video_uploader,
        'uploader_id': video_uploader_id,
        'upload_date': video_upload_date,
        'title': video_title,
        'ext': video_extension,
        'thumbnail': video_thumbnail,
        'description': video_description,
class ArteTvIE(InfoExtractor):
    """arte.tv information extractor."""

    # NOTE(review): excerpt appears elided — "try:" headers, "if" guards,
    # "info = {}" initialisation and closing parentheses of several calls
    # look missing; comments cover visible code only.

    _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
    # Live streams are identified by their index-NN.html page name
    _LIVE_URL = r'index-[0-9]+\.html$'

    IE_NAME = u'arte.tv'

    def fetch_webpage(self, url):
        """Download *url* and return the raw page bytes."""
        request = compat_urllib_request.Request(url)
        # NOTE(review): the "try:" header for the except clauses below appears elided
        self.report_download_webpage(url)
        webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to retrieve video webpage: %s' % compat_str(err))
        except ValueError as err:
            raise ExtractorError(u'Invalid URL: %s' % url)

    def grep_webpage(self, url, regex, regexFlags, matchTuples):
        """Fetch *url*, apply *regex*, and return a dict built from
        *matchTuples* entries (group index, key, error message)."""
        page = self.fetch_webpage(url)
        mobj = re.search(regex, page, regexFlags)
        # NOTE(review): an "if mobj is None:" guard and "info = {}" appear elided
        raise ExtractorError(u'Invalid URL: %s' % url)

        for (i, key, err) in matchTuples:
            if mobj.group(i) is None:
                raise ExtractorError(err)
            # NOTE(review): an "else:" header appears elided before this assignment
            info[key] = mobj.group(i)

    def extractLiveStream(self, url):
        """Resolve the RTMP stream parameters for an arte.tv live page."""
        # Language code is encoded in the URL path
        video_lang = url.split('/')[-4]
        info = self.grep_webpage(
            r'src="(.*?/videothek_js.*?\.js)',
            (1, 'url', u'Invalid URL: %s' % url)

        http_host = url.split('/')[2]
        next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
        info = self.grep_webpage(
            r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
                '(http://.*?\.swf).*?' +
            (1, 'path', u'could not extract video path: %s' % url),
            (2, 'player', u'could not extract video player: %s' % url),
            (3, 'url', u'could not extract video url: %s' % url)

        video_url = u'%s/%s' % (info.get('url'), info.get('path'))

    def extractPlus7Stream(self, url):
        """Follow the arte+7 indirections down to the final video info dict."""
        video_lang = url.split('/')[-3]
        info = self.grep_webpage(
            r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
            (1, 'url', u'Invalid URL: %s' % url)

        next_url = compat_urllib_parse.unquote(info.get('url'))
        info = self.grep_webpage(
            r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
            (1, 'url', u'Could not find <video> tag: %s' % url)

        next_url = compat_urllib_parse.unquote(info.get('url'))

        info = self.grep_webpage(
            r'<video id="(.*?)".*?>.*?' +
                '<name>(.*?)</name>.*?' +
                '<dateVideo>(.*?)</dateVideo>.*?' +
                '<url quality="hd">(.*?)</url>',
            (1, 'id', u'could not extract video id: %s' % url),
            (2, 'title', u'could not extract video title: %s' % url),
            (3, 'date', u'could not extract video date: %s' % url),
            (4, 'url', u'could not extract video url: %s' % url)

        # NOTE(review): the "return {" wrapper around these entries appears elided
        'id': info.get('id'),
        'url': compat_urllib_parse.unquote(info.get('url')),
        'uploader': u'arte.tv',
        'upload_date': unified_strdate(info.get('date')),
        'title': info.get('title').decode('utf-8'),

    def _real_extract(self, url):
        # Dispatch between the live-stream and the arte+7 extraction paths
        video_id = url.split('/')[-1]
        self.report_extraction(video_id)

        if re.search(self._LIVE_URL, video_id) is not None:
            self.extractLiveStream(url)
        # NOTE(review): an "else:" header (and a return) appears elided here
        info = self.extractPlus7Stream(url)
class GenericIE(InfoExtractor):
    """Generic last-resort information extractor."""

    # NOTE(review): excerpt appears elided — several guard/return lines and
    # "if mobj is None:" chains look missing; comments cover visible code only.

    IE_NAME = u'generic'

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        # Warn only in normal runs; the test suite triggers this IE on purpose
        if not self._downloader.params.get('test', False):
            self._downloader.report_warning(u'Falling back on generic information extractor.')
        super(GenericIE, self).report_download_webpage(video_id)

    def report_following_redirect(self, new_url):
        """Report information extraction."""
        self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)

    def _test_redirect(self, url):
        """Check if it is a redirect, like url shorteners, in case return the new url."""
        # Issue a HEAD request so no body is downloaded while resolving redirects
        class HeadRequest(compat_urllib_request.Request):
            def get_method(self):
                # NOTE(review): the 'return "HEAD"' body appears elided

        class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
            """
            Subclass the HTTPRedirectHandler to make it use our
            HeadRequest also on the redirected URL
            """
            def redirect_request(self, req, fp, code, msg, headers, newurl):
                if code in (301, 302, 303, 307):
                    newurl = newurl.replace(' ', '%20')
                    # Drop body-related headers: a HEAD request carries no body
                    newheaders = dict((k,v) for k,v in req.headers.items()
                                      if k.lower() not in ("content-length", "content-type"))
                    return HeadRequest(newurl,
                        origin_req_host=req.get_origin_req_host(),
                # NOTE(review): closing args and an "else:" header appear elided here
                raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)

        class HTTPMethodFallback(compat_urllib_request.BaseHandler):
            """
            Fallback to GET if HEAD is not allowed (405 HTTP error)
            """
            def http_error_405(self, req, fp, code, msg, headers):
                # NOTE(review): fp cleanup lines appear elided here
                newheaders = dict((k,v) for k,v in req.headers.items()
                                  if k.lower() not in ("content-length", "content-type"))
                return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
                    origin_req_host=req.get_origin_req_host(),

        # Build a minimal opener with exactly the handlers needed for the probe
        opener = compat_urllib_request.OpenerDirector()
        for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
                        HTTPMethodFallback, HEADRedirectHandler,
                        compat_urllib_request.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
            opener.add_handler(handler())

        response = opener.open(HeadRequest(url))
        if response is None:
            raise ExtractorError(u'Invalid URL protocol')
        new_url = response.geturl()

        # NOTE(review): the "same URL → return False" shortcut appears elided here

        self.report_following_redirect(new_url)
        # NOTE(review): "return new_url" appears elided

    def _real_extract(self, url):
        new_url = self._test_redirect(url)
        if new_url: return [self.url_result(new_url)]

        video_id = url.split('/')[-1]
        # NOTE(review): the "try:" header for the except clause below appears elided
        webpage = self._download_webpage(url, video_id)
        except ValueError as err:
            # since this is the last-resort InfoExtractor, if
            # this error is thrown, it'll be thrown here
            raise ExtractorError(u'Invalid URL: %s' % url)

        self.report_extraction(video_id)
        # NOTE(review): the "if mobj is None:" fall-through chain between these
        # searches appears elided; each regex is a progressively broader probe.
        # Start with something easy: JW Player in SWFObject
        mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
        # Broaden the search a little bit
        mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
        # Broaden the search a little bit: JWPlayer JS loader
        mobj = re.search(r'[^A-Za-z0-9]?file:\s*["\'](http[^\'"&]*)', webpage)
        # Try to find twitter cards info
        mobj = re.search(r'<meta (?:property|name)="twitter:player:stream" (?:content|value)="(.+?)"', webpage)
        raise ExtractorError(u'Invalid URL: %s' % url)

        # It's possible that one of the regexes
        # matched, but returned an empty group:
        if mobj.group(1) is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_url = compat_urllib_parse.unquote(mobj.group(1))
        video_id = os.path.basename(video_url)

        # here's a fun little line of code for you:
        video_extension = os.path.splitext(video_id)[1][1:]
        video_id = os.path.splitext(video_id)[0]

        # it's tempting to parse this further, but you would
        # have to take into account all the variations like
        # Video Title - Site Name
        # Site Name | Video Title
        # Video Title - Tagline | Site Name
        # and so on and so forth; it's just not practical
        video_title = self._html_search_regex(r'<title>(.*)</title>',
            webpage, u'video title')

        # video uploader is domain name
        video_uploader = self._search_regex(r'(?:https?://)?([^/]*)/.*',
            url, u'video uploader')

        # NOTE(review): the "return [{ ... }]" wrapper around these entries appears elided
        'uploader': video_uploader,
        'upload_date': None,
        'title': video_title,
        'ext': video_extension,
class YoutubeSearchIE(SearchInfoExtractor):
    """Information Extractor for YouTube search queries."""
    # gdata v2 JSON-C API, paged 50 results at a time
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    IE_NAME = u'youtube:search'
    _SEARCH_KEY = 'ytsearch'

    def report_download_page(self, query, pagenum):
        """Report attempt to download search page with given number."""
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        # NOTE(review): initialisation lines (video_ids, pagenum, limit)
        # appear elided from this excerpt.
        while (50 * pagenum) < limit:
            self.report_download_page(query, pagenum+1)
            # start-index is 1-based in the gdata API
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
            request = compat_urllib_request.Request(result_url)
            # NOTE(review): a "try:" header appears elided before this read
            data = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to download API page: %s' % compat_str(err))
            api_response = json.loads(data)['data']

            if not 'items' in api_response:
                raise ExtractorError(u'[youtube] No video results')

            new_ids = list(video['id'] for video in api_response['items'])
            video_ids += new_ids

            # Never request more than the API reports to exist
            limit = min(n, api_response['totalItems'])
            # NOTE(review): the "pagenum += 1" step appears elided here

        if len(video_ids) > n:
            video_ids = video_ids[:n]
        videos = [self.url_result('http://www.youtube.com/watch?v=%s' % id, 'Youtube') for id in video_ids]
        return self.playlist_result(videos, query)
class GoogleSearchIE(SearchInfoExtractor):
    """Information Extractor for Google Video search queries."""
    # Presence of the "next" pagination link marks more result pages
    _MORE_PAGES_INDICATOR = r'id="pnnext" class="pn"'
    IE_NAME = u'video.google:search'
    _SEARCH_KEY = 'gvsearch'

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        # NOTE(review): the "res = {" wrapper (playlist dict with 'entries',
        # 'id', 'title') appears partially elided in this excerpt.
        '_type': 'playlist',

        for pagenum in itertools.count(1):
            result_url = u'http://www.google.com/search?tbm=vid&q=%s&start=%s&hl=en' % (compat_urllib_parse.quote_plus(query), pagenum*10)
            webpage = self._download_webpage(result_url, u'gvsearch:' + query,
                note='Downloading result page ' + str(pagenum))

            for mobj in re.finditer(r'<h3 class="r"><a href="([^"]+)"', webpage):
                # NOTE(review): the "e = {" wrapper around this entry appears elided
                'url': mobj.group(1)
                res['entries'].append(e)

            # Stop once enough entries were collected or no further pages exist
            if (pagenum * 10 > n) or not re.search(self._MORE_PAGES_INDICATOR, webpage):
                # NOTE(review): "return res" appears elided here
class YahooSearchIE(SearchInfoExtractor):
    """Information Extractor for Yahoo! Video search queries."""

    IE_NAME = u'screen.yahoo:search'
    _SEARCH_KEY = 'yvsearch'

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        # NOTE(review): the "res = {" playlist-dict wrapper appears partially
        # elided in this excerpt.
        '_type': 'playlist',

        # Results come 30 per page from the JSON search endpoint
        for pagenum in itertools.count(0):
            result_url = u'http://video.search.yahoo.com/search/?p=%s&fr=screen&o=js&gs=0&b=%d' % (compat_urllib_parse.quote_plus(query), pagenum * 30)
            webpage = self._download_webpage(result_url, query,
                note='Downloading results page '+str(pagenum+1))
            info = json.loads(webpage)
            # NOTE(review): "m = info[u'm']" (pagination metadata) appears elided;
            # it is referenced below — TODO confirm against upstream source.
            results = info[u'results']

            for (i, r) in enumerate(results):
                if (pagenum * 30) +i >= n:
                    # NOTE(review): a "break" appears elided here
                mobj = re.search(r'(?P<url>screen\.yahoo\.com/.*?-\d*?\.html)"', r)
                e = self.url_result('http://' + mobj.group('url'), 'Yahoo')
                res['entries'].append(e)
            if (pagenum * 30 +i >= n) or (m[u'last'] >= (m[u'total'] -1 )):
                # NOTE(review): a "break" and "return res" appear elided here
class YoutubePlaylistIE(InfoExtractor):
    """Information Extractor for YouTube playlists."""

    # NOTE(review): this verbose raw-string regex is not closed within the
    # visible lines — alternation/anchor lines and the closing quotes appear
    # elided from this excerpt.
    _VALID_URL = r"""(?:
                        (?:course|view_play_list|my_playlists|artist|playlist|watch)
                        \? (?:.*?&)*? (?:p|a|list)=
                        ((?:PL|EC|UU)?[0-9A-Za-z-_]{10,})
                     ((?:PL|EC|UU)[0-9A-Za-z-_]{10,})
    # gdata v2 JSON feed, paged via max-results/start-index
    _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json&safeSearch=none'

    IE_NAME = u'youtube:playlist'

    # NOTE(review): a "@classmethod" decorator presumably preceded this def
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # re.VERBOSE is required because _VALID_URL is written in verbose form
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        # NOTE(review): an "if mobj is None:" guard presumably preceded this raise
        raise ExtractorError(u'Invalid URL: %s' % url)

        # Download playlist videos from API
        playlist_id = mobj.group(1) or mobj.group(2)
        # NOTE(review): "videos = []" init and the page loop header appear elided
        url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, self._MAX_RESULTS * (page_num - 1) + 1)
        page = self._download_webpage(url, playlist_id, u'Downloading page #%s' % page_num)

        # NOTE(review): the "try:" header for this except appears elided
        response = json.loads(page)
        except ValueError as err:
            raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))

        if 'feed' not in response:
            raise ExtractorError(u'Got a malformed response from YouTube API')
        playlist_title = response['feed']['title']['$t']
        if 'entry' not in response['feed']:
            # Number of videos is a multiple of self._MAX_RESULTS
            # NOTE(review): a "break" appears elided here

        # Keep (position, url) pairs so the playlist order can be restored
        videos += [ (entry['yt$position']['$t'], entry['content']['src'])
                    for entry in response['feed']['entry']
                    if 'content' in entry ]

        if len(response['feed']['entry']) < self._MAX_RESULTS:
            # NOTE(review): a "break" appears elided here

        # Sort by playlist position, then keep only the urls
        videos = [v[1] for v in sorted(videos)]

        url_results = [self.url_result(url, 'Youtube') for url in videos]
        return [self.playlist_result(url_results, playlist_id, playlist_title)]
class YoutubeChannelIE(InfoExtractor):
    """Information Extractor for YouTube channels."""

    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
    # First page is plain HTML; subsequent pages come from the ajax endpoint
    _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
    _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
    _MORE_PAGES_URL = 'http://www.youtube.com/channel_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
    IE_NAME = u'youtube:channel'

    def extract_videos_from_page(self, page):
        """Return the list of unique video ids found in *page* (in order)."""
        # NOTE(review): "ids_in_page = []" init appears elided from this excerpt
        for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
            if mobj.group(1) not in ids_in_page:
                ids_in_page.append(mobj.group(1))
        # NOTE(review): "return ids_in_page" appears elided

    def _real_extract(self, url):
        # Extract channel id
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): an "if mobj is None:" guard presumably preceded this raise
        raise ExtractorError(u'Invalid URL: %s' % url)

        # Download channel page
        channel_id = mobj.group(1)
        # NOTE(review): "video_ids = []" and "pagenum = 1" init appear elided
        url = self._TEMPLATE_URL % (channel_id, pagenum)
        page = self._download_webpage(url, channel_id,
                                      u'Downloading page #%s' % pagenum)

        # Extract video identifiers
        ids_in_page = self.extract_videos_from_page(page)
        video_ids.extend(ids_in_page)

        # Download any subsequent channel pages using the json-based channel_ajax query
        if self._MORE_PAGES_INDICATOR in page:
            # NOTE(review): the "while True:" loop header appears elided here
            pagenum = pagenum + 1

            url = self._MORE_PAGES_URL % (pagenum, channel_id)
            page = self._download_webpage(url, channel_id,
                                          u'Downloading page #%s' % pagenum)

            page = json.loads(page)

            ids_in_page = self.extract_videos_from_page(page['content_html'])
            video_ids.extend(ids_in_page)

            # The load-more widget disappears on the last page
            if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']:
                # NOTE(review): a "break" appears elided here

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        urls = ['http://www.youtube.com/watch?v=%s' % id for id in video_ids]
        url_entries = [self.url_result(url, 'Youtube') for url in urls]
        return [self.playlist_result(url_entries, channel_id)]
class YoutubeUserIE(InfoExtractor):
    """Information Extractor for YouTube users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    # gdata caps each query at 50 results, hence the paging below
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
    _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
    IE_NAME = u'youtube:user'

    def _real_extract(self, url):
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): an "if mobj is None:" guard presumably preceded this raise
        raise ExtractorError(u'Invalid URL: %s' % url)

        username = mobj.group(1)

        # Download video ids using YouTube Data API. Result size per
        # query is limited (currently to 50 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # NOTE(review): loop init ("video_ids = []", "pagenum = 0",
        # "while True:") appears elided from this excerpt.
        start_index = pagenum * self._GDATA_PAGE_SIZE + 1

        gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
        page = self._download_webpage(gdata_url, username,
                                      u'Downloading video ids from %d to %d' % (start_index, start_index + self._GDATA_PAGE_SIZE))

        # Extract video identifiers
        # NOTE(review): "ids_in_page = []" init appears elided here
        for mobj in re.finditer(self._VIDEO_INDICATOR, page):
            if mobj.group(1) not in ids_in_page:
                ids_in_page.append(mobj.group(1))

        video_ids.extend(ids_in_page)

        # A little optimization - if current page is not
        # "full", ie. does not contain PAGE_SIZE video ids then
        # we can assume that this page is the last one - there
        # are no more ids on further pages - no need to query
        if len(ids_in_page) < self._GDATA_PAGE_SIZE:
            # NOTE(review): a "break" (and "pagenum += 1") appear elided here

        urls = ['http://www.youtube.com/watch?v=%s' % video_id for video_id in video_ids]
        url_results = [self.url_result(url, 'Youtube') for url in urls]
        return [self.playlist_result(url_results, playlist_title = username)]
class BlipTVUserIE(InfoExtractor):
    """Information Extractor for blip.tv users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
    IE_NAME = u'blip.tv:user'

    def _real_extract(self, url):
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): an "if mobj is None:" guard presumably preceded this raise
        raise ExtractorError(u'Invalid URL: %s' % url)

        username = mobj.group(1)

        # The numeric users_id (needed by the ajax endpoint) is scraped
        # from the mobile user page.
        page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'

        page = self._download_webpage(url, username, u'Downloading user page')
        mobj = re.search(r'data-users-id="([^"]+)"', page)
        page_base = page_base % mobj.group(1)

        # Download video ids using BlipTV Ajax calls. Result size per
        # query is limited (currently to 12 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # NOTE(review): loop init ("video_ids = []", "pagenum = 1",
        # "while True:") appears elided from this excerpt.
        url = page_base + "&page=" + str(pagenum)
        page = self._download_webpage(url, username,
                                      u'Downloading video ids from page %d' % pagenum)

        # Extract video identifiers
        # NOTE(review): "ids_in_page = []" init appears elided here
        for mobj in re.finditer(r'href="/([^"]+)"', page):
            if mobj.group(1) not in ids_in_page:
                ids_in_page.append(unescapeHTML(mobj.group(1)))

        video_ids.extend(ids_in_page)

        # A little optimization - if current page is not
        # "full", ie. does not contain PAGE_SIZE video ids then
        # we can assume that this page is the last one - there
        # are no more ids on further pages - no need to query
        if len(ids_in_page) < self._PAGE_SIZE:
            # NOTE(review): a "break" (and "pagenum += 1") appear elided here

        urls = [u'http://blip.tv/%s' % video_id for video_id in video_ids]
        url_entries = [self.url_result(url, 'BlipTV') for url in urls]
        return [self.playlist_result(url_entries, playlist_title = username)]
class DepositFilesIE(InfoExtractor):
    """Information extractor for depositfiles.com"""

    _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'

    def _real_extract(self, url):
        file_id = url.split('/')[-1]
        # Rebuild url in english locale
        url = 'http://depositfiles.com/en/files/' + file_id

        # Retrieve file webpage with 'Free download' button pressed
        free_download_indication = { 'gateway_result' : '1' }
        request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
        # NOTE(review): the "try:" header for the except clause below appears elided
        self.report_download_webpage(file_id)
        webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to retrieve file webpage: %s' % compat_str(err))

        # Search for the real file URL
        mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
        if (mobj is None) or (mobj.group(1) is None):
            # Try to figure out reason of the error.
            mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
            if (mobj is not None) and (mobj.group(1) is not None):
                # Collapse whitespace in the site's own error message
                restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
                raise ExtractorError(u'%s' % restriction_message)
            # NOTE(review): an "else:" header appears elided before this raise
            raise ExtractorError(u'Unable to extract download URL from: %s' % url)

        file_url = mobj.group(1)
        file_extension = os.path.splitext(file_url)[1][1:]

        # Search for file title
        file_title = self._search_regex(r'<b title="(.*?)">', webpage, u'title')

        # NOTE(review): the "return [{ ... }]" wrapper around these entries appears elided
        'id': file_id.decode('utf-8'),
        'url': file_url.decode('utf-8'),
        'upload_date': None,
        'title': file_title,
        'ext': file_extension.decode('utf-8'),
class FacebookIE(InfoExtractor):
    """Information Extractor for Facebook"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
    _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
    _NETRC_MACHINE = 'facebook'
    IE_NAME = u'facebook'

    def report_login(self):
        """Report attempt to log in."""
        self.to_screen(u'Logging in')

    def _real_initialize(self):
        """Log in with credentials from --username/--password or .netrc, if any."""
        if self._downloader is None:
            # NOTE(review): a "return" appears elided here

        # NOTE(review): "useremail = None" / "password = None" init appears elided
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            useremail = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            # NOTE(review): a "try:" header appears elided before this lookup
            info = netrc.netrc().authenticators(self._NETRC_MACHINE)
            if info is not None:
                # NOTE(review): credential unpacking and an "else:" appear elided
                raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))

        if useremail is None:
            # NOTE(review): a "return" and the login_form construction appear elided

        request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
        # NOTE(review): a "try:" header appears elided before this read
        login_results = compat_urllib_request.urlopen(request).read()
        # A login form in the response means the login attempt failed
        if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
            self._downloader.report_warning(u'unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
            # NOTE(review): a "return" appears elided here
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): an "if mobj is None:" guard presumably preceded this raise
        raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('ID')

        url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
        webpage = self._download_webpage(url, video_id)

        # The video params JSON is wedged between these two JS fragments
        BEFORE = '{swf.addParam(param[0], param[1]);});\n'
        AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
        m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
        # NOTE(review): an "if not m:" guard presumably preceded this raise
        raise ExtractorError(u'Cannot parse data')
        data = dict(json.loads(m.group(1)))
        params_raw = compat_urllib_parse.unquote(data['params'])
        params = json.loads(params_raw)
        video_data = params['video_data'][0]
        # Prefer the HD source, fall back to SD
        video_url = video_data.get('hd_src')
        # NOTE(review): an "if not video_url:" guard appears elided here
        video_url = video_data['sd_src']
        # NOTE(review): a second "if not video_url:" guard appears elided here
        raise ExtractorError(u'Cannot find video URL')
        video_duration = int(video_data['video_duration'])
        thumbnail = video_data['thumbnail_src']

        video_title = self._html_search_regex('<h2 class="uiHeaderTitle">([^<]+)</h2>',
        # NOTE(review): the remaining _html_search_regex args and the
        # "info = { ... }" / return wrapper appear elided below
        'title': video_title,
        'duration': video_duration,
        'thumbnail': thumbnail,
# NOTE(review): elided listing — `try:` headers and the openers of several
# dict literals (e.g. the direct-download info dict before line 1997 and the
# regular info dict before 2026) are missing from this view.
1945 class BlipTVIE(InfoExtractor):
1946 """Information extractor for blip.tv"""
1948 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv/((.+/)|(play/)|(api\.swf#))(.+)$'
1949 _URL_EXT = r'^.*\.([a-z0-9]+)$'
1950 IE_NAME = u'blip.tv'
1952 def report_direct_download(self, title):
1953 """Report information extraction."""
1954 self.to_screen(u'%s: Direct download detected' % title)
1956 def _real_extract(self, url):
1957 mobj = re.match(self._VALID_URL, url)
1959 raise ExtractorError(u'Invalid URL: %s' % url)
1961 # See https://github.com/rg3/youtube-dl/issues/857
# api.swf URLs carry the id in the fragment; rewrite to a /play/ URL first.
1962 api_mobj = re.match(r'http://a\.blip\.tv/api\.swf#(?P<video_id>[\d\w]+)', url)
1963 if api_mobj is not None:
1964 url = 'http://blip.tv/play/g_%s' % api_mobj.group('video_id')
1965 urlp = compat_urllib_parse_urlparse(url)
1966 if urlp.path.startswith('/play/'):
# /play/ URLs redirect; the real file id lives in the redirect's fragment.
# Resolving it produces an /a/a-<id> URL which is handled recursively.
1967 request = compat_urllib_request.Request(url)
1968 response = compat_urllib_request.urlopen(request)
1969 redirecturl = response.geturl()
1970 rurlp = compat_urllib_parse_urlparse(redirecturl)
1971 file_id = compat_parse_qs(rurlp.fragment)['file'][0].rpartition('/')[2]
1972 url = 'http://blip.tv/a/a-' + file_id
1973 return self._real_extract(url)
# Ask the site for JSON metadata; the iTunes User-Agent is required for the
# API to answer (presumably — carried over from the original extractor).
1980 json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
1981 request = compat_urllib_request.Request(json_url)
1982 request.add_header('User-Agent', 'iTunes/10.6.1')
1983 self.report_extraction(mobj.group(1))
1986 urlh = compat_urllib_request.urlopen(request)
1987 if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
1988 basename = url.split('/')[-1]
1989 title,ext = os.path.splitext(basename)
1990 title = title.decode('UTF-8')
1991 ext = ext.replace('.', '')
1992 self.report_direct_download(title)
1997 'upload_date': None,
2002 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2003 raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
2004 if info is None: # Regular URL
2006 json_code_bytes = urlh.read()
2007 json_code = json_code_bytes.decode('utf-8')
2008 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2009 raise ExtractorError(u'Unable to read video info webpage: %s' % compat_str(err))
2012 json_data = json.loads(json_code)
2013 if 'Post' in json_data:
2014 data = json_data['Post']
# blip.tv timestamps look like '11-28-12 03:15PM'; normalized to YYYYMMDD.
2018 upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
2019 video_url = data['media']['url']
2020 umobj = re.match(self._URL_EXT, video_url)
2022 raise ValueError('Can not determine filename extension')
2023 ext = umobj.group(1)
2026 'id': data['item_id'],
2028 'uploader': data['display_name'],
2029 'upload_date': upload_date,
2030 'title': data['title'],
2032 'format': data['media']['mimeType'],
2033 'thumbnail': data['thumbnailUrl'],
2034 'description': data['description'],
2035 'player_url': data['embedUrl'],
2036 'user_agent': 'iTunes/10.6.1',
2038 except (ValueError,KeyError) as err:
2039 raise ExtractorError(u'Unable to parse video information: %s' % repr(err))
# NOTE(review): elided listing — initializers (`x = 0`, `out = ''`, the
# `__md5` def around 2069, the GK key opener before 2080, the `params = {}`
# loop setup before 2116, and several `if mobj is None:` guards) are missing
# from this view. The RC4/MD5 deobfuscation below is order-critical; do not
# restructure without the full file.
2044 class MyVideoIE(InfoExtractor):
2045 """Information Extractor for myvideo.de."""
2047 _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
2048 IE_NAME = u'myvideo'
2050 # Original Code from: https://github.com/dersphere/plugin.video.myvideo_de.git
2051 # Released into the Public Domain by Tristan Fischer on 2013-05-19
2052 # https://github.com/rg3/youtube-dl/pull/842
# Standard RC4: KSA over a 256-entry box, then PRGA XOR-ing the data.
2053 def __rc4crypt(self,data, key):
2055 box = list(range(256))
2056 for i in list(range(256)):
2057 x = (x + box[i] + compat_ord(key[i % len(key)])) % 256
2058 box[i], box[x] = box[x], box[i]
2064 y = (y + box[x]) % 256
2065 box[x], box[y] = box[y], box[x]
2066 out += chr(compat_ord(char) ^ box[(box[x] + box[y]) % 256])
2070 return hashlib.md5(s).hexdigest().encode()
2072 def _real_extract(self,url):
2073 mobj = re.match(self._VALID_URL, url)
2075 raise ExtractorError(u'invalid URL: %s' % url)
2077 video_id = mobj.group(1)
# Doubly-base64-encoded site key used to derive the RC4 key below.
2080 b'WXpnME1EZGhNRGhpTTJNM01XVmhOREU0WldNNVpHTTJOakpt'
2081 b'TW1FMU5tVTBNR05pWkRaa05XRXhNVFJoWVRVd1ptSXhaVEV3'
2082 b'TnpsbA0KTVRkbU1tSTRNdz09'
2086 webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
2087 webpage = self._download_webpage(webpage_url, video_id)
# Fast path: a plain <source src=...> tag means a direct FLV URL.
2089 mobj = re.search('source src=\'(.+?)[.]([^.]+)\'', webpage)
2090 if mobj is not None:
2091 self.report_extraction(video_id)
2092 video_url = mobj.group(1) + '.flv'
2094 video_title = self._html_search_regex('<title>([^<]+)</title>',
2097 video_ext = self._search_regex('[.](.+?)$', video_url, u'extension')
2103 'upload_date': None,
2104 'title': video_title,
# Slow path: parse the flashvars block into request params for the
# encrypted player XML.
2109 mobj = re.search('var flashvars={(.+?)}', webpage)
2111 raise ExtractorError(u'Unable to extract video')
2116 for (a, b) in re.findall('(.+?):\'(.+?)\',?', sec):
2117 if not a == '_encxml':
2120 encxml = compat_urllib_parse.unquote(b)
2121 if not params.get('domain'):
2122 params['domain'] = 'www.myvideo.de'
2123 xmldata_url = '%s?%s' % (encxml, compat_urllib_parse.urlencode(params))
2124 if 'flash_playertype=MTV' in xmldata_url:
2125 self._downloader.report_warning(u'avoiding MTV player')
2127 'http://www.myvideo.de/dynamic/get_player_video_xml.php'
2128 '?flash_playertype=D&ID=%s&_countlimit=4&autorun=yes'
# The response is 'something=<hex>'; keep the hex payload and decrypt it
# with RC4 keyed from the decoded GK constant plus the video id.
2132 enc_data = self._download_webpage(xmldata_url, video_id).split('=')[1]
2133 enc_data_b = binascii.unhexlify(enc_data)
2135 base64.b64decode(base64.b64decode(GK)) +
2137 str(video_id).encode('utf-8')
2140 dec_data = self.__rc4crypt(enc_data_b, sk)
2143 self.report_extraction(video_id)
# RTMP case: connectionurl present in the decrypted data.
2146 mobj = re.search('connectionurl=\'(.*?)\'', dec_data)
2148 video_url = compat_urllib_parse.unquote(mobj.group(1))
2149 if 'myvideo2flash' in video_url:
2150 self._downloader.report_warning(u'forcing RTMPT ...')
2151 video_url = video_url.replace('rtmpe://', 'rtmpt://')
2154 # extract non rtmp videos
2155 mobj = re.search('path=\'(http.*?)\' source=\'(.*?)\'', dec_data)
2157 raise ExtractorError(u'unable to extract url')
2158 video_url = compat_urllib_parse.unquote(mobj.group(1)) + compat_urllib_parse.unquote(mobj.group(2))
2160 video_file = self._search_regex('source=\'(.*?)\'', dec_data, u'video file')
2161 video_file = compat_urllib_parse.unquote(video_file)
# f4m manifests get rewritten to an .m3u8 HLS playlist; otherwise build an
# RTMP play path of the form 'ext:path'.
2163 if not video_file.endswith('f4m'):
2164 ppath, prefix = video_file.split('.')
2165 video_playpath = '%s:%s' % (prefix, ppath)
2166 video_hls_playlist = ''
2169 video_hls_playlist = (
2170 video_filepath + video_file
2171 ).replace('.f4m', '.m3u8')
2173 video_swfobj = self._search_regex('swfobject.embedSWF\(\'(.+?)\'', webpage, u'swfobj')
2174 video_swfobj = compat_urllib_parse.unquote(video_swfobj)
2176 video_title = self._html_search_regex("<h1(?: class='globalHd')?>(.*?)</h1>",
2182 'tc_url': video_url,
2184 'upload_date': None,
2185 'title': video_title,
2187 'play_path': video_playpath,
2188 'video_file': video_file,
2189 'video_hls_playlist': video_hls_playlist,
2190 'player_url': video_swfobj,
# NOTE(review): elided listing — the `_video_extensions`/`_video_dimensions`
# dict bodies, several `if ... is None:` guards, the `turls` accumulation
# loop, and the per-part info-dict opener before 2350 are missing from view.
2194 class ComedyCentralIE(InfoExtractor):
2195 """Information extractor for The Daily Show and Colbert Report """
2197 # urls can be abbreviations like :thedailyshow or :colbert
2198 # urls for episodes like:
2199 # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
2200 # or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
2201 # or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
# Verbose regex: matched with re.VERBOSE everywhere below (suitable() is
# overridden precisely because the base class does not pass that flag).
2202 _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
2203 |(https?://)?(www\.)?
2204 (?P<showname>thedailyshow|colbertnation)\.com/
2205 (full-episodes/(?P<episode>.*)|
2207 (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
2208 |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
# Bitrates in ascending order; the last entry is picked as "best" below.
2211 _available_formats = ['3500', '2200', '1700', '1200', '750', '400']
2213 _video_extensions = {
2221 _video_dimensions = {
2231 def suitable(cls, url):
2232 """Receives a URL and returns True if suitable for this IE."""
2233 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
2235 def _print_formats(self, formats):
2236 print('Available formats:')
2238 print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))
2241 def _real_extract(self, url):
2242 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2244 raise ExtractorError(u'Invalid URL: %s' % url)
# ':tds' / ':colbert' shortcuts expand to the full-episodes front page.
2246 if mobj.group('shortname'):
2247 if mobj.group('shortname') in ('tds', 'thedailyshow'):
2248 url = u'http://www.thedailyshow.com/full-episodes/'
2250 url = u'http://www.colbertnation.com/full-episodes/'
2251 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2252 assert mobj is not None
2254 if mobj.group('clip'):
2255 if mobj.group('showname') == 'thedailyshow':
2256 epTitle = mobj.group('tdstitle')
2258 epTitle = mobj.group('cntitle')
2261 dlNewest = not mobj.group('episode')
2263 epTitle = mobj.group('showname')
2265 epTitle = mobj.group('episode')
2267 self.report_extraction(epTitle)
# The page may redirect (e.g. front page -> newest episode); re-match the
# final URL so the episode group is populated.
2268 webpage,htmlHandle = self._download_webpage_handle(url, epTitle)
2270 url = htmlHandle.geturl()
2271 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2273 raise ExtractorError(u'Invalid redirected URL: ' + url)
2274 if mobj.group('episode') == '':
2275 raise ExtractorError(u'Redirected URL is still not specific: ' + url)
2276 epTitle = mobj.group('episode')
2278 mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)
2280 if len(mMovieParams) == 0:
2281 # The Colbert Report embeds the information in a without
2282 # a URL prefix; so extract the alternate reference
2283 # and then add the URL prefix manually.
2285 altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
2286 if len(altMovieParams) == 0:
2287 raise ExtractorError(u'unable to find Flash URL in webpage ' + url)
2289 mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]
2291 uri = mMovieParams[0][1]
2292 indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
2293 indexXml = self._download_webpage(indexUrl, epTitle,
2294 u'Downloading show index',
2295 u'unable to download episode index')
# One <item> per episode part; each part is resolved and appended to the
# results list (opener elided from this view).
2299 idoc = xml.etree.ElementTree.fromstring(indexXml)
2300 itemEls = idoc.findall('.//item')
2301 for partNum,itemEl in enumerate(itemEls):
2302 mediaId = itemEl.findall('./guid')[0].text
2303 shortMediaId = mediaId.split(':')[-1]
2304 showId = mediaId.split(':')[-2].replace('.com', '')
2305 officialTitle = itemEl.findall('./title')[0].text
2306 officialDate = unified_strdate(itemEl.findall('./pubDate')[0].text)
2308 configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
2309 compat_urllib_parse.urlencode({'uri': mediaId}))
2310 configXml = self._download_webpage(configUrl, epTitle,
2311 u'Downloading configuration for %s' % shortMediaId)
2313 cdoc = xml.etree.ElementTree.fromstring(configXml)
2315 for rendition in cdoc.findall('.//rendition'):
2316 finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
2320 self._downloader.report_error(u'unable to download ' + mediaId + ': No videos found')
2323 if self._downloader.params.get('listformats', None):
2324 self._print_formats([i[0] for i in turls])
2327 # For now, just pick the highest bitrate
2328 format,rtmp_video_url = turls[-1]
2330 # Get the format arg from the arg stream
2331 req_format = self._downloader.params.get('format', None)
2333 # Select format if we can find one
2336 format, rtmp_video_url = f, v
# The RTMP URL cannot be downloaded directly; rewrite it onto the known
# HTTP mirror by keeping only the gsp.comedystor/... tail.
2339 m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
2341 raise ExtractorError(u'Cannot transform RTMP url')
2342 base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
2343 video_url = base + m.group('finalid')
2345 effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
2350 'upload_date': officialDate,
2355 'description': officialTitle,
2357 results.append(info)
# NOTE(review): elided listing — the `if mobj is None:` guard before 2371,
# the try: before 2401 and the returned info-dict opener are missing here.
2362 class EscapistIE(InfoExtractor):
2363 """Information extractor for The Escapist """
2365 _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
2366 IE_NAME = u'escapist'
2368 def _real_extract(self, url):
2369 mobj = re.match(self._VALID_URL, url)
2371 raise ExtractorError(u'Invalid URL: %s' % url)
2372 showName = mobj.group('showname')
2373 videoId = mobj.group('episode')
2375 self.report_extraction(videoId)
2376 webpage = self._download_webpage(url, videoId)
# Metadata comes from <meta> tags; description/thumbnail are optional.
2378 videoDesc = self._html_search_regex('<meta name="description" content="([^"]*)"',
2379 webpage, u'description', fatal=False)
2381 imgUrl = self._html_search_regex('<meta property="og:image" content="([^"]*)"',
2382 webpage, u'thumbnail', fatal=False)
2384 playerUrl = self._html_search_regex('<meta property="og:video" content="([^"]*)"',
2385 webpage, u'player url')
# The page title is 'Show : Episode'; keep only the episode part.
# (The u'player url' label here looks like a copy-paste of the regex above;
# it should presumably say 'title' — flagging, not changing, in a doc pass.)
2387 title = self._html_search_regex('<meta name="title" content="([^"]*)"',
2388 webpage, u'player url').split(' : ')[-1]
2390 configUrl = self._search_regex('config=(.*)$', playerUrl, u'config url')
2391 configUrl = compat_urllib_parse.unquote(configUrl)
2393 configJSON = self._download_webpage(configUrl, videoId,
2394 u'Downloading configuration',
2395 u'unable to download configuration')
2397 # Technically, it's JavaScript, not JSON
2398 configJSON = configJSON.replace("'", '"')
2401 config = json.loads(configJSON)
2402 except (ValueError,) as err:
2403 raise ExtractorError(u'Invalid JSON in configuration file: ' + compat_str(err))
2405 playlist = config['playlist']
# The second playlist entry holds the actual media URL.
2406 videoUrl = playlist[1]['url']
2411 'uploader': showName,
2412 'upload_date': None,
2415 'thumbnail': imgUrl,
2416 'description': videoDesc,
2417 'player_url': playerUrl,
# NOTE(review): elided listing — the info-dict opener before 2442, the two
# `try:` headers before the urlopen calls, and the `except IndexError` for
# the metadata XML are missing from this view.
2422 class CollegeHumorIE(InfoExtractor):
2423 """Information extractor for collegehumor.com"""
2426 _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
2427 IE_NAME = u'collegehumor'
2429 def report_manifest(self, video_id):
2430 """Report information extraction."""
2431 self.to_screen(u'%s: Downloading XML manifest' % video_id)
2433 def _real_extract(self, url):
2434 mobj = re.match(self._VALID_URL, url)
2436 raise ExtractorError(u'Invalid URL: %s' % url)
2437 video_id = mobj.group('videoid')
2442 'upload_date': None,
2445 self.report_extraction(video_id)
# Step 1: fetch the moogaloop metadata XML for title/description/manifest.
2446 xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
2448 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
2449 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2450 raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
2452 mdoc = xml.etree.ElementTree.fromstring(metaXml)
2454 videoNode = mdoc.findall('./video')[0]
2455 info['description'] = videoNode.findall('./description')[0].text
2456 info['title'] = videoNode.findall('./caption')[0].text
2457 info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
2458 manifest_url = videoNode.findall('./file')[0].text
2460 raise ExtractorError(u'Invalid metadata XML file')
# Step 2: fetch the f4m manifest (hdcore param required by Adobe HDS).
2462 manifest_url += '?hdcore=2.10.3'
2463 self.report_manifest(video_id)
2465 manifestXml = compat_urllib_request.urlopen(manifest_url).read()
2466 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2467 raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
2469 adoc = xml.etree.ElementTree.fromstring(manifestXml)
# Elements are namespaced under the Adobe f4m 1.0 schema.
2471 media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
2472 node_id = media_node.attrib['url']
2473 video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
2474 except IndexError as err:
2475 raise ExtractorError(u'Invalid manifest file')
# Step 3: assemble the segment URL from the manifest host and ids.
2477 url_pr = compat_urllib_parse_urlparse(manifest_url)
2478 url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'
# NOTE(review): elided listing — the `if mobj is None:` guard and the
# returned info-dict opener are missing from this view.
2485 class XVideosIE(InfoExtractor):
2486 """Information extractor for xvideos.com"""
2488 _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
2489 IE_NAME = u'xvideos'
2491 def _real_extract(self, url):
2492 mobj = re.match(self._VALID_URL, url)
2494 raise ExtractorError(u'Invalid URL: %s' % url)
2495 video_id = mobj.group(1)
2497 webpage = self._download_webpage(url, video_id)
2499 self.report_extraction(video_id)
# The flash player receives the media URL percent-encoded in 'flv_url='.
2502 video_url = compat_urllib_parse.unquote(self._search_regex(r'flv_url=(.+?)&',
2503 webpage, u'video URL'))
2506 video_title = self._html_search_regex(r'<title>(.*?)\s+-\s+XVID',
2509 # Extract video thumbnail
2510 video_thumbnail = self._search_regex(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)',
2511 webpage, u'thumbnail', fatal=False)
2517 'upload_date': None,
2518 'title': video_title,
2520 'thumbnail': video_thumbnail,
2521 'description': None,
2527 class SoundcloudIE(InfoExtractor):
2528 """Information extractor for soundcloud.com
2529 To access the media, the uid of the song and a stream token
2530 must be extracted from the page source and the script must make
2531 a request to media.soundcloud.com/crossdomain.xml. Then
2532 the media can be grabbed by requesting from an url composed
2533 of the stream token and uid
2536 _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
2537 IE_NAME = u'soundcloud'
2539 def report_resolve(self, video_id):
2540 """Report information extraction."""
2541 self.to_screen(u'%s: Resolving id' % video_id)
2543 def _real_extract(self, url):
2544 mobj = re.match(self._VALID_URL, url)
2546 raise ExtractorError(u'Invalid URL: %s' % url)
2548 # extract uploader (which is in the url)
2549 uploader = mobj.group(1)
2550 # extract simple title (uploader + slug of song title)
2551 slug_title = mobj.group(2)
2552 simple_title = uploader + u'-' + slug_title
2553 full_title = '%s/%s' % (uploader, slug_title)
2555 self.report_resolve(full_title)
# Resolve the vanity URL to track metadata via the public API.
# NOTE(review): the client_id is a hard-coded API key baked into this code.
2557 url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
2558 resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2559 info_json = self._download_webpage(resolv_url, full_title, u'Downloading info JSON')
2561 info = json.loads(info_json)
2562 video_id = info['id']
2563 self.report_extraction(full_title)
# Second request maps the track id to concrete stream URLs (mp3 128k).
2565 streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2566 stream_json = self._download_webpage(streams_url, full_title,
2567 u'Downloading stream definitions',
2568 u'unable to download stream definitions')
2570 streams = json.loads(stream_json)
2571 mediaURL = streams['http_mp3_128_url']
2572 upload_date = unified_strdate(info['created_at'])
2577 'uploader': info['user']['username'],
2578 'upload_date': upload_date,
2579 'title': info['title'],
2581 'description': info['description'],
# NOTE(review): near-duplicate of SoundcloudIE above, extended to sets
# (playlists): one info dict is produced per track in the set.
2584 class SoundcloudSetIE(InfoExtractor):
2585 """Information extractor for soundcloud.com sets
2586 To access the media, the uid of the song and a stream token
2587 must be extracted from the page source and the script must make
2588 a request to media.soundcloud.com/crossdomain.xml. Then
2589 the media can be grabbed by requesting from an url composed
2590 of the stream token and uid
2593 _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/sets/([\w\d-]+)'
2594 IE_NAME = u'soundcloud:set'
2596 def report_resolve(self, video_id):
2597 """Report information extraction."""
2598 self.to_screen(u'%s: Resolving id' % video_id)
2600 def _real_extract(self, url):
2601 mobj = re.match(self._VALID_URL, url)
2603 raise ExtractorError(u'Invalid URL: %s' % url)
2605 # extract uploader (which is in the url)
2606 uploader = mobj.group(1)
2607 # extract simple title (uploader + slug of song title)
2608 slug_title = mobj.group(2)
2609 simple_title = uploader + u'-' + slug_title
2610 full_title = '%s/sets/%s' % (uploader, slug_title)
2612 self.report_resolve(full_title)
2614 url = 'http://soundcloud.com/%s/sets/%s' % (uploader, slug_title)
2615 resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2616 info_json = self._download_webpage(resolv_url, full_title)
2619 info = json.loads(info_json)
# The resolver reports per-set errors in an 'errors' list; report each one.
2620 if 'errors' in info:
2621 for err in info['errors']:
2622 self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err['error_message']))
2625 self.report_extraction(full_title)
# One streams lookup per track in the set.
2626 for track in info['tracks']:
2627 video_id = track['id']
2629 streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2630 stream_json = self._download_webpage(streams_url, video_id, u'Downloading track info JSON')
2632 self.report_extraction(video_id)
2633 streams = json.loads(stream_json)
2634 mediaURL = streams['http_mp3_128_url']
2639 'uploader': track['user']['username'],
2640 'upload_date': unified_strdate(track['created_at']),
2641 'title': track['title'],
2643 'description': track['description'],
# NOTE(review): elided listing — guards and the returned info-dict opener
# are missing from this view.
2648 class InfoQIE(InfoExtractor):
2649 """Information extractor for infoq.com"""
2650 _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
2652 def _real_extract(self, url):
2653 mobj = re.match(self._VALID_URL, url)
2655 raise ExtractorError(u'Invalid URL: %s' % url)
2657 webpage = self._download_webpage(url, video_id=url)
2658 self.report_extraction(url)
# The real media id is base64-encoded in a JS variable on the page.
2661 mobj = re.search(r"jsclassref ?= ?'([^']*)'", webpage)
2663 raise ExtractorError(u'Unable to extract video url')
2664 real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
2665 video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id
2668 video_title = self._search_regex(r'contentTitle = "(.*?)";',
2671 # Extract description
2672 video_description = self._html_search_regex(r'<meta name="description" content="(.*)"(?:\s*/)?>',
2673 webpage, u'description', fatal=False)
# Derive id/extension from the media filename in the RTMP URL.
2675 video_filename = video_url.split('/')[-1]
2676 video_id, extension = video_filename.split('.')
2682 'upload_date': None,
2683 'title': video_title,
2684 'ext': extension, # Extension is always(?) mp4, but seems to be flv
2686 'description': video_description,
# NOTE(review): marked _WORKING = False (broken IE). Elided listing — try:
# headers and loop openers are missing. The .decode() calls in
# _real_extract assume str-as-bytes semantics (Python 2 era) — presumably
# one reason this extractor is disabled; do not "fix" without the full file.
2691 class MixcloudIE(InfoExtractor):
2692 """Information extractor for www.mixcloud.com"""
2694 _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
2695 _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
2696 IE_NAME = u'mixcloud'
2698 def report_download_json(self, file_id):
2699 """Report JSON download."""
2700 self.to_screen(u'Downloading json')
2702 def get_urls(self, jsonData, fmt, bitrate='best'):
2703 """Get urls from 'audio_formats' section in json"""
2706 bitrate_list = jsonData[fmt]
2707 if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
2708 bitrate = max(bitrate_list) # select highest
2710 url_list = jsonData[fmt][bitrate]
# TypeError means the format entry is a flat list with no per-bitrate map.
2711 except TypeError: # we have no bitrate info.
2712 url_list = jsonData[fmt]
2715 def check_urls(self, url_list):
2716 """Returns 1st active url from list"""
# Probes each candidate URL; network errors skip to the next one.
2717 for url in url_list:
2719 compat_urllib_request.urlopen(url)
2721 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2726 def _print_formats(self, formats):
2727 print('Available formats:')
2728 for fmt in formats.keys():
2729 for b in formats[fmt]:
2731 ext = formats[fmt][b][0]
2732 print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
2733 except TypeError: # we have no bitrate info
2734 ext = formats[fmt][0]
2735 print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
2738 def _real_extract(self, url):
2739 mobj = re.match(self._VALID_URL, url)
2741 raise ExtractorError(u'Invalid URL: %s' % url)
2742 # extract uploader & filename from url
2743 uploader = mobj.group(1).decode('utf-8')
2744 file_id = uploader + "-" + mobj.group(2).decode('utf-8')
2746 # construct API request
2747 file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
2748 # retrieve .json file with links to files
2749 request = compat_urllib_request.Request(file_url)
2751 self.report_download_json(file_url)
2752 jsonData = compat_urllib_request.urlopen(request).read()
2753 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2754 raise ExtractorError(u'Unable to retrieve file: %s' % compat_str(err))
2757 json_data = json.loads(jsonData)
2758 player_url = json_data['player_swf_url']
2759 formats = dict(json_data['audio_formats'])
2761 req_format = self._downloader.params.get('format', None)
2764 if self._downloader.params.get('listformats', None):
2765 self._print_formats(formats)
# 'best' (or no preference): take the first format whose URL list yields a
# live URL; otherwise require the exact requested format.
2768 if req_format is None or req_format == 'best':
2769 for format_param in formats.keys():
2770 url_list = self.get_urls(formats, format_param)
2772 file_url = self.check_urls(url_list)
2773 if file_url is not None:
2776 if req_format not in formats:
2777 raise ExtractorError(u'Format is not available')
2779 url_list = self.get_urls(formats, req_format)
2780 file_url = self.check_urls(url_list)
2781 format_param = req_format
2784 'id': file_id.decode('utf-8'),
2785 'url': file_url.decode('utf-8'),
2786 'uploader': uploader.decode('utf-8'),
2787 'upload_date': None,
2788 'title': json_data['name'],
2789 'ext': file_url.split('.')[-1].decode('utf-8'),
2790 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
2791 'thumbnail': json_data['thumbnail_url'],
2792 'description': json_data['description'],
2793 'player_url': player_url.decode('utf-8'),
# NOTE(review): elided listing — info-dict openers, try: headers, and the
# list-comprehension openers building info['list'] are missing from view.
# Three URL shapes are handled: a specific video, a course page (expanded
# into per-video references), and the root page (expanded into courses).
2796 class StanfordOpenClassroomIE(InfoExtractor):
2797 """Information extractor for Stanford's Open ClassRoom"""
2799 _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
2800 IE_NAME = u'stanfordoc'
2802 def _real_extract(self, url):
2803 mobj = re.match(self._VALID_URL, url)
2805 raise ExtractorError(u'Invalid URL: %s' % url)
2807 if mobj.group('course') and mobj.group('video'): # A specific video
2808 course = mobj.group('course')
2809 video = mobj.group('video')
2811 'id': course + '_' + video,
2813 'upload_date': None,
2816 self.report_extraction(info['id'])
# Per-video metadata lives in a sibling XML file next to the videos.
2817 baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
2818 xmlUrl = baseUrl + video + '.xml'
2820 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
2821 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2822 raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
2823 mdoc = xml.etree.ElementTree.fromstring(metaXml)
2825 info['title'] = mdoc.findall('./title')[0].text
2826 info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
2828 raise ExtractorError(u'Invalid metadata XML file')
2829 info['ext'] = info['url'].rpartition('.')[2]
2831 elif mobj.group('course'): # A course page
2832 course = mobj.group('course')
2837 'upload_date': None,
2840 coursepage = self._download_webpage(url, info['id'],
2841 note='Downloading course info page',
2842 errnote='Unable to download course info page')
2844 info['title'] = self._html_search_regex('<h1>([^<]+)</h1>', coursepage, 'title', default=info['id'])
2846 info['description'] = self._html_search_regex('<description>([^<]+)</description>',
2847 coursepage, u'description', fatal=False)
# Collect each VideoPage link once (orderedSet dedups, preserving order),
# then recurse via self.extract() on every reference.
2849 links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
2852 'type': 'reference',
2853 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
2857 for entry in info['list']:
2858 assert entry['type'] == 'reference'
2859 results += self.extract(entry['url'])
2863 'id': 'Stanford OpenClassroom',
2866 'upload_date': None,
2869 self.report_download_webpage(info['id'])
2870 rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
2872 rootpage = compat_urllib_request.urlopen(rootURL).read()
2873 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2874 raise ExtractorError(u'Unable to download course info page: ' + compat_str(err))
2876 info['title'] = info['id']
# Same reference-expansion pattern as above, one level up (courses).
2878 links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
2881 'type': 'reference',
2882 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
2887 for entry in info['list']:
2888 assert entry['type'] == 'reference'
2889 results += self.extract(entry['url'])
# NOTE(review): elided listing — `if ... is None:` guards, the try: before
# 2935, and the 'performer' extraction (used at 2944) are missing from view.
2892 class MTVIE(InfoExtractor):
2893 """Information extractor for MTV.com"""
2895 _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
2898 def _real_extract(self, url):
2899 mobj = re.match(self._VALID_URL, url)
2901 raise ExtractorError(u'Invalid URL: %s' % url)
# The scheme is optional in _VALID_URL; default to http.
2902 if not mobj.group('proto'):
2903 url = 'http://' + url
2904 video_id = mobj.group('videoid')
2906 webpage = self._download_webpage(url, video_id)
# Song/artist/uri metadata is published via mtv_* meta tags.
2908 song_name = self._html_search_regex(r'<meta name="mtv_vt" content="([^"]+)"/>',
2909 webpage, u'song name', fatal=False)
2911 video_title = self._html_search_regex(r'<meta name="mtv_an" content="([^"]+)"/>',
2914 mtvn_uri = self._html_search_regex(r'<meta name="mtvn_uri" content="([^"]+)"/>',
2915 webpage, u'mtvn_uri', fatal=False)
2917 content_id = self._search_regex(r'MTVN.Player.defaultPlaylistId = ([0-9]+);',
2918 webpage, u'content id', fatal=False)
2920 videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
2921 self.report_extraction(video_id)
2922 request = compat_urllib_request.Request(videogen_url)
2924 metadataXml = compat_urllib_request.urlopen(request).read()
2925 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2926 raise ExtractorError(u'Unable to download video metadata: %s' % compat_str(err))
2928 mdoc = xml.etree.ElementTree.fromstring(metadataXml)
2929 renditions = mdoc.findall('.//rendition')
2931 # For now, always pick the highest quality.
2932 rendition = renditions[-1]
# Format label is built as '<ext>-<width>x<height>_<bitrate>' from the
# rendition attributes; a missing attribute raises below.
2935 _,_,ext = rendition.attrib['type'].partition('/')
2936 format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
2937 video_url = rendition.find('./src').text
2939 raise ExtractorError('Invalid rendition field.')
2944 'uploader': performer,
2945 'upload_date': None,
2946 'title': video_title,
# NOTE(review): elided listing — the `def _gen_sid` header before 2958, the
# `mixed = []` initializer, the format-selection branches (3006-3019), and
# the per-segment info-dict opener before 3036 are missing from view. The
# file-id descrambling is order-critical; keep byte-identical.
2954 class YoukuIE(InfoExtractor):
2955 _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'
# Session id: millisecond timestamp + two bounded random ints, concatenated.
2958 nowTime = int(time.time() * 1000)
2959 random1 = random.randint(1000,1998)
2960 random2 = random.randint(1000,9999)
2962 return "%d%d%d" %(nowTime,random1,random2)
2964 def _get_file_ID_mix_string(self, seed):
# Deterministic LCG-driven shuffle of the alphabet; the same seed always
# yields the same permutation (server and client must agree on it).
2966 source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
2968 for i in range(len(source)):
2969 seed = (seed * 211 + 30031 ) % 65536
2970 index = math.floor(seed / 65536 * len(source) )
2971 mixed.append(source[int(index)])
2972 source.remove(source[int(index)])
2973 #return ''.join(mixed)
2976 def _get_file_id(self, fileId, seed):
# The '*'-separated indices in fileId select characters from the shuffled
# alphabet to reconstruct the real file id.
2977 mixed = self._get_file_ID_mix_string(seed)
2978 ids = fileId.split('*')
2982 realId.append(mixed[int(ch)])
2983 return ''.join(realId)
2985 def _real_extract(self, url):
2986 mobj = re.match(self._VALID_URL, url)
2988 raise ExtractorError(u'Invalid URL: %s' % url)
2989 video_id = mobj.group('ID')
2991 info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id
2993 jsondata = self._download_webpage(info_url, video_id)
2995 self.report_extraction(video_id)
2997 config = json.loads(jsondata)
2999 video_title = config['data'][0]['title']
3000 seed = config['data'][0]['seed']
3002 format = self._downloader.params.get('format', None)
3003 supported_format = list(config['data'][0]['streamfileids'].keys())
# Format negotiation: 'best' prefers hd2 when available; 'worst' picks the
# other end (branch bodies elided from this view).
3005 if format is None or format == 'best':
3006 if 'hd2' in supported_format:
3011 elif format == 'worst':
3019 fileid = config['data'][0]['streamfileids'][format]
3020 keys = [s['k'] for s in config['data'][0]['segs'][format]]
3021 except (UnicodeDecodeError, ValueError, KeyError):
3022 raise ExtractorError(u'Unable to extract info section')
3025 sid = self._gen_sid()
3026 fileid = self._get_file_id(fileid, seed)
3028 #column 8,9 of fileid represent the segment number
3029 #fileid[7:9] should be changed
# One download URL per segment; the segment index is spliced into the file
# id as two uppercase hex digits, with a per-segment key 'k'.
3030 for index, key in enumerate(keys):
3032 temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
3033 download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)
3036 'id': '%s_part%02d' % (video_id, index),
3037 'url': download_url,
3039 'upload_date': None,
3040 'title': video_title,
3043 files_info.append(info)
class XNXXIE(InfoExtractor):
    """Information extractor for xnxx.com"""
    _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'

    VIDEO_URL_RE = r'flv_url=(.*?)&'
    VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
    VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group(1)

        # Get webpage content
        webpage = self._download_webpage(url, video_id)

        # The flv URL is percent-encoded inside the page source.
        video_url = self._search_regex(self.VIDEO_URL_RE,
            webpage, u'video URL')
        video_url = compat_urllib_parse.unquote(video_url)

        video_title = self._html_search_regex(self.VIDEO_TITLE_RE,

        video_thumbnail = self._search_regex(self.VIDEO_THUMB_RE,
            webpage, u'thumbnail', fatal=False)

            'upload_date': None,
            'title': video_title,
            'thumbnail': video_thumbnail,
            'description': None,
class GooglePlusIE(InfoExtractor):
    """Information extractor for plus.google.com."""

    _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
    IE_NAME = u'plus.google'

    def _real_extract(self, url):
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
            raise ExtractorError(u'Invalid URL: %s' % url)

        post_url = mobj.group(0)
        video_id = mobj.group(1)

        video_extension = 'flv'

        # Step 1, Retrieve post webpage to extract further information
        webpage = self._download_webpage(post_url, video_id, u'Downloading entry webpage')

        self.report_extraction(video_id)

        # Extract update date
        upload_date = self._html_search_regex('title="Timestamp">(.*?)</a>',
            webpage, u'upload date', fatal=False)

        # Convert timestring to a format suitable for filename
        upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
        upload_date = upload_date.strftime('%Y%m%d')

        uploader = self._html_search_regex(r'rel\="author".*?>(.*?)</a>',
            webpage, u'uploader', fatal=False)

        # Get the first line for title
        video_title = self._html_search_regex(r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]',
            webpage, 'title', default=u'NA')

        # Step 2, Stimulate clicking the image box to launch video
        video_page = self._search_regex('"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]',
            webpage, u'video page URL')
        webpage = self._download_webpage(video_page, video_id, u'Downloading video page')

        # Extract video links on video page
        """Extract video links of all sizes"""
        pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
        mobj = re.findall(pattern, webpage)
            raise ExtractorError(u'Unable to extract video links')

        # Sort in resolution
        links = sorted(mobj)

        # Choose the lowest of the sort, i.e. highest resolution
        video_url = links[-1]
        # Only get the url. The resolution part in the tuple has no use anymore
        video_url = video_url[-1]
        # Treat escaped \u0026 style hex
            video_url = video_url.decode("unicode_escape")
        except AttributeError: # Python 3
            video_url = bytes(video_url, 'ascii').decode('unicode-escape')

            'uploader': uploader,
            'upload_date': upload_date,
            'title': video_title,
            'ext': video_extension,
class NBAIE(InfoExtractor):
    """Information extractor for nba.com video pages."""
    _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*?)(?:/index\.html)?(?:\?.*)?$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group(1)

        webpage = self._download_webpage(url, video_id)

        # The downloadable mp4 lives on Turner's CDN, keyed by the URL path.
        video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'

        shortened_video_id = video_id.rpartition('/')[2]
        title = self._html_search_regex(r'<meta property="og:title" content="(.*?)"',
            webpage, 'title', default=shortened_video_id).replace('NBA.com: ', '')

        # It isn't there in the HTML it returns to us
        # uploader_date = self._html_search_regex(r'<b>Date:</b> (.*?)</div>', webpage, 'upload_date', fatal=False)

        description = self._html_search_regex(r'<meta name="description" (?:content|value)="(.*?)" />', webpage, 'description', fatal=False)

            'id': shortened_video_id,
            # 'uploader_date': uploader_date,
            'description': description,
class JustinTVIE(InfoExtractor):
    """Information extractor for justin.tv and twitch.tv"""
    # TODO: One broadcast may be split into multiple videos. The key
    # 'broadcast_id' is the same for all parts, and 'broadcast_part'
    # starts at 1 and increases. Can we treat all parts as one video?

    _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
        (?P<channelid>[^/]+)|
        (?:(?:[^/]+)/b/(?P<videoid>[^/]+))|
        (?:(?:[^/]+)/c/(?P<chapterid>[^/]+))

    # Page size used when walking the paged archive API (see _real_extract).
    _JUSTIN_PAGE_LIMIT = 100
    IE_NAME = u'justin.tv'
3213 def report_download_page(self, channel, offset):
3214 """Report attempt to download a single page of videos."""
3215 self.to_screen(u'%s: Downloading video information from %d to %d' %
3216 (channel, offset, offset + self._JUSTIN_PAGE_LIMIT))
    # Return count of items, list of *valid* items
    def _parse_page(self, url, video_id):
        """Download one API page and convert its clips to info dicts."""
        webpage = self._download_webpage(url, video_id,
                u'Downloading video info JSON',
                u'unable to download video info JSON')

        response = json.loads(webpage)
        # On error the API returns a dict with an 'error' key, not a list.
        if type(response) != list:
            error_text = response.get('error', 'unknown error')
            raise ExtractorError(u'Justin.tv API: %s' % error_text)
        for clip in response:
            video_url = clip['video_file_url']
                video_extension = os.path.splitext(video_url)[1][1:]
                # start_time is ISO-ish; keep only YYYYMMDD digits.
                video_date = re.sub('-', '', clip['start_time'][:10])
                video_uploader_id = clip.get('user_id', clip.get('channel_id'))
                video_id = clip['id']
                video_title = clip.get('title', video_id)
                    'title': video_title,
                    'uploader': clip.get('channel_name', video_uploader_id),
                    'uploader_id': video_uploader_id,
                    'upload_date': video_date,
                    'ext': video_extension,
        return (len(response), info)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
            raise ExtractorError(u'invalid URL: %s' % url)

        api_base = 'http://api.justin.tv'

        if mobj.group('channelid'):
            video_id = mobj.group('channelid')
            api = api_base + '/channel/archives/%s.json' % video_id
        elif mobj.group('chapterid'):
            chapter_id = mobj.group('chapterid')

            webpage = self._download_webpage(url, chapter_id)
            m = re.search(r'PP\.archive_id = "([0-9]+)";', webpage)
                raise ExtractorError(u'Cannot find archive of a chapter')
            archive_id = m.group(1)

            api = api_base + '/broadcast/by_chapter/%s.xml' % chapter_id
            chapter_info_xml = self._download_webpage(api, chapter_id,
                    note=u'Downloading chapter information',
                    errnote=u'Chapter information download failed')
            doc = xml.etree.ElementTree.fromstring(chapter_info_xml)
            # Find the archive entry matching the id scraped from the page.
            for a in doc.findall('.//archive'):
                if archive_id == a.find('./id').text:
                raise ExtractorError(u'Could not find chapter in chapter information')

            video_url = a.find('./video_file_url').text
            video_ext = video_url.rpartition('.')[2] or u'flv'

            chapter_api_url = u'https://api.twitch.tv/kraken/videos/c' + chapter_id
            chapter_info_json = self._download_webpage(chapter_api_url, u'c' + chapter_id,
                    note='Downloading chapter metadata',
                    errnote='Download of chapter metadata failed')
            chapter_info = json.loads(chapter_info_json)

            bracket_start = int(doc.find('.//bracket_start').text)
            bracket_end = int(doc.find('.//bracket_end').text)

            # TODO determine start (and probably fix up file)
            #  youtube-dl -v http://www.twitch.tv/firmbelief/c/1757457
            #video_url += u'?start=' + TODO:start_timestamp
            # bracket_start is 13290, but we want 51670615
            self._downloader.report_warning(u'Chapter detected, but we can just download the whole file. '
                    u'Chapter starts at %s and ends at %s' % (formatSeconds(bracket_start), formatSeconds(bracket_end)))

                'id': u'c' + chapter_id,
                'title': chapter_info['title'],
                'thumbnail': chapter_info['preview'],
                'description': chapter_info['description'],
                'uploader': chapter_info['channel']['display_name'],
                'uploader_id': chapter_info['channel']['name'],
            video_id = mobj.group('videoid')
            api = api_base + '/broadcast/by_archive/%s.json' % video_id

        self.report_extraction(video_id)

        limit = self._JUSTIN_PAGE_LIMIT
            self.report_download_page(video_id, offset)

            page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
            page_count, page_info = self._parse_page(page_url, video_id)
            info.extend(page_info)
            # A short page means we have reached the last one.
            if not paged or page_count != limit:
class FunnyOrDieIE(InfoExtractor):
    """Information extractor for funnyordie.com."""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
            raise ExtractorError(u'invalid URL: %s' % url)

        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)

        video_url = self._html_search_regex(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"',
            webpage, u'video URL', flags=re.DOTALL)

        # Try the player heading first, then fall back to the page <title>.
        title = self._html_search_regex((r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>",
            r'<title>(?P<title>[^<]+?)</title>'), webpage, 'title', flags=re.DOTALL)

        video_description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
            webpage, u'description', fatal=False, flags=re.DOTALL)

            'description': video_description,
class SteamIE(InfoExtractor):
    """Information extractor for store.steampowered.com video pages."""
    _VALID_URL = r"""http://store\.steampowered\.com/
                (?P<urltype>video|app)/ #If the page is only for videos or for a game
                (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
    _VIDEO_PAGE_TEMPLATE = 'http://store.steampowered.com/video/%s/'
    # Fixed birth date query bypasses the store's age gate.
    _AGECHECK_TEMPLATE = 'http://store.steampowered.com/agecheck/video/%s/?snr=1_agecheck_agecheck__age-gate&ageDay=1&ageMonth=January&ageYear=1970'
3369 def suitable(cls, url):
3370 """Receives a URL and returns True if suitable for this IE."""
3371 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url, re.VERBOSE)
        gameID = m.group('gameID')

        videourl = self._VIDEO_PAGE_TEMPLATE % gameID
        webpage = self._download_webpage(videourl, gameID)

        # Retry through the age-gate URL when the store asks for a birth date.
        if re.search('<h2>Please enter your birth date to continue:</h2>', webpage) is not None:
            videourl = self._AGECHECK_TEMPLATE % gameID
            self.report_age_confirmation()
            webpage = self._download_webpage(videourl, gameID)

        self.report_extraction(gameID)
        game_title = self._html_search_regex(r'<h2 class="pageheader">(.*?)</h2>',
                webpage, 'game title')

        urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
        mweb = re.finditer(urlRE, webpage)
        namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
        titles = re.finditer(namesRE, webpage)
        thumbsRE = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'
        thumbs = re.finditer(thumbsRE, webpage)
        # Pair each movie entry with its title and thumbnail by page order.
        for vid,vtitle,thumb in zip(mweb,titles,thumbs):
            video_id = vid.group('videoID')
            title = vtitle.group('videoName')
            video_url = vid.group('videoURL')
            video_thumb = thumb.group('thumbnail')
                raise ExtractorError(u'Cannot find video url for %s' % video_id)
                'title': unescapeHTML(title),
                'thumbnail': video_thumb
        return [self.playlist_result(videos, gameID, game_title)]
class UstreamIE(InfoExtractor):
    """Information extractor for ustream.tv recorded videos."""
    _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
    IE_NAME = u'ustream'

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url)
        video_id = m.group('videoID')

        # The flv is served from the CDN directly by video id.
        video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
        webpage = self._download_webpage(url, video_id)

        self.report_extraction(video_id)

        video_title = self._html_search_regex(r'data-title="(?P<title>.+)"',

        uploader = self._html_search_regex(r'data-content-type="channel".*?>(?P<uploader>.*?)</a>',
            webpage, u'uploader', fatal=False, flags=re.DOTALL)

        thumbnail = self._html_search_regex(r'<link rel="image_src" href="(?P<thumb>.*?)"',
            webpage, u'thumbnail', fatal=False)

            'title': video_title,
            'uploader': uploader,
            'thumbnail': thumbnail,
class WorldStarHipHopIE(InfoExtractor):
    """Information extractor for worldstarhiphop.com / worldstarcandy.com."""
    _VALID_URL = r'https?://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
    IE_NAME = u'WorldStarHipHop'

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url)
        video_id = m.group('id')

        webpage_src = self._download_webpage(url, video_id)

        video_url = self._search_regex(r'so\.addVariable\("file","(.*?)"\)',
            webpage_src, u'video URL')

        if 'mp4' in video_url:

        video_title = self._html_search_regex(r"<title>(.*)</title>",
            webpage_src, u'title')

        # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video.
        thumbnail = self._html_search_regex(r'rel="image_src" href="(.*)" />',
            webpage_src, u'thumbnail', fatal=False)
            _title = r"""candytitles.*>(.*)</span>"""
            mobj = re.search(_title, webpage_src)
            if mobj is not None:
                video_title = mobj.group(1)

            'title' : video_title,
            'thumbnail' : thumbnail,
class RBMARadioIE(InfoExtractor):
    """Information extractor for rbmaradio.com show pages."""
    _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url)
        video_id = m.group('videoID')

        webpage = self._download_webpage(url, video_id)

        # Show metadata is embedded as a JS assignment (window.gon.show=...).
        json_data = self._search_regex(r'window\.gon.*?gon\.show=(.+?);$',
            webpage, u'json data', flags=re.MULTILINE)

            data = json.loads(json_data)
        except ValueError as e:
            raise ExtractorError(u'Invalid JSON: ' + str(e))

        video_url = data['akamai_url'] + '&cbr=256'
        url_parts = compat_urllib_parse_urlparse(video_url)
        video_ext = url_parts.path.rpartition('.')[2]
            'title': data['title'],
            'description': data.get('teaser_text'),
            'location': data.get('country_of_origin'),
            'uploader': data.get('host', {}).get('name'),
            'uploader_id': data.get('host', {}).get('slug'),
            'thumbnail': data.get('image', {}).get('large_url_2x'),
            'duration': data.get('duration'),
class YouPornIE(InfoExtractor):
    """Information extractor for youporn.com."""
    # Watch-page URLs carry a numeric id followed by a slugged title.
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'
3524 def _print_formats(self, formats):
3525 """Print all available formats"""
3526 print(u'Available formats:')
3527 print(u'ext\t\tformat')
3528 print(u'---------------------------------')
3529 for format in formats:
3530 print(u'%s\t\t%s' % (format['ext'], format['format']))
    def _specific(self, req_format, formats):
        """Return the format dict whose 'format' field equals req_format."""
            if(x["format"]==req_format):

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('videoid')

        # Age-gate bypass: the site honours this cookie server-side.
        req = compat_urllib_request.Request(url)
        req.add_header('Cookie', 'age_verified=1')
        webpage = self._download_webpage(req, video_id)

        # Get JSON parameters
        json_params = self._search_regex(r'var currentVideo = new Video\((.*)\);', webpage, u'JSON parameters')
            params = json.loads(json_params)
            raise ExtractorError(u'Invalid JSON')

        self.report_extraction(video_id)
            video_title = params['title']
            upload_date = unified_strdate(params['release_date_f'])
            video_description = params['description']
            video_uploader = params['submitted_by']
            thumbnail = params['thumbnails'][0]['image']
            raise ExtractorError('Missing JSON parameter: ' + sys.exc_info()[1])

        # Get all of the formats available
        DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
        download_list_html = self._search_regex(DOWNLOAD_LIST_RE,
            webpage, u'download list').strip()

        # Get all of the links from the page
        LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
        links = re.findall(LINK_RE, download_list_html)
        if(len(links) == 0):
            raise ExtractorError(u'ERROR: no known formats available for video')

        self.to_screen(u'Links found: %d' % len(links))

            # A link looks like this:
            # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
            # A path looks like this:
            # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
            video_url = unescapeHTML( link )
            path = compat_urllib_parse_urlparse( video_url ).path
            extension = os.path.splitext( path )[1][1:]
            # Resolution and bitrate are the first two '_'-separated tokens
            # of the 4th path component, e.g. '480p_370k_8004515'.
            format = path.split('/')[4].split('_')[:2]
            format = "-".join( format )
            # title = u'%s-%s-%s' % (video_title, size, bitrate)

                'uploader': video_uploader,
                'upload_date': upload_date,
                'title': video_title,
                'thumbnail': thumbnail,
                'description': video_description

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)

        req_format = self._downloader.params.get('format', None)
        self.to_screen(u'Format: %s' % req_format)

        if req_format is None or req_format == 'best':
        elif req_format == 'worst':
            return [formats[-1]]
        elif req_format in ('-1', 'all'):
            format = self._specific( req_format, formats )
                raise ExtractorError(u'Requested format not available')
class PornotubeIE(InfoExtractor):
    """Information extractor for pornotube.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('videoid')
        video_title = mobj.group('title')

        # Get webpage content
        webpage = self._download_webpage(url, video_id)

        # The flv URL is embedded in the player configuration.
        VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
        video_url = self._search_regex(VIDEO_URL_RE, webpage, u'video url')
        video_url = compat_urllib_parse.unquote(video_url)

        #Get the uploaded date
        VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
        upload_date = self._html_search_regex(VIDEO_UPLOADED_RE, webpage, u'upload date', fatal=False)
        if upload_date: upload_date = unified_strdate(upload_date)

        info = {'id': video_id,
                'upload_date': upload_date,
                'title': video_title,
class YouJizzIE(InfoExtractor):
    """Information extractor for youjizz.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('videoid')

        # Get webpage content
        webpage = self._download_webpage(url, video_id)

        # Get the video title
        video_title = self._html_search_regex(r'<title>(?P<title>.*)</title>',
            webpage, u'title').strip()

        # Get the embed page
        result = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
            raise ExtractorError(u'ERROR: unable to extract embed page')

        embed_page_url = result.group(0).strip()
        video_id = result.group('videoid')

        # The real video URL only appears on the embed page.
        webpage = self._download_webpage(embed_page_url, video_id)

        video_url = self._search_regex(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);',
            webpage, u'video URL')

        info = {'id': video_id,
                'title': video_title,
                'player_url': embed_page_url}
class EightTracksIE(InfoExtractor):
    """Information extractor for 8tracks.com mixes (returns all tracks)."""
    _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
            raise ExtractorError(u'Invalid URL: %s' % url)
        playlist_id = mobj.group('id')

        webpage = self._download_webpage(url, playlist_id)

        # Mix metadata is embedded as a JS assignment (PAGE.mix = {...};).
        json_like = self._search_regex(r"PAGE.mix = (.*?);\n", webpage, u'trax information', flags=re.DOTALL)
        data = json.loads(json_like)

        # The play API requires a random session token.
        session = str(random.randint(0, 1000000000))
        track_count = data['tracks_count']
        first_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
        next_url = first_url
        # Walk the play API one track at a time until at_last_track.
        for i in itertools.count():
            api_json = self._download_webpage(next_url, playlist_id,
                note=u'Downloading song information %s/%s' % (str(i+1), track_count),
                errnote=u'Failed to download song information')
            api_data = json.loads(api_json)
            track_data = api_data[u'set']['track']
                'id': track_data['id'],
                'url': track_data['track_file_stream_url'],
                'title': track_data['performer'] + u' - ' + track_data['name'],
                'raw_title': track_data['name'],
                'uploader_id': data['user']['login'],
            if api_data['set']['at_last_track']:
            next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
class KeekIE(InfoExtractor):
    """Information extractor for keek.com."""
    _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url)
        video_id = m.group('videoID')

        # Media and thumbnail URLs are derived directly from the video id.
        video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
        thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id
        webpage = self._download_webpage(url, video_id)

        video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',

        uploader = self._html_search_regex(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>',
            webpage, u'uploader', fatal=False)

            'title': video_title,
            'thumbnail': thumbnail,
            'uploader': uploader
class TEDIE(InfoExtractor):
    """Information extractor for ted.com talks and playlists."""
    _VALID_URL=r'''http://www\.ted\.com/
        ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
        ((?P<type_talk>talks)) # We have a simple talk
        (/lang/(.*?))? # The url may contain the language
        /(?P<name>\w+) # Here goes the name and then ".html"
3784 def suitable(cls, url):
3785 """Receives a URL and returns True if suitable for this IE."""
3786 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
    def _real_extract(self, url):
        m=re.match(self._VALID_URL, url, re.VERBOSE)
        # Single talk vs. playlist is decided by the matched URL group.
        if m.group('type_talk'):
            return [self._talk_info(url)]
            playlist_id=m.group('playlist_id')
            name=m.group('name')
            self.to_screen(u'Getting info of playlist %s: "%s"' % (playlist_id,name))
            return [self._playlist_videos_info(url,name,playlist_id)]

    def _playlist_videos_info(self,url,name,playlist_id=0):
        '''Returns the videos of the playlist'''
            <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
            ([.\s]*?)data-playlist_item_id="(\d+)"
            ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
        video_name_RE=r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
        webpage=self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
        m_videos=re.finditer(video_RE,webpage,re.VERBOSE)
        m_names=re.finditer(video_name_RE,webpage)

        playlist_title = self._html_search_regex(r'div class="headline">\s*?<h1>\s*?<span>(.*?)</span>',
                webpage, 'playlist title')

        # Each talk becomes a url_result delegated back to this extractor.
        playlist_entries = []
        for m_video, m_name in zip(m_videos,m_names):
            video_id=m_video.group('video_id')
            talk_url='http://www.ted.com%s' % m_name.group('talk_url')
            playlist_entries.append(self.url_result(talk_url, 'TED'))
        return self.playlist_result(playlist_entries, playlist_id = playlist_id, playlist_title = playlist_title)

    def _talk_info(self, url, video_id=0):
        """Return the video for the talk in the url"""
        m = re.match(self._VALID_URL, url,re.VERBOSE)
        video_name = m.group('name')
        webpage = self._download_webpage(url, video_id, 'Downloading \"%s\" page' % video_name)
        self.report_extraction(video_name)
        # If the url includes the language we get the title translated
        title = self._html_search_regex(r'<span id="altHeadline" >(?P<title>.*)</span>',
        json_data = self._search_regex(r'<script.*?>var talkDetails = ({.*?})</script>',
                webpage, 'json data')
        info = json.loads(json_data)
        desc = self._html_search_regex(r'<div class="talk-intro">.*?<p.*?>(.*?)</p>',
                webpage, 'description', flags = re.DOTALL)

        thumbnail = self._search_regex(r'</span>[\s.]*</div>[\s.]*<img src="(.*?)"',
                webpage, 'thumbnail')
            # The last htmlStreams entry is used as the download URL.
            'url': info['htmlStreams'][-1]['file'],
            'thumbnail': thumbnail,
            'description': desc,
class MySpassIE(InfoExtractor):
    """Information extractor for myspass.de."""
    _VALID_URL = r'http://www.myspass.de/.*'

    def _real_extract(self, url):
        META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'

        # video id is the last path element of the URL
        # usually there is a trailing slash, so also try the second but last
        url_path = compat_urllib_parse_urlparse(url).path
        url_parent_path, video_id = os.path.split(url_path)
            _, video_id = os.path.split(url_parent_path)

        # get metadata
        metadata_url = META_DATA_URL_TEMPLATE % video_id
        metadata_text = self._download_webpage(metadata_url, video_id)
        metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))

        # extract values from metadata
        url_flv_el = metadata.find('url_flv')
        if url_flv_el is None:
            raise ExtractorError(u'Unable to extract download url')
        video_url = url_flv_el.text
        extension = os.path.splitext(video_url)[1][1:]
        title_el = metadata.find('title')
        if title_el is None:
            raise ExtractorError(u'Unable to extract title')
        title = title_el.text
        format_id_el = metadata.find('format_id')
        if format_id_el is None:
            format = format_id_el.text
        # description and thumbnail are optional in the metadata XML
        description_el = metadata.find('description')
        if description_el is not None:
            description = description_el.text
        imagePreview_el = metadata.find('imagePreview')
        if imagePreview_el is not None:
            thumbnail = imagePreview_el.text
            'thumbnail': thumbnail,
            'description': description
class SpiegelIE(InfoExtractor):
    """Information extractor for spiegel.de videos."""
    _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url)
        video_id = m.group('videoID')

        webpage = self._download_webpage(url, video_id)

        video_title = self._html_search_regex(r'<div class="module-title">(.*?)</div>',

        # Per-video XML manifest lists the available files.
        xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
        xml_code = self._download_webpage(xml_url, video_id,
                note=u'Downloading XML', errnote=u'Failed to download XML')

        idoc = xml.etree.ElementTree.fromstring(xml_code)
        # The last <type> entry in the manifest is used.
        last_type = idoc[-1]
        filename = last_type.findall('./filename')[0].text
        duration = float(last_type.findall('./duration')[0].text)

        video_url = 'http://video2.spiegel.de/flash/' + filename
        video_ext = filename.rpartition('.')[2]
            'title': video_title,
            'duration': duration,
class LiveLeakIE(InfoExtractor):
    """Information extractor for liveleak.com."""
    _VALID_URL = r'^(?:http?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
    IE_NAME = u'liveleak'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('video_id')

        webpage = self._download_webpage(url, video_id)

        video_url = self._search_regex(r'file: "(.*?)",',
            webpage, u'video URL')

        # Strip the site branding from the og:title value.
        video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
            webpage, u'title').replace('LiveLeak.com -', '').strip()

        video_description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
            webpage, u'description', fatal=False)

        video_uploader = self._html_search_regex(r'By:.*?(\w+)</a>',
            webpage, u'uploader', fatal=False)

            'title': video_title,
            'description': video_description,
            'uploader': video_uploader
class ARDIE(InfoExtractor):
    """Information extractor for ardmediathek.de / mediathek.daserste.de."""
    _VALID_URL = r'^(?:https?://)?(?:(?:www\.)?ardmediathek\.de|mediathek\.daserste\.de)/(?:.*/)(?P<video_id>[^/\?]+)(?:\?.*)?'
    _TITLE = r'<h1(?: class="boxTopHeadline")?>(?P<title>.*)</h1>'
    _MEDIA_STREAM = r'mediaCollection\.addMediaStream\((?P<media_type>\d+), (?P<quality>\d+), "(?P<rtmp_url>[^"]*)", "(?P<video_url>[^"]*)", "[^"]*"\)'

    def _real_extract(self, url):
        # determine video id from url
        m = re.match(self._VALID_URL, url)

        # Prefer the numeric documentId query parameter when present.
        numid = re.search(r'documentId=([0-9]+)', url)
            video_id = numid.group(1)
            video_id = m.group('video_id')

        # determine title and media streams from webpage
        html = self._download_webpage(url, video_id)
        title = re.search(self._TITLE, html).group('title')
        streams = [m.groupdict() for m in re.finditer(self._MEDIA_STREAM, html)]
            # No streams usually means the page is age-restricted ("fsk").
            assert '"fsk"' in html
            raise ExtractorError(u'This video is only available after 8:00 pm')

        # choose default media type and highest quality for now
        stream = max([s for s in streams if int(s["media_type"]) == 0],
                key=lambda s: int(s["quality"]))

        # there's two possibilities: RTMP stream or HTTP download
        info = {'id': video_id, 'title': title, 'ext': 'mp4'}
        if stream['rtmp_url']:
            self.to_screen(u'RTMP download detected')
            assert stream['video_url'].startswith('mp4:')
            info["url"] = stream["rtmp_url"]
            info["play_path"] = stream['video_url']
            assert stream["video_url"].endswith('.mp4')
            info["url"] = stream["video_url"]
class ZDFIE(InfoExtractor):
    """Information extractor for www.zdf.de (ZDFmediathek)."""
    _VALID_URL = r'^http://www\.zdf\.de\/ZDFmediathek\/(.*beitrag\/video\/)(?P<video_id>[^/\?]+)(?:\?.*)?'
    _TITLE = r'<h1(?: class="beitragHeadline")?>(?P<title>.*)</h1>'
    _MEDIA_STREAM = r'<a href="(?P<video_url>.+(?P<media_type>.streaming).+/zdf/(?P<quality>[^\/]+)/[^"]*)".+class="play".+>'
    _MMS_STREAM = r'href="(?P<video_url>mms://[^"]*)"'
    _RTSP_STREAM = r'(?P<video_url>rtsp://[^"]*.mp4)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('video_id')

        html = self._download_webpage(url, video_id)
        streams = [m.groupdict() for m in re.finditer(self._MEDIA_STREAM, html)]
            raise ExtractorError(u'No media url found.')

        # s['media_type'] == 'wstreaming' -> use 'Windows Media Player' and mms url
        # s['media_type'] == 'hstreaming' -> use 'Quicktime' and rtsp url
        # choose first/default media type and highest quality for now
        for s in streams: #find 300 - dsl1000mbit
            if s['quality'] == '300' and s['media_type'] == 'wstreaming':
        for s in streams: #find veryhigh - dsl2000mbit
            if s['quality'] == 'veryhigh' and s['media_type'] == 'wstreaming': # 'hstreaming' - rtsp is not working
            raise ExtractorError(u'No stream found.')

        # The chosen stream URL points to an ASX-style wrapper document.
        media_link = self._download_webpage(stream_['video_url'], video_id,'Get stream URL')

        self.report_extraction(video_id)
        mobj = re.search(self._TITLE, html)
            raise ExtractorError(u'Cannot extract title')
        title = unescapeHTML(mobj.group('title'))

        # Prefer mms://, fall back to rtsp://.
        mobj = re.search(self._MMS_STREAM, media_link)
            mobj = re.search(self._RTSP_STREAM, media_link)
                raise ExtractorError(u'Cannot extract mms:// or rtsp:// URL')
        mms_url = mobj.group('video_url')

        mobj = re.search('(.*)[.](?P<ext>[^.]+)', mms_url)
            raise ExtractorError(u'Cannot extract extention')
        ext = mobj.group('ext')

        return [{'id': video_id,
# Tumblr video post extractor.
# NOTE(review): this excerpt appears to be missing lines (e.g. the
# `if video is None:` guard and the tail of the return dict) — verify
# against the complete file.
class TumblrIE(InfoExtractor):
_VALID_URL = r'http://(?P<blog_name>.*?)\.tumblr\.com/((post)|(video))/(?P<id>\d*)/(.*?)'
def _real_extract(self, url):
m_url = re.match(self._VALID_URL, url)
video_id = m_url.group('id')
blog = m_url.group('blog_name')
# Normalize to the canonical post URL before downloading.
url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id)
webpage = self._download_webpage(url, video_id)
# The video URL is embedded with \x22-escaped quotes in inline JS.
re_video = r'src=\\x22(?P<video_url>http://%s\.tumblr\.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P<ext>.*?)\\x22' % (blog, video_id)
video = re.search(re_video, webpage)
raise ExtractorError(u'Unable to extract video')
video_url = video.group('video_url')
ext = video.group('ext')
video_thumbnail = self._search_regex(r'posters(.*?)\[\\x22(?P<thumb>.*?)\\x22',
webpage, u'thumbnail', fatal=False) # We pick the first poster
if video_thumbnail: video_thumbnail = video_thumbnail.replace('\\', '')
# The only place where you can get a title, it's not complete,
# but searching in other places doesn't work for all videos
video_title = self._html_search_regex(r'<title>(?P<title>.*?)</title>',
webpage, u'title', flags=re.DOTALL)
return [{'id': video_id,
'title': video_title,
'thumbnail': video_thumbnail,
# Bandcamp free-track extractor.
# NOTE(review): `id` shadows the builtin; renaming is avoided here because
# the value is interpolated into request URLs below. Some lines of the
# final track_info dict appear to be elided in this excerpt.
class BandcampIE(InfoExtractor):
_VALID_URL = r'http://.*?\.bandcamp\.com/track/(?P<title>.*)'
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
title = mobj.group('title')
webpage = self._download_webpage(url, title)
# We get the link to the free download page
m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage)
if m_download is None:
raise ExtractorError(u'No free songs found')
download_link = m_download.group(1)
# Track id comes from the inline TralbumData JS object.
id = re.search(r'var TralbumData = {(.*?)id: (?P<id>\d*?)$',
webpage, re.MULTILINE|re.DOTALL).group('id')
download_webpage = self._download_webpage(download_link, id,
'Downloading free downloads page')
# We get the dictionary of the track from some javascrip code
info = re.search(r'items: (.*?),$',
download_webpage, re.MULTILINE).group(1)
info = json.loads(info)[0]
# We pick mp3-320 for now, until format selection can be easily implemented.
mp3_info = info[u'downloads'][u'mp3-320']
# If we try to use this url it says the link has expired
initial_url = mp3_info[u'url']
re_url = r'(?P<server>http://(.*?)\.bandcamp\.com)/download/track\?enc=mp3-320&fsig=(?P<fsig>.*?)&id=(?P<id>.*?)&ts=(?P<ts>.*)$'
m_url = re.match(re_url, initial_url)
#We build the url we will use to get the final track url
# This url is build in Bandcamp in the script download_bunde_*.js
request_url = '%s/statdownload/track?enc=mp3-320&fsig=%s&id=%s&ts=%s&.rand=665028774616&.vrs=1' % (m_url.group('server'), m_url.group('fsig'), id, m_url.group('ts'))
final_url_webpage = self._download_webpage(request_url, id, 'Requesting download url')
# If we could correctly generate the .rand field the url would be
#in the "download_url" key
final_url = re.search(r'"retry_url":"(.*?)"', final_url_webpage).group(1)
track_info = {'id':id,
'title' : info[u'title'],
'thumbnail' : info[u'thumb_url'],
'uploader' : info[u'artist']
class RedTubeIE(InfoExtractor):
"""Information Extractor for redtube"""
# NOTE(review): the `if mobj is None:` guard and the tail of the return
# dict appear to be elided in this excerpt.
_VALID_URL = r'(?:http://)?(?:www\.)?redtube\.com/(?P<id>[0-9]+)'
def _real_extract(self,url):
mobj = re.match(self._VALID_URL, url)
raise ExtractorError(u'Invalid URL: %s' % url)
video_id = mobj.group('id')
video_extension = 'mp4'
webpage = self._download_webpage(url, video_id)
self.report_extraction(video_id)
# Direct mp4 URL from the HTML5 <source> tag.
video_url = self._html_search_regex(r'<source src="(.+?)" type="video/mp4">',
webpage, u'video URL')
video_title = self._html_search_regex('<h1 class="videoTitle slidePanelMovable">(.+?)</h1>',
'ext': video_extension,
'title': video_title,
class InaIE(InfoExtractor):
"""Information Extractor for Ina.fr"""
# NOTE(review): guard clauses and the tail of the return dict appear to be
# elided in this excerpt.
_VALID_URL = r'(?:http://)?(?:www\.)?ina\.fr/video/(?P<id>I[0-9]+)/.*'
def _real_extract(self,url):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('id')
# Metadata (including the mp4 URL) lives in a per-video MRSS feed.
mrss_url='http://player.ina.fr/notices/%s.mrss' % video_id
video_extension = 'mp4'
webpage = self._download_webpage(mrss_url, video_id)
self.report_extraction(video_id)
video_url = self._html_search_regex(r'<media:player url="(?P<mp4url>http://mp4.ina.fr/[^"]+\.mp4)',
webpage, u'video URL')
# Title is wrapped in a CDATA section inside <title>.
video_title = self._search_regex(r'<title><!\[CDATA\[(?P<titre>.*?)]]></title>',
'ext': video_extension,
'title': video_title,
class HowcastIE(InfoExtractor):
"""Information Extractor for Howcast.com"""
# NOTE(review): some lines (guards, parts of the return dict) appear to be
# elided in this excerpt.
_VALID_URL = r'(?:https?://)?(?:www\.)?howcast\.com/videos/(?P<id>\d+)'
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('id')
# Re-fetch via the canonical URL built from the id.
webpage_url = 'http://www.howcast.com/videos/' + video_id
webpage = self._download_webpage(webpage_url, video_id)
self.report_extraction(video_id)
video_url = self._search_regex(r'\'?file\'?: "(http://mobile-media\.howcast\.com/[0-9]+\.mp4)',
webpage, u'video URL')
# og:title meta tag; accepts either quote style around the content attr.
video_title = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') property=\'og:title\'',
video_description = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') name=\'description\'',
webpage, u'description', fatal=False)
thumbnail = self._html_search_regex(r'<meta content=\'(.+?)\' property=\'og:image\'',
webpage, u'thumbnail', fatal=False)
'title': video_title,
'description': video_description,
'thumbnail': thumbnail,
class VineIE(InfoExtractor):
"""Information Extractor for Vine.co"""
# NOTE(review): some lines (guards, parts of the return dict) appear to be
# elided in this excerpt.
_VALID_URL = r'(?:https?://)?(?:www\.)?vine\.co/v/(?P<id>\w+)'
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('id')
# Canonical https URL rebuilt from the id.
webpage_url = 'https://vine.co/v/' + video_id
webpage = self._download_webpage(webpage_url, video_id)
self.report_extraction(video_id)
# Direct stream URL from the twitter:player:stream meta tag.
video_url = self._html_search_regex(r'<meta property="twitter:player:stream" content="(.+?)"',
webpage, u'video URL')
video_title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',
thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)(\?.*?)?"',
webpage, u'thumbnail', fatal=False)
uploader = self._html_search_regex(r'<div class="user">.*?<h2>(.+?)</h2>',
webpage, u'uploader', fatal=False, flags=re.DOTALL)
'title': video_title,
'thumbnail': thumbnail,
'uploader': uploader,
class FlickrIE(InfoExtractor):
"""Information Extractor for Flickr videos"""
# NOTE(review): some lines (guards, parts of the return dict) appear to be
# elided in this excerpt.
_VALID_URL = r'(?:https?://)?(?:www\.)?flickr\.com/photos/(?P<uploader_id>[\w\-_@]+)/(?P<id>\d+).*'
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('id')
video_uploader_id = mobj.group('uploader_id')
webpage_url = 'http://www.flickr.com/photos/' + video_uploader_id + '/' + video_id
webpage = self._download_webpage(webpage_url, video_id)
# Per-photo secret required by the video XML endpoints below.
secret = self._search_regex(r"photo_secret: '(\w+)'", webpage, u'secret')
# First XML gives a node id ...
first_url = 'https://secure.flickr.com/apps/video/video_mtl_xml.gne?v=x&photo_id=' + video_id + '&secret=' + secret + '&bitrate=700&target=_self'
first_xml = self._download_webpage(first_url, video_id, 'Downloading first data webpage')
node_id = self._html_search_regex(r'<Item id="id">(\d+-\d+)</Item>',
first_xml, u'node_id')
# ... which keys the playlist XML containing the actual stream location.
second_url = 'https://secure.flickr.com/video_playlist.gne?node_id=' + node_id + '&tech=flash&mode=playlist&bitrate=700&secret=' + secret + '&rd=video.yahoo.com&noad=1'
second_xml = self._download_webpage(second_url, video_id, 'Downloading second data webpage')
self.report_extraction(video_id)
mobj = re.search(r'<STREAM APP="(.+?)" FULLPATH="(.+?)"', second_xml)
raise ExtractorError(u'Unable to extract video url')
# Final URL is APP + unescaped FULLPATH.
video_url = mobj.group(1) + unescapeHTML(mobj.group(2))
video_title = self._html_search_regex(r'<meta property="og:title" content=(?:"([^"]+)"|\'([^\']+)\')',
webpage, u'video title')
video_description = self._html_search_regex(r'<meta property="og:description" content=(?:"([^"]+)"|\'([^\']+)\')',
webpage, u'description', fatal=False)
thumbnail = self._html_search_regex(r'<meta property="og:image" content=(?:"([^"]+)"|\'([^\']+)\')',
webpage, u'thumbnail', fatal=False)
'title': video_title,
'description': video_description,
'thumbnail': thumbnail,
'uploader_id': video_uploader_id,
# Team Coco (teamcoco.com) extractor.
# NOTE(review): some lines (guards, parts of the return dict) appear to be
# elided in this excerpt.
class TeamcocoIE(InfoExtractor):
_VALID_URL = r'http://teamcoco\.com/video/(?P<url_title>.*)'
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
raise ExtractorError(u'Invalid URL: %s' % url)
url_title = mobj.group('url_title')
webpage = self._download_webpage(url, url_title)
# Numeric video id lives in the page markup, not the URL.
video_id = self._html_search_regex(r'<article class="video" data-id="(\d+?)"',
webpage, u'video id')
self.report_extraction(video_id)
video_title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',
thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)"',
webpage, u'thumbnail', fatal=False)
video_description = self._html_search_regex(r'<meta property="og:description" content="(.*?)"',
webpage, u'description', fatal=False)
# Separate CVP XML document carries the actual file URLs.
data_url = 'http://teamcoco.com/cvp/2.0/%s.xml' % video_id
data = self._download_webpage(data_url, video_id, 'Downloading data webpage')
video_url = self._html_search_regex(r'<file type="high".*?>(.*?)</file>',
'title': video_title,
'thumbnail': thumbnail,
'description': video_description,
class XHamsterIE(InfoExtractor):
"""Information Extractor for xHamster"""
# NOTE(review): some lines (guards, else-branches, parts of the return
# dict) appear to be elided in this excerpt.
_VALID_URL = r'(?:http://)?(?:www.)?xhamster\.com/movies/(?P<id>[0-9]+)/.*\.html'
def _real_extract(self,url):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('id')
mrss_url = 'http://xhamster.com/movies/%s/.html' % video_id
webpage = self._download_webpage(mrss_url, video_id)
# Server + file come from inline player config JS.
mobj = re.search(r'\'srv\': \'(?P<server>[^\']*)\',\s*\'file\': \'(?P<file>[^\']+)\',', webpage)
raise ExtractorError(u'Unable to extract media URL')
if len(mobj.group('server')) == 0:
# Empty server means 'file' is already a full (percent-encoded) URL.
video_url = compat_urllib_parse.unquote(mobj.group('file'))
video_url = mobj.group('server')+'/key='+mobj.group('file')
video_extension = video_url.split('.')[-1]
video_title = self._html_search_regex(r'<title>(?P<title>.+?) - xHamster\.com</title>',
# Can't see the description anywhere in the UI
# video_description = self._html_search_regex(r'<span>Description: </span>(?P<description>[^<]+)',
# webpage, u'description', fatal=False)
# if video_description: video_description = unescapeHTML(video_description)
# Upload date from a tooltip hint like "2012-01-31 12:00:00 CET".
mobj = re.search(r'hint=\'(?P<upload_date_Y>[0-9]{4})-(?P<upload_date_m>[0-9]{2})-(?P<upload_date_d>[0-9]{2}) [0-9]{2}:[0-9]{2}:[0-9]{2} [A-Z]{3,4}\'', webpage)
video_upload_date = mobj.group('upload_date_Y')+mobj.group('upload_date_m')+mobj.group('upload_date_d')
video_upload_date = None
self._downloader.report_warning(u'Unable to extract upload date')
video_uploader_id = self._html_search_regex(r'<a href=\'/user/[^>]+>(?P<uploader_id>[^<]+)',
webpage, u'uploader id', default=u'anonymous')
video_thumbnail = self._search_regex(r'\'image\':\'(?P<thumbnail>[^\']+)\'',
webpage, u'thumbnail', fatal=False)
'ext': video_extension,
'title': video_title,
# 'description': video_description,
'upload_date': video_upload_date,
'uploader_id': video_uploader_id,
'thumbnail': video_thumbnail
class HypemIE(InfoExtractor):
"""Information Extractor for hypem"""
# NOTE(review): some lines (guards, try/except bodies, `key` extraction and
# the final return) appear to be elided in this excerpt.
_VALID_URL = r'(?:http://)?(?:www\.)?hypem\.com/track/([^/]+)/([^/]+)'
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
raise ExtractorError(u'Invalid URL: %s' % url)
track_id = mobj.group(1)
# ax/ts query params are required by the site; ts is the current time.
data = { 'ax': 1, 'ts': time.time() }
data_encoded = compat_urllib_parse.urlencode(data)
complete_url = url + "?" + data_encoded
request = compat_urllib_request.Request(complete_url)
response, urlh = self._download_webpage_handle(request, track_id, u'Downloading webpage with the url')
# Session cookie must be replayed on the /serve/source request below.
cookie = urlh.headers.get('Set-Cookie', '')
self.report_extraction(track_id)
html_tracks = self._html_search_regex(r'<script type="application/json" id="displayList-data">(.*?)</script>',
response, u'tracks', flags=re.MULTILINE|re.DOTALL).strip()
track_list = json.loads(html_tracks)
track = track_list[u'tracks'][0]
raise ExtractorError(u'Hypemachine contained invalid JSON.')
track_id = track[u"id"]
artist = track[u"artist"]
title = track[u"song"]
# NOTE(review): `key` is referenced here but its assignment is not visible
# in this excerpt — presumably taken from the track dict; confirm.
serve_url = "http://hypem.com/serve/source/%s/%s" % (compat_str(track_id), compat_str(key))
request = compat_urllib_request.Request(serve_url, "" , {'Content-Type': 'application/json'})
request.add_header('cookie', cookie)
song_data_json = self._download_webpage(request, track_id, u'Downloading metadata')
song_data = json.loads(song_data_json)
raise ExtractorError(u'Hypemachine contained invalid JSON.')
final_url = song_data[u"url"]
class Vbox7IE(InfoExtractor):
"""Information Extractor for Vbox7"""
# NOTE(review): some lines (guards, parts of the return dict) appear to be
# elided in this excerpt.
_VALID_URL = r'(?:http://)?(?:www\.)?vbox7\.com/play:([^/]+)'
def _real_extract(self,url):
mobj = re.match(self._VALID_URL, url)
raise ExtractorError(u'Invalid URL: %s' % url)
video_id = mobj.group(1)
# The play page redirects via a JS `window.location` assignment.
redirect_page, urlh = self._download_webpage_handle(url, video_id)
new_location = self._search_regex(r'window\.location = \'(.*)\';', redirect_page, u'redirect location')
redirect_url = urlh.geturl() + new_location
webpage = self._download_webpage(redirect_url, video_id, u'Downloading redirect page')
title = self._html_search_regex(r'<title>(.*)</title>',
webpage, u'title').split('/')[0].strip()
# POST to the info endpoint returns "key=value&key=value" pairs.
info_url = "http://vbox7.com/play/magare.do"
data = compat_urllib_parse.urlencode({'as3':'1','vid':video_id})
info_request = compat_urllib_request.Request(info_url, data)
info_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
info_response = self._download_webpage(info_request, video_id, u'Downloading info webpage')
if info_response is None:
raise ExtractorError(u'Unable to extract the media url')
# Split the ampersand-separated pairs and keep the values.
(final_url, thumbnail_url) = map(lambda x: x.split('=')[1], info_response.split('&'))
'thumbnail': thumbnail_url,
4496 class GametrailersIE(InfoExtractor):
4497 _VALID_URL = r'http://www.gametrailers.com/(?P<type>videos|reviews|full-episodes)/(?P<id>.*?)/(?P<title>.*)'
4499 def _real_extract(self, url):
4500 mobj = re.match(self._VALID_URL, url)
4502 raise ExtractorError(u'Invalid URL: %s' % url)
4503 video_id = mobj.group('id')
4504 video_type = mobj.group('type')
4505 webpage = self._download_webpage(url, video_id)
4506 if video_type == 'full-episodes':
4507 mgid_re = r'data-video="(?P<mgid>mgid:.*?)"'
4509 mgid_re = r'data-contentId=\'(?P<mgid>mgid:.*?)\''
4510 mgid = self._search_regex(mgid_re, webpage, u'mgid')
4511 data = compat_urllib_parse.urlencode({'uri': mgid, 'acceptMethods': 'fms'})
4513 info_page = self._download_webpage('http://www.gametrailers.com/feeds/mrss?' + data,
4514 video_id, u'Downloading video info')
4515 links_webpage = self._download_webpage('http://www.gametrailers.com/feeds/mediagen/?' + data,
4516 video_id, u'Downloading video urls info')
4518 self.report_extraction(video_id)
4519 info_re = r'''<title><!\[CDATA\[(?P<title>.*?)\]\]></title>.*
4520 <description><!\[CDATA\[(?P<description>.*?)\]\]></description>.*
4522 <url>(?P<thumb>.*?)</url>.*
4525 m_info = re.search(info_re, info_page, re.VERBOSE|re.DOTALL)
4527 raise ExtractorError(u'Unable to extract video info')
4528 video_title = m_info.group('title')
4529 video_description = m_info.group('description')
4530 video_thumb = m_info.group('thumb')
4532 m_urls = list(re.finditer(r'<src>(?P<url>.*)</src>', links_webpage))
4533 if m_urls is None or len(m_urls) == 0:
4534 raise ExtractError(u'Unable to extrat video url')
4535 # They are sorted from worst to best quality
4536 video_url = m_urls[-1].group('url')
4538 return {'url': video_url,
4540 'title': video_title,
4541 # Videos are actually flv not mp4
4543 'thumbnail': video_thumb,
4544 'description': video_description,
# Factory for the ordered list of all supported extractor instances.
# NOTE(review): the docstring's closing quotes and most of the returned
# list appear to be elided in this excerpt; no comments are inserted below
# to avoid altering the (apparently still-open) string literal.
def gen_extractors():
""" Return a list of an instance of every supported extractor.
The order does matter; the first extractor matched is the one handling the URL.
YoutubePlaylistIE(),
StanfordOpenClassroomIE(),
WorldStarHipHopIE(),
def get_info_extractor(ie_name):
    """Look up an extractor class by its short name.

    `ie_name` is the class name without the trailing 'IE' suffix (e.g.
    'Youtube' -> YoutubeIE). Raises KeyError when no matching extractor
    class is defined in this module.
    """
    return globals()['%sIE' % ie_name]