2 # -*- coding: utf-8 -*-
4 from __future__ import absolute_import
15 import xml.etree.ElementTree
26 class InfoExtractor(object):
27 """Information Extractor class.
29 Information extractors are the classes that, given a URL, extract
30 information about the video (or videos) the URL refers to. This
31 information includes the real video URL, the video title, author and
32 others. The information is stored in a dictionary which is then
33 passed to the FileDownloader. The FileDownloader processes this
34 information possibly downloading the video to the file system, among
35 other possible outcomes.
37 The dictionaries must include the following fields:
41 title: Video title, unescaped.
42 ext: Video filename extension.
44 The following fields are optional:
46 format: The video format, defaults to ext (used for --get-format)
47 thumbnail: Full URL to a video thumbnail image.
48 description: One-line video description.
49 uploader: Full name of the video uploader.
50 upload_date: Video upload date (YYYYMMDD).
51 uploader_id: Nickname or id of the video uploader.
52 location: Physical location of the video.
53 player_url: SWF Player URL (used for rtmpdump).
54 subtitles: The subtitle file contents.
55 urlhandle: [internal] The urlHandle to be used to download the file,
56 like returned by urllib.request.urlopen
58 The fields should all be Unicode strings.
60 Subclasses of this one should re-define the _real_initialize() and
61 _real_extract() methods and define a _VALID_URL regexp.
62 Probably, they should also be added to the list of extractors.
64 _real_extract() must return a *list* of information dictionaries as
67 Finally, the _WORKING attribute should be set to False for broken IEs
68 in order to warn the users and skip the tests.
75 def __init__(self, downloader=None):
76 """Constructor. Receives an optional downloader."""
78 self.set_downloader(downloader)
81 def suitable(cls, url):
82 """Receives a URL and returns True if suitable for this IE."""
83 return re.match(cls._VALID_URL, url) is not None
87 """Getter method for _WORKING."""
91 """Initializes an instance (authentication, etc)."""
93 self._real_initialize()
96 def extract(self, url):
97 """Extracts URL information and returns it in list of dicts."""
99 return self._real_extract(url)
101 def set_downloader(self, downloader):
102 """Sets the downloader for this IE."""
103 self._downloader = downloader
105 def _real_initialize(self):
106 """Real initialization process. Redefine in subclasses."""
109 def _real_extract(self, url):
110 """Real extraction process. Redefine in subclasses."""
115 return type(self).__name__[:-2]
117 def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
118 """ Returns the response handle """
120 self.report_download_webpage(video_id)
121 elif note is not False:
122 self.to_screen(u'%s: %s' % (video_id, note))
124 return compat_urllib_request.urlopen(url_or_request)
125 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
127 errnote = u'Unable to download webpage'
128 raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2])
130 def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None):
131 """ Returns a tuple (page content as string, URL handle) """
132 urlh = self._request_webpage(url_or_request, video_id, note, errnote)
133 content_type = urlh.headers.get('Content-Type', '')
134 m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
136 encoding = m.group(1)
139 webpage_bytes = urlh.read()
140 if self._downloader.params.get('dump_intermediate_pages', False):
142 url = url_or_request.get_full_url()
143 except AttributeError:
145 self.to_screen(u'Dumping request to ' + url)
146 dump = base64.b64encode(webpage_bytes).decode('ascii')
147 self._downloader.to_screen(dump)
148 content = webpage_bytes.decode(encoding, 'replace')
149 return (content, urlh)
151 def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
152 """ Returns the data of the page as a string """
153 return self._download_webpage_handle(url_or_request, video_id, note, errnote)[0]
155 def to_screen(self, msg):
156 """Print msg to screen, prefixing it with '[ie_name]'"""
157 self._downloader.to_screen(u'[%s] %s' % (self.IE_NAME, msg))
159 def report_extraction(self, id_or_name):
160 """Report information extraction."""
161 self.to_screen(u'%s: Extracting information' % id_or_name)
163 def report_download_webpage(self, video_id):
164 """Report webpage download."""
165 self.to_screen(u'%s: Downloading webpage' % video_id)
167 def report_age_confirmation(self):
168 """Report attempt to confirm age."""
169 self.to_screen(u'Confirming age')
171 #Methods for following #608
172 #They set the correct value of the '_type' key
173 def video_result(self, video_info):
174 """Returns a video"""
175 video_info['_type'] = 'video'
177 def url_result(self, url, ie=None):
178 """Returns a url that points to a page that should be processed"""
179 #TODO: ie should be the class used for getting the info
180 video_info = {'_type': 'url',
184 def playlist_result(self, entries, playlist_id=None, playlist_title=None):
185 """Returns a playlist"""
186 video_info = {'_type': 'playlist',
189 video_info['id'] = playlist_id
191 video_info['title'] = playlist_title
194 def _search_regex(self, pattern, string, name, default=None, fatal=True, flags=0):
196 Perform a regex search on the given string, using a single or a list of
197 patterns returning the first matching group.
198 In case of failure return a default value or raise a WARNING or a
199 ExtractorError, depending on fatal, specifying the field name.
201 if isinstance(pattern, (str, compat_str, compiled_regex_type)):
202 mobj = re.search(pattern, string, flags)
205 mobj = re.search(p, string, flags)
208 if sys.stderr.isatty() and os.name != 'nt':
209 _name = u'\033[0;34m%s\033[0m' % name
214 # return the first matching group
215 return next(g for g in mobj.groups() if g is not None)
216 elif default is not None:
219 raise ExtractorError(u'Unable to extract %s' % _name)
221 self._downloader.report_warning(u'unable to extract %s; '
222 u'please report this issue on GitHub.' % _name)
225 def _html_search_regex(self, pattern, string, name, default=None, fatal=True, flags=0):
227 Like _search_regex, but strips HTML tags and unescapes entities.
229 res = self._search_regex(pattern, string, name, default, fatal, flags)
231 return clean_html(res).strip()
235 class SearchInfoExtractor(InfoExtractor):
237 Base class for paged search queries extractors.
238 They accept urls in the format _SEARCH_KEY(|all|[0-9]):{query}
239 Instances should define _SEARCH_KEY and _MAX_RESULTS.
243 def _make_valid_url(cls):
244 return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY
247 def suitable(cls, url):
248 return re.match(cls._make_valid_url(), url) is not None
250 def _real_extract(self, query):
251 mobj = re.match(self._make_valid_url(), query)
253 raise ExtractorError(u'Invalid search query "%s"' % query)
255 prefix = mobj.group('prefix')
256 query = mobj.group('query')
258 return self._get_n_results(query, 1)
259 elif prefix == 'all':
260 return self._get_n_results(query, self._MAX_RESULTS)
264 raise ExtractorError(u'invalid download number %s for query "%s"' % (n, query))
265 elif n > self._MAX_RESULTS:
266 self._downloader.report_warning(u'%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
267 n = self._MAX_RESULTS
268 return self._get_n_results(query, n)
270 def _get_n_results(self, query, n):
271 """Get a specified number of results for a query"""
272 raise NotImplementedError("This method must be implemented by sublclasses")
275 class YoutubeIE(InfoExtractor):
276 """Information extractor for youtube.com."""
280 (?:https?://)? # http(s):// (optional)
281 (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
282 tube\.majestyc\.net/) # the various hostnames, with wildcard subdomains
283 (?:.*?\#/)? # handle anchor (#/) redirect urls
284 (?: # the various things that can precede the ID:
285 (?:(?:v|embed|e)/) # v/ or embed/ or e/
286 |(?: # or the v= param in all its forms
287 (?:watch(?:_popup)?(?:\.php)?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
288 (?:\?|\#!?) # the params delimiter ? or # or #!
289 (?:.*?&)? # any other preceding param (like /?s=tuff&v=xxxx)
292 )? # optional -> youtube.com/xxxx is OK
293 )? # all until now is optional -> you can pass the naked ID
294 ([0-9A-Za-z_-]+) # here is it! the YouTube video ID
295 (?(1).+)? # if we found the ID, everything can follow
297 _LANG_URL = r'https://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
298 _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
299 _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
300 _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
301 _NETRC_MACHINE = 'youtube'
302 # Listed in order of quality
303 _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
304 _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
305 _video_extensions = {
311 '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
317 _video_dimensions = {
336 def suitable(cls, url):
337 """Receives a URL and returns True if suitable for this IE."""
338 if YoutubePlaylistIE.suitable(url): return False
339 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
341 def report_lang(self):
342 """Report attempt to set language."""
343 self.to_screen(u'Setting language')
345 def report_login(self):
346 """Report attempt to log in."""
347 self.to_screen(u'Logging in')
349 def report_video_webpage_download(self, video_id):
350 """Report attempt to download video webpage."""
351 self.to_screen(u'%s: Downloading video webpage' % video_id)
353 def report_video_info_webpage_download(self, video_id):
354 """Report attempt to download video info webpage."""
355 self.to_screen(u'%s: Downloading video info webpage' % video_id)
    def report_video_subtitles_download(self, video_id):
        """Report that we are checking which subtitles are available."""
        # Docstring fixed: the original was a copy-paste of the video-info
        # webpage message; this method announces the subtitle-availability check.
        self.to_screen(u'%s: Checking available subtitles' % video_id)
    def report_video_subtitles_request(self, video_id, sub_lang, format):
        """Report the download of the subtitle track sub_lang in the given format."""
        # Docstring fixed: the original was a copy-paste of the video-info
        # webpage message; this method announces a subtitle-track request.
        self.to_screen(u'%s: Downloading video subtitles for %s.%s' % (video_id, sub_lang, format))
365 def report_video_subtitles_available(self, video_id, sub_lang_list):
366 """Report available subtitles."""
367 sub_lang = ",".join(list(sub_lang_list.keys()))
368 self.to_screen(u'%s: Available subtitles for video: %s' % (video_id, sub_lang))
370 def report_information_extraction(self, video_id):
371 """Report attempt to extract video information."""
372 self.to_screen(u'%s: Extracting video information' % video_id)
    def report_unavailable_format(self, video_id, format):
        """Report that the requested format is not available for this video."""
        # Docstring fixed: the original said "Report extracted video URL",
        # which described a different method.
        self.to_screen(u'%s: Format %s not available' % (video_id, format))
378 def report_rtmp_download(self):
379 """Indicate the download will use the RTMP protocol."""
380 self.to_screen(u'RTMP download detected')
382 def _get_available_subtitles(self, video_id):
383 self.report_video_subtitles_download(video_id)
384 request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
386 sub_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
387 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
388 return (u'unable to download video subtitles: %s' % compat_str(err), None)
389 sub_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
390 sub_lang_list = dict((l[1], l[0]) for l in sub_lang_list)
391 if not sub_lang_list:
392 return (u'video doesn\'t have subtitles', None)
    def _list_available_subtitles(self, video_id):
        # Fetch and report the subtitle languages available for video_id.
        # NOTE(review): _get_available_subtitles returns an
        # (error_message, None) tuple on failure (see _extract_subtitle's
        # isinstance(..., tuple) check); if that can happen here, the report
        # call below would receive a tuple instead of a dict — confirm.
        sub_lang_list = self._get_available_subtitles(video_id)
        self.report_video_subtitles_available(video_id, sub_lang_list)
399 def _request_subtitle(self, sub_lang, sub_name, video_id, format):
402 (error_message, sub_lang, sub)
404 self.report_video_subtitles_request(video_id, sub_lang, format)
405 params = compat_urllib_parse.urlencode({
411 url = 'http://www.youtube.com/api/timedtext?' + params
413 sub = compat_urllib_request.urlopen(url).read().decode('utf-8')
414 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
415 return (u'unable to download video subtitles: %s' % compat_str(err), None, None)
417 return (u'Did not fetch video subtitles', None, None)
418 return (None, sub_lang, sub)
420 def _request_automatic_caption(self, video_id, webpage):
421 """We need the webpage for getting the captions url, pass it as an
422 argument to speed up the process."""
423 sub_lang = self._downloader.params.get('subtitleslang')
424 sub_format = self._downloader.params.get('subtitlesformat')
425 self.to_screen(u'%s: Looking for automatic captions' % video_id)
426 mobj = re.search(r';ytplayer.config = ({.*?});', webpage)
427 err_msg = u'Couldn\'t find automatic captions for "%s"' % sub_lang
429 return [(err_msg, None, None)]
430 player_config = json.loads(mobj.group(1))
432 args = player_config[u'args']
433 caption_url = args[u'ttsurl']
434 timestamp = args[u'timestamp']
435 params = compat_urllib_parse.urlencode({
442 subtitles_url = caption_url + '&' + params
443 sub = self._download_webpage(subtitles_url, video_id, u'Downloading automatic captions')
444 return [(None, sub_lang, sub)]
446 return [(err_msg, None, None)]
448 def _extract_subtitle(self, video_id):
450 Return a list with a tuple:
451 [(error_message, sub_lang, sub)]
453 sub_lang_list = self._get_available_subtitles(video_id)
454 sub_format = self._downloader.params.get('subtitlesformat')
455 if isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles
456 return [(sub_lang_list[0], None, None)]
457 if self._downloader.params.get('subtitleslang', False):
458 sub_lang = self._downloader.params.get('subtitleslang')
459 elif 'en' in sub_lang_list:
462 sub_lang = list(sub_lang_list.keys())[0]
463 if not sub_lang in sub_lang_list:
464 return [(u'no closed captions found in the specified language "%s"' % sub_lang, None, None)]
466 subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
469 def _extract_all_subtitles(self, video_id):
470 sub_lang_list = self._get_available_subtitles(video_id)
471 sub_format = self._downloader.params.get('subtitlesformat')
472 if isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles
473 return [(sub_lang_list[0], None, None)]
475 for sub_lang in sub_lang_list:
476 subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
477 subtitles.append(subtitle)
480 def _print_formats(self, formats):
481 print('Available formats:')
483 print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))
485 def _real_initialize(self):
486 if self._downloader is None:
491 downloader_params = self._downloader.params
493 # Attempt to use provided username and password or .netrc data
494 if downloader_params.get('username', None) is not None:
495 username = downloader_params['username']
496 password = downloader_params['password']
497 elif downloader_params.get('usenetrc', False):
499 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
504 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
505 except (IOError, netrc.NetrcParseError) as err:
506 self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
510 request = compat_urllib_request.Request(self._LANG_URL)
513 compat_urllib_request.urlopen(request).read()
514 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
515 self._downloader.report_warning(u'unable to set language: %s' % compat_str(err))
518 # No authentication to be performed
522 request = compat_urllib_request.Request(self._LOGIN_URL)
524 login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
525 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
526 self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err))
531 match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page)
533 galx = match.group(1)
535 match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page)
541 u'continue': u'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
545 u'PersistentCookie': u'yes',
547 u'bgresponse': u'js_disabled',
548 u'checkConnection': u'',
549 u'checkedDomains': u'youtube',
555 u'signIn': u'Sign in',
557 u'service': u'youtube',
561 # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
563 login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
564 login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
565 request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
568 login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
569 if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
570 self._downloader.report_warning(u'unable to log in: bad username or password')
572 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
573 self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
579 'action_confirm': 'Confirm',
581 request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
583 self.report_age_confirmation()
584 age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
585 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
586 raise ExtractorError(u'Unable to confirm age: %s' % compat_str(err))
588 def _extract_id(self, url):
589 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
591 raise ExtractorError(u'Invalid URL: %s' % url)
592 video_id = mobj.group(2)
595 def _real_extract(self, url):
596 # Extract original video URL from URL with redirection, like age verification, using next_url parameter
597 mobj = re.search(self._NEXT_URL_RE, url)
599 url = 'https://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
600 video_id = self._extract_id(url)
603 self.report_video_webpage_download(video_id)
604 url = 'https://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
605 request = compat_urllib_request.Request(url)
607 video_webpage_bytes = compat_urllib_request.urlopen(request).read()
608 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
609 raise ExtractorError(u'Unable to download video webpage: %s' % compat_str(err))
611 video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')
613 # Attempt to extract SWF player URL
614 mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
616 player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
621 self.report_video_info_webpage_download(video_id)
622 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
623 video_info_url = ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
624 % (video_id, el_type))
625 video_info_webpage = self._download_webpage(video_info_url, video_id,
627 errnote='unable to download video info webpage')
628 video_info = compat_parse_qs(video_info_webpage)
629 if 'token' in video_info:
631 if 'token' not in video_info:
632 if 'reason' in video_info:
633 raise ExtractorError(u'YouTube said: %s' % video_info['reason'][0])
635 raise ExtractorError(u'"token" parameter not in video info for unknown reason')
637 # Check for "rental" videos
638 if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
639 raise ExtractorError(u'"rental" videos not supported')
641 # Start extracting information
642 self.report_information_extraction(video_id)
645 if 'author' not in video_info:
646 raise ExtractorError(u'Unable to extract uploader name')
647 video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])
650 video_uploader_id = None
651 mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
653 video_uploader_id = mobj.group(1)
655 self._downloader.report_warning(u'unable to extract uploader nickname')
658 if 'title' not in video_info:
659 raise ExtractorError(u'Unable to extract video title')
660 video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])
663 if 'thumbnail_url' not in video_info:
664 self._downloader.report_warning(u'unable to extract video thumbnail')
666 else: # don't panic if we can't find it
667 video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])
671 mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
673 upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
674 upload_date = unified_strdate(upload_date)
677 video_description = get_element_by_id("eow-description", video_webpage)
678 if video_description:
679 video_description = clean_html(video_description)
681 fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
683 video_description = unescapeHTML(fd_mobj.group(1))
685 video_description = u''
688 video_subtitles = None
690 if self._downloader.params.get('writesubtitles', False):
691 video_subtitles = self._extract_subtitle(video_id)
693 (sub_error, sub_lang, sub) = video_subtitles[0]
695 # We try with the automatic captions
696 video_subtitles = self._request_automatic_caption(video_id, video_webpage)
697 (sub_error_auto, sub_lang, sub) = video_subtitles[0]
701 # We report the original error
702 self._downloader.report_error(sub_error)
704 if self._downloader.params.get('allsubtitles', False):
705 video_subtitles = self._extract_all_subtitles(video_id)
706 for video_subtitle in video_subtitles:
707 (sub_error, sub_lang, sub) = video_subtitle
709 self._downloader.report_error(sub_error)
711 if self._downloader.params.get('listsubtitles', False):
712 sub_lang_list = self._list_available_subtitles(video_id)
715 if 'length_seconds' not in video_info:
716 self._downloader.report_warning(u'unable to extract video duration')
719 video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])
722 video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])
724 # Decide which formats to download
725 req_format = self._downloader.params.get('format', None)
727 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
728 self.report_rtmp_download()
729 video_url_list = [(None, video_info['conn'][0])]
730 elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
732 for url_data_str in video_info['url_encoded_fmt_stream_map'][0].split(','):
733 url_data = compat_parse_qs(url_data_str)
734 if 'itag' in url_data and 'url' in url_data:
735 url = url_data['url'][0] + '&signature=' + url_data['sig'][0]
736 if not 'ratebypass' in url: url += '&ratebypass=yes'
737 url_map[url_data['itag'][0]] = url
739 format_limit = self._downloader.params.get('format_limit', None)
740 available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
741 if format_limit is not None and format_limit in available_formats:
742 format_list = available_formats[available_formats.index(format_limit):]
744 format_list = available_formats
745 existing_formats = [x for x in format_list if x in url_map]
746 if len(existing_formats) == 0:
747 raise ExtractorError(u'no known formats available for video')
748 if self._downloader.params.get('listformats', None):
749 self._print_formats(existing_formats)
751 if req_format is None or req_format == 'best':
752 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
753 elif req_format == 'worst':
754 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
755 elif req_format in ('-1', 'all'):
756 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
758 # Specific formats. We pick the first in a slash-delimeted sequence.
759 # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
760 req_formats = req_format.split('/')
761 video_url_list = None
762 for rf in req_formats:
764 video_url_list = [(rf, url_map[rf])]
766 if video_url_list is None:
767 raise ExtractorError(u'requested format not available')
769 raise ExtractorError(u'no conn or url_encoded_fmt_stream_map information found in video info')
772 for format_param, video_real_url in video_url_list:
774 video_extension = self._video_extensions.get(format_param, 'flv')
776 video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
777 self._video_dimensions.get(format_param, '???'))
781 'url': video_real_url,
782 'uploader': video_uploader,
783 'uploader_id': video_uploader_id,
784 'upload_date': upload_date,
785 'title': video_title,
786 'ext': video_extension,
787 'format': video_format,
788 'thumbnail': video_thumbnail,
789 'description': video_description,
790 'player_url': player_url,
791 'subtitles': video_subtitles,
792 'duration': video_duration
797 class MetacafeIE(InfoExtractor):
798 """Information Extractor for metacafe.com."""
800 _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
801 _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
802 _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
803 IE_NAME = u'metacafe'
805 def report_disclaimer(self):
806 """Report disclaimer retrieval."""
807 self.to_screen(u'Retrieving disclaimer')
809 def _real_initialize(self):
810 # Retrieve disclaimer
811 request = compat_urllib_request.Request(self._DISCLAIMER)
813 self.report_disclaimer()
814 disclaimer = compat_urllib_request.urlopen(request).read()
815 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
816 raise ExtractorError(u'Unable to retrieve disclaimer: %s' % compat_str(err))
821 'submit': "Continue - I'm over 18",
823 request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
825 self.report_age_confirmation()
826 disclaimer = compat_urllib_request.urlopen(request).read()
827 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
828 raise ExtractorError(u'Unable to confirm age: %s' % compat_str(err))
830 def _real_extract(self, url):
831 # Extract id and simplified title from URL
832 mobj = re.match(self._VALID_URL, url)
834 raise ExtractorError(u'Invalid URL: %s' % url)
836 video_id = mobj.group(1)
838 # Check if video comes from YouTube
839 mobj2 = re.match(r'^yt-(.*)$', video_id)
840 if mobj2 is not None:
841 return [self.url_result('http://www.youtube.com/watch?v=%s' % mobj2.group(1), 'Youtube')]
843 # Retrieve video webpage to extract further information
844 webpage = self._download_webpage('http://www.metacafe.com/watch/%s/' % video_id, video_id)
846 # Extract URL, uploader and title from webpage
847 self.report_extraction(video_id)
848 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
850 mediaURL = compat_urllib_parse.unquote(mobj.group(1))
851 video_extension = mediaURL[-3:]
853 # Extract gdaKey if available
854 mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
858 gdaKey = mobj.group(1)
859 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
861 mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
863 raise ExtractorError(u'Unable to extract media URL')
864 vardict = compat_parse_qs(mobj.group(1))
865 if 'mediaData' not in vardict:
866 raise ExtractorError(u'Unable to extract media URL')
867 mobj = re.search(r'"mediaURL":"(?P<mediaURL>http.*?)",(.*?)"key":"(?P<key>.*?)"', vardict['mediaData'][0])
869 raise ExtractorError(u'Unable to extract media URL')
870 mediaURL = mobj.group('mediaURL').replace('\\/', '/')
871 video_extension = mediaURL[-3:]
872 video_url = '%s?__gda__=%s' % (mediaURL, mobj.group('key'))
874 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
876 raise ExtractorError(u'Unable to extract title')
877 video_title = mobj.group(1).decode('utf-8')
879 mobj = re.search(r'submitter=(.*?);', webpage)
881 raise ExtractorError(u'Unable to extract uploader nickname')
882 video_uploader = mobj.group(1)
885 'id': video_id.decode('utf-8'),
886 'url': video_url.decode('utf-8'),
887 'uploader': video_uploader.decode('utf-8'),
889 'title': video_title,
890 'ext': video_extension.decode('utf-8'),
893 class DailymotionIE(InfoExtractor):
894 """Information Extractor for Dailymotion"""
896 _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
897 IE_NAME = u'dailymotion'
899 def _real_extract(self, url):
900 # Extract id and simplified title from URL
901 mobj = re.match(self._VALID_URL, url)
903 raise ExtractorError(u'Invalid URL: %s' % url)
905 video_id = mobj.group(1).split('_')[0].split('?')[0]
907 video_extension = 'mp4'
909 # Retrieve video webpage to extract further information
910 request = compat_urllib_request.Request(url)
911 request.add_header('Cookie', 'family_filter=off')
912 webpage = self._download_webpage(request, video_id)
914 # Extract URL, uploader and title from webpage
915 self.report_extraction(video_id)
916 mobj = re.search(r'\s*var flashvars = (.*)', webpage)
918 raise ExtractorError(u'Unable to extract media URL')
919 flashvars = compat_urllib_parse.unquote(mobj.group(1))
921 for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
924 self.to_screen(u'Using %s' % key)
927 raise ExtractorError(u'Unable to extract video URL')
929 mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
931 raise ExtractorError(u'Unable to extract video URL')
933 video_url = compat_urllib_parse.unquote(mobj.group(1)).replace('\\/', '/')
935 # TODO: support choosing qualities
937 mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
939 raise ExtractorError(u'Unable to extract title')
940 video_title = unescapeHTML(mobj.group('title'))
942 video_uploader = None
943 mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
945 # lookin for official user
946 mobj_official = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
947 if mobj_official is None:
948 self._downloader.report_warning(u'unable to extract uploader nickname')
950 video_uploader = mobj_official.group(1)
952 video_uploader = mobj.group(1)
954 video_upload_date = None
955 mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
957 video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)
962 'uploader': video_uploader,
963 'upload_date': video_upload_date,
964 'title': video_title,
965 'ext': video_extension,
# NOTE(review): sampled excerpt — embedded original line numbers are
# non-contiguous (e.g. 980 -> 982), so guard lines such as "if mobj is None:",
# "try:" and "return [{" are not visible here. Comments describe visible code only.
# Extractor for Photobucket media pages: tries embedded JSON first, then
# falls back to scraping <link rel="video_src"> and the page <title>.
969 class PhotobucketIE(InfoExtractor):
970 """Information extractor for photobucket.com."""
972 # TODO: the original _VALID_URL was:
973 # r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
974 # Check if it's necessary to keep the old extracion process
975 _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*(([\?\&]current=)|_)(?P<id>.*)\.(?P<ext>(flv)|(mp4))'
976 IE_NAME = u'photobucket'
978 def _real_extract(self, url):
979 # Extract id from URL
980 mobj = re.match(self._VALID_URL, url)
982 raise ExtractorError(u'Invalid URL: %s' % url)
984 video_id = mobj.group('id')
986 video_extension = mobj.group('ext')
988 # Retrieve video webpage to extract further information
989 webpage = self._download_webpage(url, video_id)
991 # Extract URL, uploader, and title from webpage
992 self.report_extraction(video_id)
993 # We try first by looking the javascript code:
# Primary path: media metadata embedded as JSON in a Pb.Data.Shared.put(...) call.
994 mobj = re.search(r'Pb\.Data\.Shared\.put\(Pb\.Data\.Shared\.MEDIA, (?P<json>.*?)\);', webpage)
996 info = json.loads(mobj.group('json'))
999 'url': info[u'downloadUrl'],
1000 'uploader': info[u'username'],
# creationDate is treated as a Unix timestamp and rendered as YYYYMMDD
# per the upload_date contract documented on InfoExtractor.
1001 'upload_date': datetime.date.fromtimestamp(info[u'creationDate']).strftime('%Y%m%d'),
1002 'title': info[u'title'],
1003 'ext': video_extension,
1004 'thumbnail': info[u'thumbUrl'],
# Fallback path: scrape the video_src link tag and "<title> video by <user>".
1007 # We try looking in other parts of the webpage
1008 video_url = self._search_regex(r'<link rel="video_src" href=".*\?file=([^"]+)" />',
1009 webpage, u'video URL')
1011 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
1013 raise ExtractorError(u'Unable to extract title')
# NOTE(review): .decode('utf-8') implies Python 2 byte strings here — confirm
# against the module's compat layer before porting.
1014 video_title = mobj.group(1).decode('utf-8')
1015 video_uploader = mobj.group(2).decode('utf-8')
1018 'id': video_id.decode('utf-8'),
1019 'url': video_url.decode('utf-8'),
1020 'uploader': video_uploader,
1021 'upload_date': None,
1022 'title': video_title,
1023 'ext': video_extension.decode('utf-8'),
# NOTE(review): sampled excerpt — embedded line numbers skip values, so
# "if ... is None:" guards and the "info = {" / "return [info]" framing are
# not visible. Comments describe visible code only.
# Extractor for screen.yahoo.com: two strategies depending on whether a
# YUI CONTENT_ID is present in the page.
1027 class YahooIE(InfoExtractor):
1028 """Information extractor for screen.yahoo.com."""
1029 _VALID_URL = r'http://screen\.yahoo\.com/.*?-(?P<id>\d*?)\.html'
1031 def _real_extract(self, url):
1032 mobj = re.match(self._VALID_URL, url)
1034 raise ExtractorError(u'Invalid URL: %s' % url)
1035 video_id = mobj.group('id')
1036 webpage = self._download_webpage(url, video_id)
# Branch selector: pages may define a different content id via YUI.
1037 m_id = re.search(r'YUI\.namespace\("Media"\)\.CONTENT_ID = "(?P<new_id>.+?)";', webpage)
# Strategy 1: query the cosmos.bcst.yahoo.com MRSS REST endpoint with the
# numeric id from the URL and parse the XML-ish response with one big regex.
1040 # TODO: Check which url parameters are required
1041 info_url = 'http://cosmos.bcst.yahoo.com/rest/v2/pops;lmsoverride=1;outputformat=mrss;cb=974419660;id=%s;rd=news.yahoo.com;datacontext=mdb;lg=KCa2IihxG3qE60vQ7HtyUy' % video_id
1042 webpage = self._download_webpage(info_url, video_id, u'Downloading info webpage')
1043 info_re = r'''<title><!\[CDATA\[(?P<title>.*?)\]\]></title>.*
1044 <description><!\[CDATA\[(?P<description>.*?)\]\]></description>.*
1045 <media:pubStart><!\[CDATA\[(?P<date>.*?)\ .*\]\]></media:pubStart>.*
1046 <media:content\ medium="image"\ url="(?P<thumb>.*?)"\ name="LARGETHUMB"
1048 self.report_extraction(video_id)
1049 m_info = re.search(info_re, webpage, re.VERBOSE|re.DOTALL)
1051 raise ExtractorError(u'Unable to extract video info')
1052 video_title = m_info.group('title')
1053 video_description = m_info.group('description')
1054 video_thumb = m_info.group('thumb')
1055 video_date = m_info.group('date')
# Normalize MM/DD/YYYY to the YYYYMMDD upload_date format.
1056 video_date = datetime.datetime.strptime(video_date, '%m/%d/%Y').strftime('%Y%m%d')
1058 # TODO: Find a way to get mp4 videos
# Second request fetches the actual stream host/path (rtmp-style url + play_path).
1059 rest_url = 'http://cosmos.bcst.yahoo.com/rest/v2/pops;element=stream;outputformat=mrss;id=%s;lmsoverride=1;bw=375;dynamicstream=1;cb=83521105;tech=flv,mp4;rd=news.yahoo.com;datacontext=mdb;lg=KCa2IihxG3qE60vQ7HtyUy' % video_id
1060 webpage = self._download_webpage(rest_url, video_id, u'Downloading video url webpage')
1061 m_rest = re.search(r'<media:content url="(?P<url>.*?)" path="(?P<path>.*?)"', webpage)
1062 video_url = m_rest.group('url')
1063 video_path = m_rest.group('path')
1065 raise ExtractorError(u'Unable to extract video url')
# Strategy 2: a YQL JSONP query keyed on the YUI long id.
1067 else: # We have to use a different method if another id is defined
1068 long_id = m_id.group('new_id')
1069 info_url = 'http://video.query.yahoo.com/v1/public/yql?q=SELECT%20*%20FROM%20yahoo.media.video.streams%20WHERE%20id%3D%22' + long_id + '%22%20AND%20format%3D%22mp4%2Cflv%22%20AND%20protocol%3D%22rtmp%2Chttp%22%20AND%20plrs%3D%2286Gj0vCaSzV_Iuf6hNylf2%22%20AND%20acctid%3D%22389%22%20AND%20plidl%3D%22%22%20AND%20pspid%3D%22792700001%22%20AND%20offnetwork%3D%22false%22%20AND%20site%3D%22ivy%22%20AND%20lang%3D%22en-US%22%20AND%20region%3D%22US%22%20AND%20override%3D%22none%22%3B&env=prod&format=json&callback=YUI.Env.JSONP.yui_3_8_1_1_1368368376830_335'
1070 webpage = self._download_webpage(info_url, video_id, u'Downloading info json')
# Strip the JSONP callback wrapper before parsing the JSON payload.
1071 json_str = re.search(r'YUI.Env.JSONP.yui.*?\((.*?)\);', webpage).group(1)
1072 info = json.loads(json_str)
1073 res = info[u'query'][u'results'][u'mediaObj'][0]
1074 stream = res[u'streams'][0]
1075 video_path = stream[u'path']
1076 video_url = stream[u'host']
# NOTE(review): `meta` is assigned on an elided line (presumably res[u'meta']) — confirm.
1078 video_title = meta[u'title']
1079 video_description = meta[u'description']
1080 video_thumb = meta[u'thumbnail']
1081 video_date = None # I can't find it
1086 'play_path': video_path,
1087 'title':video_title,
1088 'description': video_description,
1089 'thumbnail': video_thumb,
1090 'upload_date': video_date,
# NOTE(review): sampled excerpt — line numbers are non-contiguous, so the
# try/except around the config split, some else: branches and the "return [{"
# framing are missing from view. Comments describe visible code only.
# Extractor for vimeo.com: parses the player's embedded config JSON and
# builds a play_redirect URL from signature/timestamp/quality/codec.
1095 class VimeoIE(InfoExtractor):
1096 """Information extractor for vimeo.com."""
1098 # _VALID_URL matches Vimeo URLs
1099 _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|player)\.)?vimeo(?P<pro>pro)?\.com/(?:(?:(?:groups|album)/[^/]+)|(?:.*?)/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)'
1102 def _real_extract(self, url, new_video=True):
1103 # Extract ID from URL
1104 mobj = re.match(self._VALID_URL, url)
1106 raise ExtractorError(u'Invalid URL: %s' % url)
1108 video_id = mobj.group('id')
# Canonicalize scheme-less, pro and direct-link URLs to https://vimeo.com/<id>.
1109 if not mobj.group('proto'):
1110 url = 'https://' + url
1111 if mobj.group('direct_link') or mobj.group('pro'):
1112 url = 'https://vimeo.com/' + video_id
1114 # Retrieve video webpage to extract further information
1115 request = compat_urllib_request.Request(url, None, std_headers)
1116 webpage = self._download_webpage(request, video_id)
1118 # Now we begin extracting as much information as we can from what we
1119 # retrieved. First we extract the information common to all extractors,
1120 # and latter we extract those that are Vimeo specific.
1121 self.report_extraction(video_id)
1123 # Extract the config JSON
# Brittle string-split extraction of the JS config object; the surrounding
# (elided) try/except maps failures to the embed-restriction / info-section errors.
1125 config = webpage.split(' = {config:')[1].split(',assets:')[0]
1126 config = json.loads(config)
1128 if re.search('The creator of this video has not given you permission to embed it on this domain.', webpage):
1129 raise ExtractorError(u'The author has restricted the access to this video, try with the "--referer" option')
1131 raise ExtractorError(u'Unable to extract info section')
1134 video_title = config["video"]["title"]
1136 # Extract uploader and uploader_id
1137 video_uploader = config["video"]["owner"]["name"]
1138 video_uploader_id = config["video"]["owner"]["url"].split('/')[-1] if config["video"]["owner"]["url"] else None
1140 # Extract video thumbnail
1141 video_thumbnail = config["video"]["thumbnail"]
1143 # Extract video description
1144 video_description = get_element_by_attribute("itemprop", "description", webpage)
1145 if video_description: video_description = clean_html(video_description)
1146 else: video_description = u''
1148 # Extract upload date
1149 video_upload_date = None
1150 mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
1151 if mobj is not None:
1152 video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)
1154 # Vimeo specific: extract request signature and timestamp
1155 sig = config['request']['signature']
1156 timestamp = config['request']['timestamp']
1158 # Vimeo specific: extract video codec and quality information
1159 # First consider quality, then codecs, then take everything
1160 # TODO bind to format param
# Codec preference order is fixed; each codec maps to its container extension.
1161 codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
1162 files = { 'hd': [], 'sd': [], 'other': []}
1163 for codec_name, codec_extension in codecs:
1164 if codec_name in config["video"]["files"]:
1165 if 'hd' in config["video"]["files"][codec_name]:
1166 files['hd'].append((codec_name, codec_extension, 'hd'))
1167 elif 'sd' in config["video"]["files"][codec_name]:
1168 files['sd'].append((codec_name, codec_extension, 'sd'))
1170 files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))
# Pick the best available bucket: hd, then sd, then whatever else exists.
1172 for quality in ('hd', 'sd', 'other'):
1173 if len(files[quality]) > 0:
1174 video_quality = files[quality][0][2]
1175 video_codec = files[quality][0][0]
1176 video_extension = files[quality][0][1]
1177 self.to_screen(u'%s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
1180 raise ExtractorError(u'No known codec found')
1182 video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
1183 %(video_id, sig, timestamp, video_quality, video_codec.upper())
1188 'uploader': video_uploader,
1189 'uploader_id': video_uploader_id,
1190 'upload_date': video_upload_date,
1191 'title': video_title,
1192 'ext': video_extension,
1193 'thumbnail': video_thumbnail,
1194 'description': video_description,
# NOTE(review): sampled excerpt — non-contiguous line numbers mean the try:,
# "if mobj is None:", "info = {}", return statements and grep_webpage's
# regex/tuple list framing are elided. Comments describe visible code only.
# Extractor for videos.arte.tv with two flows: live streams (chained regex
# scrapes via grep_webpage) and the "Plus 7" catch-up XML flow.
1198 class ArteTvIE(InfoExtractor):
1199 """arte.tv information extractor."""
1201 _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
1202 _LIVE_URL = r'index-[0-9]+\.html$'
1204 IE_NAME = u'arte.tv'
# Fetch raw page bytes, wrapping network and URL errors in ExtractorError.
1206 def fetch_webpage(self, url):
1207 request = compat_urllib_request.Request(url)
1209 self.report_download_webpage(url)
1210 webpage = compat_urllib_request.urlopen(request).read()
1211 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1212 raise ExtractorError(u'Unable to retrieve video webpage: %s' % compat_str(err))
1213 except ValueError as err:
1214 raise ExtractorError(u'Invalid URL: %s' % url)
# Download url, apply regex, and collect named groups into an info dict.
# matchTuples is a list of (group_index, key, error_message).
1217 def grep_webpage(self, url, regex, regexFlags, matchTuples):
1218 page = self.fetch_webpage(url)
1219 mobj = re.search(regex, page, regexFlags)
1223 raise ExtractorError(u'Invalid URL: %s' % url)
1225 for (i, key, err) in matchTuples:
1226 if mobj.group(i) is None:
1227 raise ExtractorError(err)
1229 info[key] = mobj.group(i)
# Live flow: locate the videothek JS, then the geo-gated swf path/player/url.
1233 def extractLiveStream(self, url):
1234 video_lang = url.split('/')[-4]
1235 info = self.grep_webpage(
1237 r'src="(.*?/videothek_js.*?\.js)',
1240 (1, 'url', u'Invalid URL: %s' % url)
1243 http_host = url.split('/')[2]
1244 next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
1245 info = self.grep_webpage(
1247 r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
1248 '(http://.*?\.swf).*?' +
1252 (1, 'path', u'could not extract video path: %s' % url),
1253 (2, 'player', u'could not extract video player: %s' % url),
1254 (3, 'url', u'could not extract video url: %s' % url)
1257 video_url = u'%s/%s' % (info.get('url'), info.get('path'))
# Plus 7 flow: follow videorefFileUrl -> language-specific <video ref> ->
# final XML with id/name/date and the hd-quality stream URL.
1259 def extractPlus7Stream(self, url):
1260 video_lang = url.split('/')[-3]
1261 info = self.grep_webpage(
1263 r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
1266 (1, 'url', u'Invalid URL: %s' % url)
1269 next_url = compat_urllib_parse.unquote(info.get('url'))
1270 info = self.grep_webpage(
1272 r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
1275 (1, 'url', u'Could not find <video> tag: %s' % url)
1278 next_url = compat_urllib_parse.unquote(info.get('url'))
1280 info = self.grep_webpage(
1282 r'<video id="(.*?)".*?>.*?' +
1283 '<name>(.*?)</name>.*?' +
1284 '<dateVideo>(.*?)</dateVideo>.*?' +
1285 '<url quality="hd">(.*?)</url>',
1288 (1, 'id', u'could not extract video id: %s' % url),
1289 (2, 'title', u'could not extract video title: %s' % url),
1290 (3, 'date', u'could not extract video date: %s' % url),
1291 (4, 'url', u'could not extract video url: %s' % url)
1296 'id': info.get('id'),
1297 'url': compat_urllib_parse.unquote(info.get('url')),
1298 'uploader': u'arte.tv',
1299 'upload_date': unified_strdate(info.get('date')),
1300 'title': info.get('title').decode('utf-8'),
# Dispatch on whether the URL tail matches the live-stream pattern.
1306 def _real_extract(self, url):
1307 video_id = url.split('/')[-1]
1308 self.report_extraction(video_id)
1310 if re.search(self._LIVE_URL, video_id) is not None:
1311 self.extractLiveStream(url)
1314 info = self.extractPlus7Stream(url)
# NOTE(review): sampled excerpt — elided lines include _WORKING, the HEAD
# method return, docstring quotes, several if-None guards, try: lines and
# the "return [{" framing. Comments describe visible code only.
# Last-resort extractor: follows shortener redirects via HEAD, then probes
# the page for JW-Player-style file= / source= / file: URLs.
1319 class GenericIE(InfoExtractor):
1320 """Generic last-resort information extractor."""
1323 IE_NAME = u'generic'
1325 def report_download_webpage(self, video_id):
1326 """Report webpage download."""
# Warn that we are guessing, except in test mode.
1327 if not self._downloader.params.get('test', False):
1328 self._downloader.report_warning(u'Falling back on generic information extractor.')
1329 super(GenericIE, self).report_download_webpage(video_id)
1331 def report_following_redirect(self, new_url):
1332 """Report information extraction."""
1333 self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)
1335 def _test_redirect(self, url):
1336 """Check if it is a redirect, like url shorteners, in case return the new url."""
# Request subclass that issues HEAD instead of GET (method body elided).
1337 class HeadRequest(compat_urllib_request.Request):
1338 def get_method(self):
1341 class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
1343 Subclass the HTTPRedirectHandler to make it use our
1344 HeadRequest also on the redirected URL
1346 def redirect_request(self, req, fp, code, msg, headers, newurl):
1347 if code in (301, 302, 303, 307):
1348 newurl = newurl.replace(' ', '%20')
# Drop body-describing headers; a HEAD re-request has no body.
1349 newheaders = dict((k,v) for k,v in req.headers.items()
1350 if k.lower() not in ("content-length", "content-type"))
1351 return HeadRequest(newurl,
1353 origin_req_host=req.get_origin_req_host(),
1356 raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)
1358 class HTTPMethodFallback(compat_urllib_request.BaseHandler):
1360 Fallback to GET if HEAD is not allowed (405 HTTP error)
1362 def http_error_405(self, req, fp, code, msg, headers):
1366 newheaders = dict((k,v) for k,v in req.headers.items()
1367 if k.lower() not in ("content-length", "content-type"))
1368 return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
1370 origin_req_host=req.get_origin_req_host(),
# Build a bare opener with just the handlers needed for the HEAD probe.
1374 opener = compat_urllib_request.OpenerDirector()
1375 for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
1376 HTTPMethodFallback, HEADRedirectHandler,
1377 compat_urllib_request.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
1378 opener.add_handler(handler())
1380 response = opener.open(HeadRequest(url))
1381 if response is None:
1382 raise ExtractorError(u'Invalid URL protocol')
1383 new_url = response.geturl()
1388 self.report_following_redirect(new_url)
1391 def _real_extract(self, url):
1392 new_url = self._test_redirect(url)
1393 if new_url: return [self.url_result(new_url)]
1395 video_id = url.split('/')[-1]
1397 webpage = self._download_webpage(url, video_id)
1398 except ValueError as err:
1399 # since this is the last-resort InfoExtractor, if
1400 # this error is thrown, it'll be thrown here
1401 raise ExtractorError(u'Invalid URL: %s' % url)
1403 self.report_extraction(video_id)
# Three progressively looser probes for an embedded media URL.
1404 # Start with something easy: JW Player in SWFObject
1405 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1407 # Broaden the search a little bit
1408 mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1410 # Broaden the search a little bit: JWPlayer JS loader
1411 mobj = re.search(r'[^A-Za-z0-9]?file:\s*["\'](http[^\'"&]*)', webpage)
1413 raise ExtractorError(u'Invalid URL: %s' % url)
1415 # It's possible that one of the regexes
1416 # matched, but returned an empty group:
1417 if mobj.group(1) is None:
1418 raise ExtractorError(u'Invalid URL: %s' % url)
1420 video_url = compat_urllib_parse.unquote(mobj.group(1))
1421 video_id = os.path.basename(video_url)
1423 # here's a fun little line of code for you:
1424 video_extension = os.path.splitext(video_id)[1][1:]
1425 video_id = os.path.splitext(video_id)[0]
1427 # it's tempting to parse this further, but you would
1428 # have to take into account all the variations like
1429 # Video Title - Site Name
1430 # Site Name | Video Title
1431 # Video Title - Tagline | Site Name
1432 # and so on and so forth; it's just not practical
1433 mobj = re.search(r'<title>(.*)</title>', webpage)
1435 raise ExtractorError(u'Unable to extract title')
1436 video_title = mobj.group(1)
1438 # video uploader is domain name
1439 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
1441 raise ExtractorError(u'Unable to extract title')
1442 video_uploader = mobj.group(1)
1447 'uploader': video_uploader,
1448 'upload_date': None,
1449 'title': video_title,
1450 'ext': video_extension,
# NOTE(review): sampled excerpt — the initializers for video_ids/pagenum/limit
# and the try: line before urlopen are elided. Comments describe visible code only.
# Search IE backed by the GData v2 JSON-C API; pages through results 50 at a time.
1454 class YoutubeSearchIE(SearchInfoExtractor):
1455 """Information Extractor for YouTube search queries."""
1456 _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
1458 IE_NAME = u'youtube:search'
1459 _SEARCH_KEY = 'ytsearch'
1461 def report_download_page(self, query, pagenum):
1462 """Report attempt to download search page with given number."""
1463 query = query.decode(preferredencoding())
1464 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
1466 def _get_n_results(self, query, n):
1467 """Get a specified number of results for a query"""
# Loop until we have fetched enough pages to cover the (clamped) limit.
1473 while (50 * pagenum) < limit:
1474 self.report_download_page(query, pagenum+1)
# GData start-index is 1-based.
1475 result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
1476 request = compat_urllib_request.Request(result_url)
1478 data = compat_urllib_request.urlopen(request).read().decode('utf-8')
1479 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1480 raise ExtractorError(u'Unable to download API page: %s' % compat_str(err))
1481 api_response = json.loads(data)['data']
1483 if not 'items' in api_response:
1484 raise ExtractorError(u'[youtube] No video results')
1486 new_ids = list(video['id'] for video in api_response['items'])
1487 video_ids += new_ids
# Clamp the target to what the API says is actually available.
1489 limit = min(n, api_response['totalItems'])
1492 if len(video_ids) > n:
1493 video_ids = video_ids[:n]
1494 videos = [self.url_result('http://www.youtube.com/watch?v=%s' % id, 'Youtube') for id in video_ids]
1495 return self.playlist_result(videos, query)
# NOTE(review): sampled excerpt — the `res` playlist-dict initializer and the
# entry-dict framing around 'url' are elided. Comments describe visible code only.
# Search IE that scrapes Google Video web search result pages, 10 per page.
1498 class GoogleSearchIE(SearchInfoExtractor):
1499 """Information Extractor for Google Video search queries."""
# Presence of the "next" pager link means more result pages exist.
1500 _MORE_PAGES_INDICATOR = r'id="pnnext" class="pn"'
1502 IE_NAME = u'video.google:search'
1503 _SEARCH_KEY = 'gvsearch'
1505 def _get_n_results(self, query, n):
1506 """Get a specified number of results for a query"""
1509 '_type': 'playlist',
1514 for pagenum in itertools.count(1):
1515 result_url = u'http://www.google.com/search?tbm=vid&q=%s&start=%s&hl=en' % (compat_urllib_parse.quote_plus(query), pagenum*10)
1516 webpage = self._download_webpage(result_url, u'gvsearch:' + query,
1517 note='Downloading result page ' + str(pagenum))
# Each result link lives in an <h3 class="r"> anchor.
1519 for mobj in re.finditer(r'<h3 class="r"><a href="([^"]+)"', webpage):
1522 'url': mobj.group(1)
1524 res['entries'].append(e)
# Stop once n results are covered or the pager shows no further pages.
1526 if (pagenum * 10 > n) or not re.search(self._MORE_PAGES_INDICATOR, webpage):
# NOTE(review): sampled excerpt — the `res` initializer and the assignment of
# `m` (presumably info[u'm'] pagination metadata) are elided. Comments describe
# visible code only.
# Search IE for Yahoo! video search; the endpoint returns JSON with HTML
# snippets in 'results', 30 per page.
1529 class YahooSearchIE(SearchInfoExtractor):
1530 """Information Extractor for Yahoo! Video search queries."""
1533 IE_NAME = u'screen.yahoo:search'
1534 _SEARCH_KEY = 'yvsearch'
1536 def _get_n_results(self, query, n):
1537 """Get a specified number of results for a query"""
1540 '_type': 'playlist',
1544 for pagenum in itertools.count(0):
1545 result_url = u'http://video.search.yahoo.com/search/?p=%s&fr=screen&o=js&gs=0&b=%d' % (compat_urllib_parse.quote_plus(query), pagenum * 30)
1546 webpage = self._download_webpage(result_url, query,
1547 note='Downloading results page '+str(pagenum+1))
1548 info = json.loads(webpage)
1550 results = info[u'results']
1552 for (i, r) in enumerate(results):
# Stop collecting once we hit the requested count within this page.
1553 if (pagenum * 30) +i >= n:
# Each result snippet embeds a screen.yahoo.com page URL.
1555 mobj = re.search(r'(?P<url>screen\.yahoo\.com/.*?-\d*?\.html)"', r)
1556 e = self.url_result('http://' + mobj.group('url'), 'Yahoo')
1557 res['entries'].append(e)
# NOTE(review): `m` is defined on an elided line — verify its source before editing.
1558 if (pagenum * 30 +i >= n) or (m[u'last'] >= (m[u'total'] -1 )):
# NOTE(review): sampled excerpt — parts of the verbose _VALID_URL, _MAX_RESULTS,
# the videos/page_num loop header, try: and "if mobj is None:" lines are elided.
# Comments describe visible code only.
# Playlist IE using the GData v2 JSON API; collects (position, url) pairs and
# returns them sorted by playlist position.
1564 class YoutubePlaylistIE(InfoExtractor):
1565 """Information Extractor for YouTube playlists."""
1567 _VALID_URL = r"""(?:
1572 (?:course|view_play_list|my_playlists|artist|playlist|watch)
1573 \? (?:.*?&)*? (?:p|a|list)=
1576 ((?:PL|EC|UU)?[0-9A-Za-z-_]{10,})
1579 ((?:PL|EC|UU)[0-9A-Za-z-_]{10,})
1581 _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json'
1583 IE_NAME = u'youtube:playlist'
1586 def suitable(cls, url):
1587 """Receives a URL and returns True if suitable for this IE."""
# Overridden because _VALID_URL is written with re.VERBOSE.
1588 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
1590 def _real_extract(self, url):
1591 # Extract playlist id
1592 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
1594 raise ExtractorError(u'Invalid URL: %s' % url)
1596 # Download playlist videos from API
# Either alternation of _VALID_URL may have captured the id.
1597 playlist_id = mobj.group(1) or mobj.group(2)
1602 url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, self._MAX_RESULTS * (page_num - 1) + 1)
1603 page = self._download_webpage(url, playlist_id, u'Downloading page #%s' % page_num)
1606 response = json.loads(page)
1607 except ValueError as err:
1608 raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))
1610 if 'feed' not in response:
1611 raise ExtractorError(u'Got a malformed response from YouTube API')
1612 playlist_title = response['feed']['title']['$t']
1613 if 'entry' not in response['feed']:
1614 # Number of videos is a multiple of self._MAX_RESULTS
# Keep (position, src) so the final ordering matches playlist order;
# entries without 'content' (e.g. deleted videos) are skipped.
1617 videos += [ (entry['yt$position']['$t'], entry['content']['src'])
1618 for entry in response['feed']['entry']
1619 if 'content' in entry ]
# A short page means this was the last page.
1621 if len(response['feed']['entry']) < self._MAX_RESULTS:
1625 videos = [v[1] for v in sorted(videos)]
1627 url_results = [self.url_result(url, 'Youtube') for url in videos]
1628 return [self.playlist_result(url_results, playlist_id, playlist_title)]
# NOTE(review): sampled excerpt — ids_in_page/video_ids/pagenum initializers,
# a return in extract_videos_from_page, and the while/break framing of the
# ajax loop are elided. Comments describe visible code only.
# Channel IE: scrapes the first HTML channel page, then pages through the
# JSON channel_ajax endpoint while the load-more indicator is present.
1631 class YoutubeChannelIE(InfoExtractor):
1632 """Information Extractor for YouTube channels."""
1634 _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
1635 _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
1636 _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
1637 _MORE_PAGES_URL = 'http://www.youtube.com/channel_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
1638 IE_NAME = u'youtube:channel'
# Collect unique video ids from /watch?v= links in a page, preserving order.
1640 def extract_videos_from_page(self, page):
1642 for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
1643 if mobj.group(1) not in ids_in_page:
1644 ids_in_page.append(mobj.group(1))
1647 def _real_extract(self, url):
1648 # Extract channel id
1649 mobj = re.match(self._VALID_URL, url)
1651 raise ExtractorError(u'Invalid URL: %s' % url)
1653 # Download channel page
1654 channel_id = mobj.group(1)
1658 url = self._TEMPLATE_URL % (channel_id, pagenum)
1659 page = self._download_webpage(url, channel_id,
1660 u'Downloading page #%s' % pagenum)
1662 # Extract video identifiers
1663 ids_in_page = self.extract_videos_from_page(page)
1664 video_ids.extend(ids_in_page)
1666 # Download any subsequent channel pages using the json-based channel_ajax query
1667 if self._MORE_PAGES_INDICATOR in page:
1669 pagenum = pagenum + 1
1671 url = self._MORE_PAGES_URL % (pagenum, channel_id)
1672 page = self._download_webpage(url, channel_id,
1673 u'Downloading page #%s' % pagenum)
# Ajax responses are JSON; the video links live in 'content_html'.
1675 page = json.loads(page)
1677 ids_in_page = self.extract_videos_from_page(page['content_html'])
1678 video_ids.extend(ids_in_page)
# Loop ends when the widget HTML no longer advertises more pages.
1680 if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']:
1683 self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))
1685 urls = ['http://www.youtube.com/watch?v=%s' % id for id in video_ids]
1686 url_entries = [self.url_result(url, 'Youtube') for url in urls]
1687 return [self.playlist_result(url_entries, channel_id)]
# NOTE(review): sampled excerpt — the while-loop header, video_ids/pagenum/
# ids_in_page initializers, the "if mobj is None:" guard and the break are
# elided. Comments describe visible code only.
# User IE: pages through the GData uploads feed 50 ids at a time, scanning
# raw feed text with _VIDEO_INDICATOR rather than parsing XML.
1690 class YoutubeUserIE(InfoExtractor):
1691 """Information Extractor for YouTube users."""
1693 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
1694 _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
1695 _GDATA_PAGE_SIZE = 50
1696 _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
1697 _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
1698 IE_NAME = u'youtube:user'
1700 def _real_extract(self, url):
1702 mobj = re.match(self._VALID_URL, url)
1704 raise ExtractorError(u'Invalid URL: %s' % url)
1706 username = mobj.group(1)
1708 # Download video ids using YouTube Data API. Result size per
1709 # query is limited (currently to 50 videos) so we need to query
1710 # page by page until there are no video ids - it means we got
# GData start-index is 1-based.
1717 start_index = pagenum * self._GDATA_PAGE_SIZE + 1
1719 gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
1720 page = self._download_webpage(gdata_url, username,
1721 u'Downloading video ids from %d to %d' % (start_index, start_index + self._GDATA_PAGE_SIZE))
1723 # Extract video identifiers
# De-duplicate ids within a page while preserving first-seen order.
1726 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1727 if mobj.group(1) not in ids_in_page:
1728 ids_in_page.append(mobj.group(1))
1730 video_ids.extend(ids_in_page)
1732 # A little optimization - if current page is not
1733 # "full", ie. does not contain PAGE_SIZE video ids then
1734 # we can assume that this page is the last one - there
1735 # are no more ids on further pages - no need to query
1738 if len(ids_in_page) < self._GDATA_PAGE_SIZE:
1743 urls = ['http://www.youtube.com/watch?v=%s' % video_id for video_id in video_ids]
1744 url_results = [self.url_result(url, 'Youtube') for url in urls]
1745 return [self.playlist_result(url_results, playlist_title = username)]
# NOTE(review): sampled excerpt — _PAGE_SIZE, the while-loop header, list
# initializers and the break are elided. Comments describe visible code only.
# blip.tv user IE: resolves the numeric users_id from the profile page, then
# pages through the mobile episode-list endpoint collecting video hrefs.
1748 class BlipTVUserIE(InfoExtractor):
1749 """Information Extractor for blip.tv users."""
1751 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
1753 IE_NAME = u'blip.tv:user'
1755 def _real_extract(self, url):
1757 mobj = re.match(self._VALID_URL, url)
1759 raise ExtractorError(u'Invalid URL: %s' % url)
1761 username = mobj.group(1)
1763 page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'
1765 page = self._download_webpage(url, username, u'Downloading user page')
# The profile HTML carries the numeric user id needed by the list endpoint.
1766 mobj = re.search(r'data-users-id="([^"]+)"', page)
1767 page_base = page_base % mobj.group(1)
1770 # Download video ids using BlipTV Ajax calls. Result size per
1771 # query is limited (currently to 12 videos) so we need to query
1772 # page by page until there are no video ids - it means we got
1779 url = page_base + "&page=" + str(pagenum)
1780 page = self._download_webpage(url, username,
1781 u'Downloading video ids from page %d' % pagenum)
1783 # Extract video identifiers
# Collect unique, HTML-unescaped hrefs in first-seen order.
1786 for mobj in re.finditer(r'href="/([^"]+)"', page):
1787 if mobj.group(1) not in ids_in_page:
1788 ids_in_page.append(unescapeHTML(mobj.group(1)))
1790 video_ids.extend(ids_in_page)
1792 # A little optimization - if current page is not
1793 # "full", ie. does not contain PAGE_SIZE video ids then
1794 # we can assume that this page is the last one - there
1795 # are no more ids on further pages - no need to query
1798 if len(ids_in_page) < self._PAGE_SIZE:
1803 urls = [u'http://blip.tv/%s' % video_id for video_id in video_ids]
1804 url_entries = [self.url_result(url, 'BlipTV') for url in urls]
1805 return [self.playlist_result(url_entries, playlist_title = username)]
# NOTE(review): sampled excerpt — the try: before urlopen and the "return [{"
# framing are elided. Comments describe visible code only.
# depositfiles.com IE: simulates pressing 'Free download' via POST, then
# scrapes the fileshare form action for the real file URL.
1808 class DepositFilesIE(InfoExtractor):
1809 """Information extractor for depositfiles.com"""
1811 _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
1813 def _real_extract(self, url):
1814 file_id = url.split('/')[-1]
1815 # Rebuild url in english locale
1816 url = 'http://depositfiles.com/en/files/' + file_id
1818 # Retrieve file webpage with 'Free download' button pressed
# POSTing gateway_result=1 emulates the free-download form submission.
1819 free_download_indication = { 'gateway_result' : '1' }
1820 request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
1822 self.report_download_webpage(file_id)
1823 webpage = compat_urllib_request.urlopen(request).read()
1824 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1825 raise ExtractorError(u'Unable to retrieve file webpage: %s' % compat_str(err))
1827 # Search for the real file URL
1828 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
1829 if (mobj is None) or (mobj.group(1) is None):
1830 # Try to figure out reason of the error.
# Surface the site's "Attention..." restriction notice (whitespace-collapsed)
# as the error message when the download URL is absent.
1831 mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
1832 if (mobj is not None) and (mobj.group(1) is not None):
1833 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
1834 raise ExtractorError(u'%s' % restriction_message)
1836 raise ExtractorError(u'Unable to extract download URL from: %s' % url)
1838 file_url = mobj.group(1)
1839 file_extension = os.path.splitext(file_url)[1][1:]
1841 # Search for file title
1842 file_title = self._search_regex(r'<b title="(.*?)">', webpage, u'title')
# NOTE(review): .decode('utf-8') implies Python 2 byte strings — confirm.
1845 'id': file_id.decode('utf-8'),
1846 'url': file_url.decode('utf-8'),
1848 'upload_date': None,
1849 'title': file_title,
1850 'ext': file_extension.decode('utf-8'),
# NOTE(review): sampled excerpt — useremail/password initialization, the
# login_form construction, try: lines, some if/elif guards and the "return [{"
# framing are elided. Comments describe visible code only.
# Facebook IE: optional login via credentials or .netrc, then extraction of
# the swf addVariable JSON blob carrying video_data (hd_src/sd_src).
1854 class FacebookIE(InfoExtractor):
1855 """Information Extractor for Facebook"""
1857 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
1858 _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
1859 _NETRC_MACHINE = 'facebook'
1860 IE_NAME = u'facebook'
1862 def report_login(self):
1863 """Report attempt to log in."""
1864 self.to_screen(u'Logging in')
# Optional login step; failures only warn, they never abort extraction.
1866 def _real_initialize(self):
1867 if self._downloader is None:
1872 downloader_params = self._downloader.params
1874 # Attempt to use provided username and password or .netrc data
1875 if downloader_params.get('username', None) is not None:
1876 useremail = downloader_params['username']
1877 password = downloader_params['password']
1878 elif downloader_params.get('usenetrc', False):
1880 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
1881 if info is not None:
1885 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
1886 except (IOError, netrc.NetrcParseError) as err:
1887 self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
1890 if useremail is None:
1899 request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
1902 login_results = compat_urllib_request.urlopen(request).read()
# A login form in the response means authentication did not succeed.
1903 if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
1904 self._downloader.report_warning(u'unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
1906 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1907 self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
1910 def _real_extract(self, url):
1911 mobj = re.match(self._VALID_URL, url)
1913 raise ExtractorError(u'Invalid URL: %s' % url)
1914 video_id = mobj.group('ID')
1916 url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
1917 webpage = self._download_webpage(url, video_id)
# The JSON array of swf variables sits between these two literal JS fragments.
1919 BEFORE = '{swf.addParam(param[0], param[1]);});\n'
1920 AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
1921 m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
1923 raise ExtractorError(u'Cannot parse data')
1924 data = dict(json.loads(m.group(1)))
# 'params' is URL-encoded JSON; video_data[0] carries the stream info.
1925 params_raw = compat_urllib_parse.unquote(data['params'])
1926 params = json.loads(params_raw)
1927 video_data = params['video_data'][0]
# Prefer the HD source, falling back to SD.
1928 video_url = video_data.get('hd_src')
1930 video_url = video_data['sd_src']
1932 raise ExtractorError(u'Cannot find video URL')
1933 video_duration = int(video_data['video_duration'])
1934 thumbnail = video_data['thumbnail_src']
1936 video_title = self._html_search_regex('<h2 class="uiHeaderTitle">([^<]+)</h2>',
1941 'title': video_title,
1944 'duration': video_duration,
1945 'thumbnail': thumbnail,
# NOTE(review): elided snippet — embedded line numbers jump, so `try:` headers,
# `if ... is None:` guards and some dict-literal lines are missing from view.
1950 class BlipTVIE(InfoExtractor):
1951 """Information extractor for blip.tv"""
# _URL_EXT extracts a filename extension from a media URL.
1953 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv/((.+/)|(play/)|(api\.swf#))(.+)$'
1954 _URL_EXT = r'^.*\.([a-z0-9]+)$'
1955 IE_NAME = u'blip.tv'
# Progress helper: announces that the URL points directly at a media file.
1957 def report_direct_download(self, title):
1958 """Report information extraction."""
1959 self.to_screen(u'%s: Direct download detected' % title)
# Resolve /play/ and api.swf URLs to a canonical page, then read the JSON API.
1961 def _real_extract(self, url):
1962 mobj = re.match(self._VALID_URL, url)
# (elided guard) raised when the URL does not match _VALID_URL
1964 raise ExtractorError(u'Invalid URL: %s' % url)
1966 # See https://github.com/rg3/youtube-dl/issues/857
# api.swf fragment ids are rewritten to /play/g_<id> URLs.
1967 api_mobj = re.match(r'http://a\.blip\.tv/api\.swf#(?P<video_id>[\d\w]+)', url)
1968 if api_mobj is not None:
1969 url = 'http://blip.tv/play/g_%s' % api_mobj.group('video_id')
1970 urlp = compat_urllib_parse_urlparse(url)
# /play/ URLs redirect; the real file id is in the redirect's URL fragment.
1971 if urlp.path.startswith('/play/'):
1972 request = compat_urllib_request.Request(url)
1973 response = compat_urllib_request.urlopen(request)
1974 redirecturl = response.geturl()
1975 rurlp = compat_urllib_parse_urlparse(redirecturl)
1976 file_id = compat_parse_qs(rurlp.fragment)['file'][0].rpartition('/')[2]
1977 url = 'http://blip.tv/a/a-' + file_id
# Re-enter extraction with the canonical URL.
1978 return self._real_extract(url)
# (elided) `cchar` selection ('?' or '&') depending on the URL shape
1985 json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
1986 request = compat_urllib_request.Request(json_url)
# blip.tv serves different content to iTunes; spoof its User-Agent.
1987 request.add_header('User-Agent', 'iTunes/10.6.1')
1988 self.report_extraction(mobj.group(1))
1991 urlh = compat_urllib_request.urlopen(request)
# Direct media response: synthesize the info dict from the URL's basename.
1992 if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
1993 basename = url.split('/')[-1]
1994 title,ext = os.path.splitext(basename)
# NOTE(review): .decode('UTF-8') implies Python 2 byte strings here.
1995 title = title.decode('UTF-8')
1996 ext = ext.replace('.', '')
1997 self.report_direct_download(title)
# (elided) start of the direct-download info dict
2002 'upload_date': None,
2007 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2008 raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
# Otherwise parse the JSON API response.
2009 if info is None: # Regular URL
2011 json_code_bytes = urlh.read()
2012 json_code = json_code_bytes.decode('utf-8')
2013 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2014 raise ExtractorError(u'Unable to read video info webpage: %s' % compat_str(err))
2017 json_data = json.loads(json_code)
# Some responses wrap the payload in a 'Post' key.
2018 if 'Post' in json_data:
2019 data = json_data['Post']
# Normalize 'm-d-y H:M(am|pm)' timestamps to YYYYMMDD.
2023 upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
2024 video_url = data['media']['url']
2025 umobj = re.match(self._URL_EXT, video_url)
# (elided guard) when the media URL has no recognizable extension
2027 raise ValueError('Can not determine filename extension')
2028 ext = umobj.group(1)
# (elided) start of the info dict built from the JSON payload
2031 'id': data['item_id'],
2033 'uploader': data['display_name'],
2034 'upload_date': upload_date,
2035 'title': data['title'],
2037 'format': data['media']['mimeType'],
2038 'thumbnail': data['thumbnailUrl'],
2039 'description': data['description'],
2040 'player_url': data['embedUrl'],
# Downstream downloader must keep using the iTunes User-Agent.
2041 'user_agent': 'iTunes/10.6.1',
2043 except (ValueError,KeyError) as err:
2044 raise ExtractorError(u'Unable to parse video information: %s' % repr(err))
# NOTE(review): elided snippet — embedded line numbers jump; loop headers,
# `try:` lines and several assignments are missing from view.
2049 class MyVideoIE(InfoExtractor):
2050 """Information Extractor for myvideo.de."""
2052 _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
2053 IE_NAME = u'myvideo'
2055 # Original Code from: https://github.com/dersphere/plugin.video.myvideo_de.git
2056 # Released into the Public Domain by Tristan Fischer on 2013-05-19
2057 # https://github.com/rg3/youtube-dl/pull/842
# RC4 stream cipher: key-scheduling loop below, PRGA loop partially elided.
2058 def __rc4crypt(self,data, key):
# (elided) x = 0 initialisation presumably precedes this loop
2060 box = list(range(256))
2061 for i in list(range(256)):
2062 x = (x + box[i] + compat_ord(key[i % len(key)])) % 256
2063 box[i], box[x] = box[x], box[i]
# (elided) PRGA loop header iterating over `data` as `char`
2069 y = (y + box[x]) % 256
2070 box[x], box[y] = box[y], box[x]
2071 out += chr(compat_ord(char) ^ box[(box[x] + box[y]) % 256])
# (elided) md5 helper `def` line; returns the hex digest as bytes
2075 return hashlib.md5(s).hexdigest().encode()
2077 def _real_extract(self,url):
2078 mobj = re.match(self._VALID_URL, url)
# (elided guard) raised when the URL does not match _VALID_URL
2080 raise ExtractorError(u'invalid URL: %s' % url)
2082 video_id = mobj.group(1)
# GK: doubly-base64-encoded key material used to derive the RC4 key below.
2085 b'WXpnME1EZGhNRGhpTTJNM01XVmhOREU0WldNNVpHTTJOakpt'
2086 b'TW1FMU5tVTBNR05pWkRaa05XRXhNVFJoWVRVd1ptSXhaVEV3'
2087 b'TnpsbA0KTVRkbU1tSTRNdz09'
2091 webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
2092 webpage = self._download_webpage(webpage_url, video_id)
# Fast path: a plain <source> tag means a direct .flv URL, no decryption needed.
2094 mobj = re.search('source src=\'(.+?)[.]([^.]+)\'', webpage)
2095 if mobj is not None:
2096 self.report_extraction(video_id)
2097 video_url = mobj.group(1) + '.flv'
2099 video_title = self._html_search_regex('<title>([^<]+)</title>',
2102 video_ext = self._search_regex('[.](.+?)$', video_url, u'extension')
# (elided) start of the fast-path info dict / return
2108 'upload_date': None,
2109 'title': video_title,
# Slow path: flashvars carry an encrypted XML blob location.
2114 mobj = re.search('var flashvars={(.+?)}', webpage)
# (elided guard) raised when flashvars cannot be located
2116 raise ExtractorError(u'Unable to extract video')
# Collect flashvars key/value pairs; '_encxml' holds the encrypted-XML URL.
2121 for (a, b) in re.findall('(.+?):\'(.+?)\',?', sec):
2122 if not a == '_encxml':
# (elided) params[a] = b branch; else:
2125 encxml = compat_urllib_parse.unquote(b)
2126 if not params.get('domain'):
2127 params['domain'] = 'www.myvideo.de'
2128 xmldata_url = '%s?%s' % (encxml, compat_urllib_parse.urlencode(params))
# The MTV player variant is avoided by rewriting to the plain player endpoint.
2129 if 'flash_playertype=MTV' in xmldata_url:
2130 self._downloader.report_warning(u'avoiding MTV player')
# (elided) xmldata_url = ( — start of the replacement URL expression
2132 'http://www.myvideo.de/dynamic/get_player_video_xml.php'
2133 '?flash_playertype=D&ID=%s&_countlimit=4&autorun=yes'
# Response looks like 'something=<hex>'; take the hex payload after '='.
2137 enc_data = self._download_webpage(xmldata_url, video_id).split('=')[1]
2138 enc_data_b = binascii.unhexlify(enc_data)
# RC4 key = md5(double-base64-decoded GK + video id) — see __rc4crypt above.
2140 base64.b64decode(base64.b64decode(GK)) +
2142 str(video_id).encode('utf-8')
2145 dec_data = self.__rc4crypt(enc_data_b, sk)
2148 self.report_extraction(video_id)
# RTMP case: connectionurl present in the decrypted XML.
2151 mobj = re.search('connectionurl=\'(.*?)\'', dec_data)
2153 video_url = compat_urllib_parse.unquote(mobj.group(1))
# rtmpe is forced down to rtmpt for this CDN variant.
2154 if 'myvideo2flash' in video_url:
2155 self._downloader.report_warning(u'forcing RTMPT ...')
2156 video_url = video_url.replace('rtmpe://', 'rtmpt://')
# (elided) else-branch header for the non-RTMP case
2159 # extract non rtmp videos
2160 mobj = re.search('path=\'(http.*?)\' source=\'(.*?)\'', dec_data)
# (elided guard) raised when neither URL form is found
2162 raise ExtractorError(u'unable to extract url')
2163 video_url = compat_urllib_parse.unquote(mobj.group(1)) + compat_urllib_parse.unquote(mobj.group(2))
2165 video_file = self._search_regex('source=\'(.*?)\'', dec_data, u'video file')
2166 video_file = compat_urllib_parse.unquote(video_file)
# Non-f4m files become an rtmp play path 'prefix:path'; f4m maps to an m3u8 HLS URL.
2168 if not video_file.endswith('f4m'):
2169 ppath, prefix = video_file.split('.')
2170 video_playpath = '%s:%s' % (prefix, ppath)
2171 video_hls_playlist = ''
# (elided) else-branch: video_playpath = '' and filepath extraction
2174 video_hls_playlist = (
2175 video_filepath + video_file
2176 ).replace('.f4m', '.m3u8')
2178 video_swfobj = self._search_regex('swfobject.embedSWF\(\'(.+?)\'', webpage, u'swfobj')
2179 video_swfobj = compat_urllib_parse.unquote(video_swfobj)
2181 video_title = self._html_search_regex("<h1(?: class='globalHd')?>(.*?)</h1>",
# (elided) start of the returned info dict
2187 'tc_url': video_url,
2189 'upload_date': None,
2190 'title': video_title,
2192 'play_path': video_playpath,
2193 'video_file': video_file,
2194 'video_hls_playlist': video_hls_playlist,
2195 'player_url': video_swfobj,
# NOTE(review): elided snippet — embedded line numbers jump; several loop/guard
# lines and dict literals are missing from view.
2199 class ComedyCentralIE(InfoExtractor):
2200 """Information extractor for The Daily Show and Colbert Report """
2202 # urls can be abbreviations like :thedailyshow or :colbert
2203 # urls for episodes like:
2204 # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
2205 # or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
2206 # or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
# Verbose regex: shortnames, full-episode pages, clip pages (clipID/cntitle),
# and Daily Show watch pages (date/tdstitle).
2207 _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
2208 |(https?://)?(www\.)?
2209 (?P<showname>thedailyshow|colbertnation)\.com/
2210 (full-episodes/(?P<episode>.*)|
2212 (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
2213 |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
# Known bitrates, lowest preference first (highest picked by default below).
2216 _available_formats = ['3500', '2200', '1700', '1200', '750', '400']
2218 _video_extensions = {
# (elided) _video_extensions entries; _video_dimensions maps bitrate -> WxH label
2226 _video_dimensions = {
# Overrides the base suitable() because _VALID_URL needs re.VERBOSE.
2236 def suitable(cls, url):
2237 """Receives a URL and returns True if suitable for this IE."""
2238 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
# Print the bitrate/extension/dimension table for --list-formats.
2240 def _print_formats(self, formats):
2241 print('Available formats:')
# (elided) loop header `for x in formats:`
2243 print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))
2246 def _real_extract(self, url):
2247 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
# (elided guard) raised when the URL does not match
2249 raise ExtractorError(u'Invalid URL: %s' % url)
# Expand :tds / :colbert shortnames to the newest-full-episode URL.
2251 if mobj.group('shortname'):
2252 if mobj.group('shortname') in ('tds', 'thedailyshow'):
2253 url = u'http://www.thedailyshow.com/full-episodes/'
# (elided) else branch for the colbert shortnames
2255 url = u'http://www.colbertnation.com/full-episodes/'
2256 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2257 assert mobj is not None
# Clip URLs carry their title in different groups per show.
2259 if mobj.group('clip'):
2260 if mobj.group('showname') == 'thedailyshow':
2261 epTitle = mobj.group('tdstitle')
# (elided) else branch
2263 epTitle = mobj.group('cntitle')
# (elided) dlNewest / episode-page handling, partially visible below
2266 dlNewest = not mobj.group('episode')
2268 epTitle = mobj.group('showname')
2270 epTitle = mobj.group('episode')
2272 self.report_extraction(epTitle)
2273 webpage,htmlHandle = self._download_webpage_handle(url, epTitle)
# When dlNewest, the site redirects to the current episode; re-parse that URL.
2275 url = htmlHandle.geturl()
2276 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
# (elided guard)
2278 raise ExtractorError(u'Invalid redirected URL: ' + url)
2279 if mobj.group('episode') == '':
2280 raise ExtractorError(u'Redirected URL is still not specific: ' + url)
2281 epTitle = mobj.group('episode')
# Locate mtvnservices player URIs embedded in the page.
2283 mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)
2285 if len(mMovieParams) == 0:
2286 # The Colbert Report embeds the information in a without
2287 # a URL prefix; so extract the alternate reference
2288 # and then add the URL prefix manually.
2290 altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
2291 if len(altMovieParams) == 0:
2292 raise ExtractorError(u'unable to find Flash URL in webpage ' + url)
# (elided) else branch
2294 mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]
2296 uri = mMovieParams[0][1]
# MRSS index lists the episode's parts (acts).
2297 indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
2298 indexXml = self._download_webpage(indexUrl, epTitle,
2299 u'Downloading show index',
2300 u'unable to download episode index')
2304 idoc = xml.etree.ElementTree.fromstring(indexXml)
2305 itemEls = idoc.findall('.//item')
# One iteration per episode part; each gets its own info dict.
2306 for partNum,itemEl in enumerate(itemEls):
2307 mediaId = itemEl.findall('./guid')[0].text
2308 shortMediaId = mediaId.split(':')[-1]
2309 showId = mediaId.split(':')[-2].replace('.com', '')
2310 officialTitle = itemEl.findall('./title')[0].text
2311 officialDate = unified_strdate(itemEl.findall('./pubDate')[0].text)
# Per-part mediaGen config lists the available renditions.
2313 configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
2314 compat_urllib_parse.urlencode({'uri': mediaId}))
2315 configXml = self._download_webpage(configUrl, epTitle,
2316 u'Downloading configuration for %s' % shortMediaId)
2318 cdoc = xml.etree.ElementTree.fromstring(configXml)
# (elided) turls = [] accumulator presumably initialised before this loop
2320 for rendition in cdoc.findall('.//rendition'):
2321 finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
# (elided) turls.append(finfo); empty-turls guard below
2325 self._downloader.report_error(u'unable to download ' + mediaId + ': No videos found')
2328 if self._downloader.params.get('listformats', None):
2329 self._print_formats([i[0] for i in turls])
# (elided) return after listing formats
2332 # For now, just pick the highest bitrate
2333 format,rtmp_video_url = turls[-1]
2335 # Get the format arg from the arg stream
2336 req_format = self._downloader.params.get('format', None)
2338 # Select format if we can find one
# (elided) loop over turls matching req_format
2341 format, rtmp_video_url = f, v
# Rewrite the RTMP URL into a direct HTTP URL on the llnwd CDN.
2344 m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
# (elided guard)
2346 raise ExtractorError(u'Cannot transform RTMP url')
2347 base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
2348 video_url = base + m.group('finalid')
2350 effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
# (elided) start of the per-part info dict
2355 'upload_date': officialDate,
2360 'description': officialTitle,
2362 results.append(info)
# (elided) return results
# NOTE(review): elided snippet — guard lines and the info-dict opener are
# missing from view (embedded line numbers jump).
2367 class EscapistIE(InfoExtractor):
2368 """Information extractor for The Escapist """
2370 _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
2371 IE_NAME = u'escapist'
# Scrape meta tags for metadata, then fetch the player's JS config for the
# actual media URL.
2373 def _real_extract(self, url):
2374 mobj = re.match(self._VALID_URL, url)
# (elided guard) raised when the URL does not match _VALID_URL
2376 raise ExtractorError(u'Invalid URL: %s' % url)
2377 showName = mobj.group('showname')
2378 videoId = mobj.group('episode')
2380 self.report_extraction(videoId)
2381 webpage = self._download_webpage(url, videoId)
2383 videoDesc = self._html_search_regex('<meta name="description" content="([^"]*)"',
2384 webpage, u'description', fatal=False)
2386 imgUrl = self._html_search_regex('<meta property="og:image" content="([^"]*)"',
2387 webpage, u'thumbnail', fatal=False)
2389 playerUrl = self._html_search_regex('<meta property="og:video" content="([^"]*)"',
2390 webpage, u'player url')
# Page titles look like 'Show : Episode'; keep only the episode part.
# NOTE(review): the u'player url' label here looks copy-pasted from above —
# it describes the title, not the player URL.
2392 title = self._html_search_regex('<meta name="title" content="([^"]*)"',
2393 webpage, u'player url').split(' : ')[-1]
# The config URL is passed URL-encoded in the player URL's query string.
2395 configUrl = self._search_regex('config=(.*)$', playerUrl, u'config url')
2396 configUrl = compat_urllib_parse.unquote(configUrl)
2398 configJSON = self._download_webpage(configUrl, videoId,
2399 u'Downloading configuration',
2400 u'unable to download configuration')
2402 # Technically, it's JavaScript, not JSON
# Single-quote -> double-quote rewrite makes the JS object parseable as JSON.
2403 configJSON = configJSON.replace("'", '"')
2406 config = json.loads(configJSON)
2407 except (ValueError,) as err:
2408 raise ExtractorError(u'Invalid JSON in configuration file: ' + compat_str(err))
2410 playlist = config['playlist']
# Entry 1 of the playlist carries the media URL (entry 0 is something else).
2411 videoUrl = playlist[1]['url']
# (elided) start of the returned info dict
2416 'uploader': showName,
2417 'upload_date': None,
2420 'thumbnail': imgUrl,
2421 'description': videoDesc,
2422 'player_url': playerUrl,
# NOTE(review): elided snippet — `try:` headers, the info-dict opener and the
# final return are missing from view.
2427 class CollegeHumorIE(InfoExtractor):
2428 """Information extractor for collegehumor.com"""
2431 _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
2432 IE_NAME = u'collegehumor'
# Progress helper for the second (manifest) download step.
2434 def report_manifest(self, video_id):
2435 """Report information extraction."""
2436 self.to_screen(u'%s: Downloading XML manifest' % video_id)
# Two-step extraction: moogaloop metadata XML, then the f4m manifest whose
# media node yields the final segment URL.
2438 def _real_extract(self, url):
2439 mobj = re.match(self._VALID_URL, url)
# (elided guard) raised when the URL does not match _VALID_URL
2441 raise ExtractorError(u'Invalid URL: %s' % url)
2442 video_id = mobj.group('videoid')
# (elided) info dict initialised here with id/uploader/upload_date fields
2447 'upload_date': None,
2450 self.report_extraction(video_id)
2451 xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
2453 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
2454 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2455 raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
2457 mdoc = xml.etree.ElementTree.fromstring(metaXml)
# findall(...)[0] raises IndexError on missing nodes; caught below (elided).
2459 videoNode = mdoc.findall('./video')[0]
2460 info['description'] = videoNode.findall('./description')[0].text
2461 info['title'] = videoNode.findall('./caption')[0].text
2462 info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
2463 manifest_url = videoNode.findall('./file')[0].text
2465 raise ExtractorError(u'Invalid metadata XML file')
# hdcore parameter is required by the Adobe HTTP Dynamic Streaming server.
2467 manifest_url += '?hdcore=2.10.3'
2468 self.report_manifest(video_id)
2470 manifestXml = compat_urllib_request.urlopen(manifest_url).read()
2471 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2472 raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
2474 adoc = xml.etree.ElementTree.fromstring(manifestXml)
# f4m namespace-qualified lookups; media/@url + id build the segment path.
2476 media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
2477 node_id = media_node.attrib['url']
2478 video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
2479 except IndexError as err:
2480 raise ExtractorError(u'Invalid manifest file')
2482 url_pr = compat_urllib_parse_urlparse(manifest_url)
# Hard-coded first segment/fragment; video_id[:-2] strips a 2-char suffix.
2483 url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'
# NOTE(review): elided snippet — the guard after re.match and the info-dict
# opener/return are missing from view.
2490 class XVideosIE(InfoExtractor):
2491 """Information extractor for xvideos.com"""
2493 _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
2494 IE_NAME = u'xvideos'
# Single-page scrape: flv_url (URL-encoded), <title>, and thumbnail path.
2496 def _real_extract(self, url):
2497 mobj = re.match(self._VALID_URL, url)
# (elided guard) raised when the URL does not match _VALID_URL
2499 raise ExtractorError(u'Invalid URL: %s' % url)
2500 video_id = mobj.group(1)
2502 webpage = self._download_webpage(url, video_id)
2504 self.report_extraction(video_id)
# flv_url is percent-encoded inside the page's flashvars.
2507 video_url = compat_urllib_parse.unquote(self._search_regex(r'flv_url=(.+?)&',
2508 webpage, u'video URL'))
# Title comes from the <title> tag, with the site suffix stripped by the regex.
2511 video_title = self._html_search_regex(r'<title>(.*?)\s+-\s+XVID',
2514 # Extract video thumbnail
2515 video_thumbnail = self._search_regex(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)',
2516 webpage, u'thumbnail', fatal=False)
# (elided) start of the returned info dict
2522 'upload_date': None,
2523 'title': video_title,
2525 'thumbnail': video_thumbnail,
2526 'description': None,
# NOTE(review): elided snippet — guard lines and the info-dict opener/return
# are missing from view.
2532 class SoundcloudIE(InfoExtractor):
2533 """Information extractor for soundcloud.com
2534 To access the media, the uid of the song and a stream token
2535 must be extracted from the page source and the script must make
2536 a request to media.soundcloud.com/crossdomain.xml. Then
2537 the media can be grabbed by requesting from an url composed
2538 of the stream token and uid
# Groups: (1) uploader slug, (2) track slug.
2541 _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
2542 IE_NAME = u'soundcloud'
# Progress helper for the resolve.json round-trip.
2544 def report_resolve(self, video_id):
2545 """Report information extraction."""
2546 self.to_screen(u'%s: Resolving id' % video_id)
# Resolve the page URL to a track id via the public API, then pick the
# 128 kbps MP3 stream from the streams endpoint.
2548 def _real_extract(self, url):
2549 mobj = re.match(self._VALID_URL, url)
# (elided guard) raised when the URL does not match _VALID_URL
2551 raise ExtractorError(u'Invalid URL: %s' % url)
2553 # extract uploader (which is in the url)
2554 uploader = mobj.group(1)
2555 # extract simple title (uploader + slug of song title)
2556 slug_title = mobj.group(2)
2557 simple_title = uploader + u'-' + slug_title
2558 full_title = '%s/%s' % (uploader, slug_title)
2560 self.report_resolve(full_title)
# client_id is a hard-coded public API key baked into the URL.
2562 url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
2563 resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2564 info_json = self._download_webpage(resolv_url, full_title, u'Downloading info JSON')
2566 info = json.loads(info_json)
2567 video_id = info['id']
2568 self.report_extraction(full_title)
2570 streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2571 stream_json = self._download_webpage(streams_url, full_title,
2572 u'Downloading stream definitions',
2573 u'unable to download stream definitions')
2575 streams = json.loads(stream_json)
# Always picks the 128 kbps MP3 variant.
2576 mediaURL = streams['http_mp3_128_url']
2577 upload_date = unified_strdate(info['created_at'])
# (elided) start of the returned info dict
2582 'uploader': info['user']['username'],
2583 'upload_date': upload_date,
2584 'title': info['title'],
2586 'description': info['description'],
# NOTE(review): elided snippet — guard lines, the per-track dict opener and the
# return of the accumulated list are missing from view.
2589 class SoundcloudSetIE(InfoExtractor):
2590 """Information extractor for soundcloud.com sets
2591 To access the media, the uid of the song and a stream token
2592 must be extracted from the page source and the script must make
2593 a request to media.soundcloud.com/crossdomain.xml. Then
2594 the media can be grabbed by requesting from an url composed
2595 of the stream token and uid
# Same as SoundcloudIE but for /sets/ playlist URLs.
2598 _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/sets/([\w\d-]+)'
2599 IE_NAME = u'soundcloud:set'
2601 def report_resolve(self, video_id):
2602 """Report information extraction."""
2603 self.to_screen(u'%s: Resolving id' % video_id)
# Resolve the set, then emit one info dict per contained track.
2605 def _real_extract(self, url):
2606 mobj = re.match(self._VALID_URL, url)
# (elided guard) raised when the URL does not match _VALID_URL
2608 raise ExtractorError(u'Invalid URL: %s' % url)
2610 # extract uploader (which is in the url)
2611 uploader = mobj.group(1)
2612 # extract simple title (uploader + slug of song title)
2613 slug_title = mobj.group(2)
2614 simple_title = uploader + u'-' + slug_title
2615 full_title = '%s/sets/%s' % (uploader, slug_title)
2617 self.report_resolve(full_title)
# Same hard-coded public client_id as SoundcloudIE.
2619 url = 'http://soundcloud.com/%s/sets/%s' % (uploader, slug_title)
2620 resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2621 info_json = self._download_webpage(resolv_url, full_title)
2624 info = json.loads(info_json)
# API-level errors are reported one by one, then extraction aborts (elided).
2625 if 'errors' in info:
2626 for err in info['errors']:
2627 self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err['error_message']))
2630 self.report_extraction(full_title)
2631 for track in info['tracks']:
2632 video_id = track['id']
2634 streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2635 stream_json = self._download_webpage(streams_url, video_id, u'Downloading track info JSON')
2637 self.report_extraction(video_id)
2638 streams = json.loads(stream_json)
# 128 kbps MP3 stream, as in the single-track extractor.
2639 mediaURL = streams['http_mp3_128_url']
# (elided) start of the per-track info dict appended to the result list
2644 'uploader': track['user']['username'],
2645 'upload_date': unified_strdate(track['created_at']),
2646 'title': track['title'],
2648 'description': track['description'],
# NOTE(review): elided snippet — the guard after re.match and the info-dict
# opener are missing from view.
2653 class InfoQIE(InfoExtractor):
2654 """Information extractor for infoq.com"""
2655 _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
# The media id is base64-encoded in a JS variable; decoding it yields the
# rtmpe path suffix.
2657 def _real_extract(self, url):
2658 mobj = re.match(self._VALID_URL, url)
# (elided guard) raised when the URL does not match _VALID_URL
2660 raise ExtractorError(u'Invalid URL: %s' % url)
# The whole URL doubles as the id for progress reporting.
2662 webpage = self._download_webpage(url, video_id=url)
2663 self.report_extraction(url)
2666 mobj = re.search(r"jsclassref ?= ?'([^']*)'", webpage)
# (elided guard)
2668 raise ExtractorError(u'Unable to extract video url')
2669 real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
2670 video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id
2673 video_title = self._search_regex(r'contentTitle = "(.*?)";',
2676 # Extract description
2677 video_description = self._html_search_regex(r'<meta name="description" content="(.*)"(?:\s*/)?>',
2678 webpage, u'description', fatal=False)
# The final path component gives both the id and the extension.
2680 video_filename = video_url.split('/')[-1]
2681 video_id, extension = video_filename.split('.')
# (elided) start of the returned info dict
2687 'upload_date': None,
2688 'title': video_title,
2689 'ext': extension, # Extension is always(?) mp4, but seems to be flv
2691 'description': video_description,
# NOTE(review): elided snippet — `try:` headers, loop bodies and several
# control-flow lines are missing from view. Marked _WORKING = False upstream.
2696 class MixcloudIE(InfoExtractor):
2697 """Information extractor for www.mixcloud.com"""
2699 _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
2700 _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
2701 IE_NAME = u'mixcloud'
# Progress helper for the JSON API request.
2703 def report_download_json(self, file_id):
2704 """Report JSON download."""
2705 self.to_screen(u'Downloading json')
# Return the URL list for a format, picking the best bitrate when the format
# entry is a bitrate->urls mapping; a plain list means no bitrate info.
2707 def get_urls(self, jsonData, fmt, bitrate='best'):
2708 """Get urls from 'audio_formats' section in json"""
2711 bitrate_list = jsonData[fmt]
2712 if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
# max() picks the lexicographically/numerically highest bitrate key.
2713 bitrate = max(bitrate_list) # select highest
2715 url_list = jsonData[fmt][bitrate]
2716 except TypeError: # we have no bitrate info.
2717 url_list = jsonData[fmt]
# (elided) return url_list
# Probe each candidate URL; return the first one that answers.
2720 def check_urls(self, url_list):
2721 """Returns 1st active url from list"""
2722 for url in url_list:
2724 compat_urllib_request.urlopen(url)
# (elided) return url on success
2726 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
# (elided) continue to next url; return None after the loop
# List all format/bitrate/extension combinations for --list-formats.
2731 def _print_formats(self, formats):
2732 print('Available formats:')
2733 for fmt in formats.keys():
2734 for b in formats[fmt]:
2736 ext = formats[fmt][b][0]
2737 print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
2738 except TypeError: # we have no bitrate info
2739 ext = formats[fmt][0]
2740 print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
# (elided) break out of the inner loop after the TypeError branch
2743 def _real_extract(self, url):
2744 mobj = re.match(self._VALID_URL, url)
# (elided guard) raised when the URL does not match _VALID_URL
2746 raise ExtractorError(u'Invalid URL: %s' % url)
2747 # extract uploader & filename from url
# NOTE(review): .decode('utf-8') on a str implies Python 2 here.
2748 uploader = mobj.group(1).decode('utf-8')
2749 file_id = uploader + "-" + mobj.group(2).decode('utf-8')
2751 # construct API request
2752 file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
2753 # retrieve .json file with links to files
2754 request = compat_urllib_request.Request(file_url)
2756 self.report_download_json(file_url)
2757 jsonData = compat_urllib_request.urlopen(request).read()
2758 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2759 raise ExtractorError(u'Unable to retrieve file: %s' % compat_str(err))
2762 json_data = json.loads(jsonData)
2763 player_url = json_data['player_swf_url']
2764 formats = dict(json_data['audio_formats'])
2766 req_format = self._downloader.params.get('format', None)
2769 if self._downloader.params.get('listformats', None):
2770 self._print_formats(formats)
# (elided) return after listing formats
# Default/best: take the first format whose URL list yields a live URL.
2773 if req_format is None or req_format == 'best':
2774 for format_param in formats.keys():
2775 url_list = self.get_urls(formats, format_param)
2777 file_url = self.check_urls(url_list)
2778 if file_url is not None:
# (elided) break; else branch for an explicit requested format below
2781 if req_format not in formats:
2782 raise ExtractorError(u'Format is not available')
2784 url_list = self.get_urls(formats, req_format)
2785 file_url = self.check_urls(url_list)
2786 format_param = req_format
# (elided) return [ — start of the single-element result list
2789 'id': file_id.decode('utf-8'),
2790 'url': file_url.decode('utf-8'),
2791 'uploader': uploader.decode('utf-8'),
2792 'upload_date': None,
2793 'title': json_data['name'],
2794 'ext': file_url.split('.')[-1].decode('utf-8'),
# Python 2 'and/or' conditional: u'NA' when no format was chosen.
2795 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
2796 'thumbnail': json_data['thumbnail_url'],
2797 'description': json_data['description'],
2798 'player_url': player_url.decode('utf-8'),
# NOTE(review): elided snippet — dict openers, loop headers and `try:` lines
# are missing from view.
2801 class StanfordOpenClassroomIE(InfoExtractor):
2802 """Information extractor for Stanford's Open ClassRoom"""
# Three URL shapes: a specific video (course+video), a course page (course
# only), and the site root (neither) — handled by the three branches below.
2804 _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
2805 IE_NAME = u'stanfordoc'
2807 def _real_extract(self, url):
2808 mobj = re.match(self._VALID_URL, url)
# (elided guard) raised when the URL does not match _VALID_URL
2810 raise ExtractorError(u'Invalid URL: %s' % url)
# Branch 1: a specific video — read its metadata XML.
2812 if mobj.group('course') and mobj.group('video'): # A specific video
2813 course = mobj.group('course')
2814 video = mobj.group('video')
# (elided) info dict opener
2816 'id': course + '_' + video,
2818 'upload_date': None,
2821 self.report_extraction(info['id'])
2822 baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
2823 xmlUrl = baseUrl + video + '.xml'
2825 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
2826 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2827 raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
2828 mdoc = xml.etree.ElementTree.fromstring(metaXml)
# IndexError from findall(...)[0] is converted below (handler header elided).
2830 info['title'] = mdoc.findall('./title')[0].text
2831 info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
2833 raise ExtractorError(u'Invalid metadata XML file')
2834 info['ext'] = info['url'].rpartition('.')[2]
# (elided) return [info]
# Branch 2: a course page — recurse into every VideoPage link.
2836 elif mobj.group('course'): # A course page
2837 course = mobj.group('course')
# (elided) playlist info dict opener with id=course
2842 'upload_date': None,
2845 coursepage = self._download_webpage(url, info['id'],
2846 note='Downloading course info page',
2847 errnote='Unable to download course info page')
2849 info['title'] = self._html_search_regex('<h1>([^<]+)</h1>', coursepage, 'title', default=info['id'])
2851 info['description'] = self._html_search_regex('<description>([^<]+)</description>',
2852 coursepage, u'description', fatal=False)
# orderedSet de-duplicates while preserving link order.
2854 links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
# (elided) info['list'] built from the links as reference entries
2857 'type': 'reference',
2858 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
# Recursively extract each referenced video page.
2862 for entry in info['list']:
2863 assert entry['type'] == 'reference'
2864 results += self.extract(entry['url'])
# (elided) return results; branch 3 (root page) follows
2868 'id': 'Stanford OpenClassroom',
2871 'upload_date': None,
2874 self.report_download_webpage(info['id'])
2875 rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
2877 rootpage = compat_urllib_request.urlopen(rootURL).read()
2878 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2879 raise ExtractorError(u'Unable to download course info page: ' + compat_str(err))
2881 info['title'] = info['id']
# Same reference/recursion pattern as branch 2, over CoursePage links.
2883 links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
2886 'type': 'reference',
2887 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
2892 for entry in info['list']:
2893 assert entry['type'] == 'reference'
2894 results += self.extract(entry['url'])
# NOTE(review): elided snippet — guard lines, a `try:` header and the info-dict
# opener/return are missing from view; `performer` is assigned off-screen.
2897 class MTVIE(InfoExtractor):
2898 """Information extractor for MTV.com"""
2900 _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
# Scrape mtv_* meta tags, then query the mediaGen endpoint for renditions.
2903 def _real_extract(self, url):
2904 mobj = re.match(self._VALID_URL, url)
# (elided guard) raised when the URL does not match _VALID_URL
2906 raise ExtractorError(u'Invalid URL: %s' % url)
# Protocol is optional in _VALID_URL; default to http.
2907 if not mobj.group('proto'):
2908 url = 'http://' + url
2909 video_id = mobj.group('videoid')
2911 webpage = self._download_webpage(url, video_id)
2913 song_name = self._html_search_regex(r'<meta name="mtv_vt" content="([^"]+)"/>',
2914 webpage, u'song name', fatal=False)
2916 video_title = self._html_search_regex(r'<meta name="mtv_an" content="([^"]+)"/>',
2919 mtvn_uri = self._html_search_regex(r'<meta name="mtvn_uri" content="([^"]+)"/>',
2920 webpage, u'mtvn_uri', fatal=False)
2922 content_id = self._search_regex(r'MTVN.Player.defaultPlaylistId = ([0-9]+);',
2923 webpage, u'content id', fatal=False)
2925 videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
2926 self.report_extraction(video_id)
2927 request = compat_urllib_request.Request(videogen_url)
2929 metadataXml = compat_urllib_request.urlopen(request).read()
2930 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2931 raise ExtractorError(u'Unable to download video metadata: %s' % compat_str(err))
2933 mdoc = xml.etree.ElementTree.fromstring(metadataXml)
2934 renditions = mdoc.findall('.//rendition')
2936 # For now, always pick the highest quality.
# Renditions are assumed ordered by quality; last is best.
2937 rendition = renditions[-1]
# type looks like 'video/mp4'; keep the subtype as the extension.
2940 _,_,ext = rendition.attrib['type'].partition('/')
2941 format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
2942 video_url = rendition.find('./src').text
# (elided) except handler header for missing rendition fields
2944 raise ExtractorError('Invalid rendition field.')
# (elided) start of the returned info dict; `performer` is set off-screen.
2949 'uploader': performer,
2950 'upload_date': None,
2951 'title': video_title,
# NOTE(review): elided snippet — method `def` lines for _gen_sid, loop headers,
# format-selection branches and accumulator initialisations are missing from view.
2959 class YoukuIE(InfoExtractor):
2960 _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'
# (elided) def _gen_sid(self): — session id = millisecond timestamp + 2 randoms
2963 nowTime = int(time.time() * 1000)
2964 random1 = random.randint(1000,1998)
2965 random2 = random.randint(1000,9999)
2967 return "%d%d%d" %(nowTime,random1,random2)
# Deterministic shuffle of the alphabet driven by `seed` (a linear
# congruential step); used to decode obfuscated file ids.
2969 def _get_file_ID_mix_string(self, seed):
2971 source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
2973 for i in range(len(source)):
2974 seed = (seed * 211 + 30031 ) % 65536
2975 index = math.floor(seed / 65536 * len(source) )
2976 mixed.append(source[int(index)])
2977 source.remove(source[int(index)])
2978 #return ''.join(mixed)
# (elided) return mixed
# Map each '*'-separated numeric chunk of fileId through the mixed alphabet.
2981 def _get_file_id(self, fileId, seed):
2982 mixed = self._get_file_ID_mix_string(seed)
2983 ids = fileId.split('*')
# (elided) realId = [] and loop header over ids as `ch`
2987 realId.append(mixed[int(ch)])
2988 return ''.join(realId)
2990 def _real_extract(self, url):
2991 mobj = re.match(self._VALID_URL, url)
# (elided guard) raised when the URL does not match _VALID_URL
2993 raise ExtractorError(u'Invalid URL: %s' % url)
2994 video_id = mobj.group('ID')
2996 info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id
2998 jsondata = self._download_webpage(info_url, video_id)
3000 self.report_extraction(video_id)
3002 config = json.loads(jsondata)
3004 video_title = config['data'][0]['title']
3005 seed = config['data'][0]['seed']
3007 format = self._downloader.params.get('format', None)
3008 supported_format = list(config['data'][0]['streamfileids'].keys())
# Format preference: hd2 when available for best (other branches elided).
3010 if format is None or format == 'best':
3011 if 'hd2' in supported_format:
3016 elif format == 'worst':
# Per-segment decryption keys come with the stream definition.
3024 fileid = config['data'][0]['streamfileids'][format]
3025 keys = [s['k'] for s in config['data'][0]['segs'][format]]
3026 except (UnicodeDecodeError, ValueError, KeyError):
3027 raise ExtractorError(u'Unable to extract info section')
3030 sid = self._gen_sid()
3031 fileid = self._get_file_id(fileid, seed)
3033 #column 8,9 of fileid represent the segment number
3034 #fileid[7:9] should be changed
# Build one download URL (and info dict) per segment key.
3035 for index, key in enumerate(keys):
3037 temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
3038 download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)
# (elided) info dict opener
3041 'id': '%s_part%02d' % (video_id, index),
3042 'url': download_url,
3044 'upload_date': None,
3045 'title': video_title,
3048 files_info.append(info)
# (elided) return files_info
class XNXXIE(InfoExtractor):
    """Information extractor for xnxx.com"""
    _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'

    # Page-scraping patterns: flash URL, <title>, and big-thumbnail query args.
    VIDEO_URL_RE = r'flv_url=(.*?)&'
    VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
    VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group(1)

        # Get webpage content
        webpage = self._download_webpage(url, video_id)

        # The flv URL is percent-encoded inside the page.
        video_url = self._search_regex(self.VIDEO_URL_RE,
            webpage, u'video URL')
        video_url = compat_urllib_parse.unquote(video_url)

        video_title = self._html_search_regex(self.VIDEO_TITLE_RE,
            webpage, u'title')

        video_thumbnail = self._search_regex(self.VIDEO_THUMB_RE,
            webpage, u'thumbnail', fatal=False)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }]
class GooglePlusIE(InfoExtractor):
    """Information extractor for plus.google.com."""

    _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
    IE_NAME = u'plus.google'

    def _real_extract(self, url):
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        post_url = mobj.group(0)
        video_id = mobj.group(1)

        video_extension = 'flv'

        # Step 1, Retrieve post webpage to extract further information
        webpage = self._download_webpage(post_url, video_id, u'Downloading entry webpage')

        self.report_extraction(video_id)

        # Extract update date
        upload_date = self._html_search_regex('title="Timestamp">(.*?)</a>',
            webpage, u'upload date', fatal=False)
        # FIX: the regex above is fatal=False and may return None; calling
        # strptime(None, ...) raised TypeError. Only reformat when present.
        if upload_date:
            # Convert timestring to a format suitable for filename
            upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
            upload_date = upload_date.strftime('%Y%m%d')

        # Extract uploader
        uploader = self._html_search_regex(r'rel\="author".*?>(.*?)</a>',
            webpage, u'uploader', fatal=False)

        # Extract title
        # Get the first line for title
        video_title = self._html_search_regex(r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]',
            webpage, 'title', default=u'NA')

        # Step 2, Stimulate clicking the image box to launch video
        video_page = self._search_regex('"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]',
            webpage, u'video page URL')
        webpage = self._download_webpage(video_page, video_id, u'Downloading video page')

        # Extract video links on video page
        """Extract video links of all sizes"""
        pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
        mobj = re.findall(pattern, webpage)
        if len(mobj) == 0:
            raise ExtractorError(u'Unable to extract video links')

        # Sort in resolution
        links = sorted(mobj)

        # Choose the lowest of the sort, i.e. highest resolution
        video_url = links[-1]
        # Only get the url. The resolution part in the tuple has no use anymore
        video_url = video_url[-1]
        # Treat escaped \u0026 style hex
        try:
            video_url = video_url.decode("unicode_escape")
        except AttributeError: # Python 3
            video_url = bytes(video_url, 'ascii').decode('unicode-escape')

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': uploader,
            'upload_date': upload_date,
            'title': video_title,
            'ext': video_extension,
        }]
class NBAIE(InfoExtractor):
    """Information extractor for nba.com video pages."""
    _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*?)(?:/index\.html)?(?:\?.*)?$'
    IE_NAME = u'nba'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group(1)

        webpage = self._download_webpage(url, video_id)

        # The CDN URL is derived directly from the path component of the page URL.
        video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'

        shortened_video_id = video_id.rpartition('/')[2]
        title = self._html_search_regex(r'<meta property="og:title" content="(.*?)"',
            webpage, 'title', default=shortened_video_id).replace('NBA.com: ', '')

        # It isn't there in the HTML it returns to us
        # uploader_date = self._html_search_regex(r'<b>Date:</b> (.*?)</div>', webpage, 'upload_date', fatal=False)

        description = self._html_search_regex(r'<meta name="description" (?:content|value)="(.*?)" />', webpage, 'description', fatal=False)

        info = {
            'id': shortened_video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            # 'uploader_date': uploader_date,
            'description': description,
        }
        return [info]
class JustinTVIE(InfoExtractor):
    """Information extractor for justin.tv and twitch.tv"""
    # TODO: One broadcast may be split into multiple videos. The key
    # 'broadcast_id' is the same for all parts, and 'broadcast_part'
    # starts at 1 and increases. Can we treat all parts as one video?

    _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
        (?:
            (?P<channelid>[^/]+)|
            (?:(?:[^/]+)/b/(?P<videoid>[^/]+))|
            (?:(?:[^/]+)/c/(?P<chapterid>[^/]+))
        )
        /?(?:\#.*)?$
        """
    _JUSTIN_PAGE_LIMIT = 100
    IE_NAME = u'justin.tv'

    def report_download_page(self, channel, offset):
        """Report attempt to download a single page of videos."""
        self.to_screen(u'%s: Downloading video information from %d to %d' %
                (channel, offset, offset + self._JUSTIN_PAGE_LIMIT))

    # Return count of items, list of *valid* items
    def _parse_page(self, url, video_id):
        webpage = self._download_webpage(url, video_id,
                                         u'Downloading video info JSON',
                                         u'unable to download video info JSON')

        response = json.loads(webpage)
        if type(response) != list:
            # API signals errors with a dict instead of a list of clips.
            error_text = response.get('error', 'unknown error')
            raise ExtractorError(u'Justin.tv API: %s' % error_text)
        info = []
        for clip in response:
            video_url = clip['video_file_url']
            if video_url:
                video_extension = os.path.splitext(video_url)[1][1:]
                # start_time is ISO-ish; keep only YYYYMMDD.
                video_date = re.sub('-', '', clip['start_time'][:10])
                video_uploader_id = clip.get('user_id', clip.get('channel_id'))
                video_id = clip['id']
                video_title = clip.get('title', video_id)
                info.append({
                    'id': video_id,
                    'url': video_url,
                    'title': video_title,
                    'uploader': clip.get('channel_name', video_uploader_id),
                    'uploader_id': video_uploader_id,
                    'upload_date': video_date,
                    'ext': video_extension,
                })
        return (len(response), info)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'invalid URL: %s' % url)

        api_base = 'http://api.justin.tv'
        paged = False
        if mobj.group('channelid'):
            # Whole channel: page through the archive API.
            paged = True
            video_id = mobj.group('channelid')
            api = api_base + '/channel/archives/%s.json' % video_id
        elif mobj.group('chapterid'):
            chapter_id = mobj.group('chapterid')

            webpage = self._download_webpage(url, chapter_id)
            m = re.search(r'PP\.archive_id = "([0-9]+)";', webpage)
            if not m:
                raise ExtractorError(u'Cannot find archive of a chapter')
            archive_id = m.group(1)

            api = api_base + '/broadcast/by_chapter/%s.xml' % chapter_id
            chapter_info_xml = self._download_webpage(api, chapter_id,
                                             note=u'Downloading chapter information',
                                             errnote=u'Chapter information download failed')
            doc = xml.etree.ElementTree.fromstring(chapter_info_xml)
            for a in doc.findall('.//archive'):
                if archive_id == a.find('./id').text:
                    break
            else:
                raise ExtractorError(u'Could not find chapter in chapter information')

            video_url = a.find('./video_file_url').text
            video_ext = video_url.rpartition('.')[2] or u'flv'

            chapter_api_url = u'https://api.twitch.tv/kraken/videos/c' + chapter_id
            chapter_info_json = self._download_webpage(chapter_api_url, u'c' + chapter_id,
                                   note='Downloading chapter metadata',
                                   errnote='Download of chapter metadata failed')
            chapter_info = json.loads(chapter_info_json)

            bracket_start = int(doc.find('.//bracket_start').text)
            bracket_end = int(doc.find('.//bracket_end').text)

            # TODO determine start (and probably fix up file)
            #  youtube-dl -v http://www.twitch.tv/firmbelief/c/1757457
            #video_url += u'?start=' + TODO:start_timestamp
            # bracket_start is 13290, but we want 51670615
            self._downloader.report_warning(u'Chapter detected, but we can just download the whole file. '
                                            u'Chapter starts at %s and ends at %s' % (formatSeconds(bracket_start), formatSeconds(bracket_end)))

            info = {
                'id': u'c' + chapter_id,
                'url': video_url,
                'ext': video_ext,
                'title': chapter_info['title'],
                'thumbnail': chapter_info['preview'],
                'description': chapter_info['description'],
                'uploader': chapter_info['channel']['display_name'],
                'uploader_id': chapter_info['channel']['name'],
            }
            return [info]
        else:
            video_id = mobj.group('videoid')
            api = api_base + '/broadcast/by_archive/%s.json' % video_id

        self.report_extraction(video_id)

        info = []
        offset = 0
        limit = self._JUSTIN_PAGE_LIMIT
        while True:
            if paged:
                self.report_download_page(video_id, offset)
            page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
            page_count, page_info = self._parse_page(page_url, video_id)
            info.extend(page_info)
            # A short page means we reached the end of the archive.
            if not paged or page_count != limit:
                break
            offset += limit
        return info
class FunnyOrDieIE(InfoExtractor):
    """Information extractor for funnyordie.com."""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'invalid URL: %s' % url)

        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)

        video_url = self._html_search_regex(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"',
            webpage, u'video URL', flags=re.DOTALL)

        # Prefer the player headline; fall back to the page <title>.
        title = self._html_search_regex((r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>",
            r'<title>(?P<title>[^<]+?)</title>'), webpage, 'title', flags=re.DOTALL)

        video_description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
            webpage, u'description', fatal=False, flags=re.DOTALL)

        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'description': video_description,
        }
        return [info]
class SteamIE(InfoExtractor):
    """Information extractor for store.steampowered.com video pages."""
    _VALID_URL = r"""http://store\.steampowered\.com/
                (agecheck/)?
                (?P<urltype>video|app)/ #If the page is only for videos or for a game
                (?P<gameID>\d+)/?
                (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
                """

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is written with (?x)-style whitespace, so VERBOSE is needed.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url, re.VERBOSE)
        gameID = m.group('gameID')
        # Go through the age gate with a fixed (very old) birth date.
        videourl = 'http://store.steampowered.com/agecheck/video/%s/?snr=1_agecheck_agecheck__age-gate&ageDay=1&ageMonth=January&ageYear=1970' % gameID
        self.report_age_confirmation()
        webpage = self._download_webpage(videourl, gameID)
        game_title = re.search(r'<h2 class="pageheader">(?P<game_title>.*?)</h2>', webpage).group('game_title')

        urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
        mweb = re.finditer(urlRE, webpage)
        namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
        titles = re.finditer(namesRE, webpage)
        thumbsRE = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'
        thumbs = re.finditer(thumbsRE, webpage)
        videos = []
        # The three iterators run in page order; zip pairs each movie with
        # its title and thumbnail.
        for vid, vtitle, thumb in zip(mweb, titles, thumbs):
            video_id = vid.group('videoID')
            title = vtitle.group('videoName')
            video_url = vid.group('videoURL')
            video_thumb = thumb.group('thumbnail')
            if not video_url:
                raise ExtractorError(u'Cannot find video url for %s' % video_id)
            info = {
                'id': video_id,
                'url': video_url,
                'ext': 'flv',
                'title': unescapeHTML(title),
                'thumbnail': video_thumb
            }
            videos.append(info)
        return [self.playlist_result(videos, gameID, game_title)]
class UstreamIE(InfoExtractor):
    """Information extractor for ustream.tv recorded videos."""
    _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
    IE_NAME = u'ustream'

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url)
        video_id = m.group('videoID')

        # The flv lives on a fixed CDN path derived from the id.
        video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
        webpage = self._download_webpage(url, video_id)

        self.report_extraction(video_id)

        video_title = self._html_search_regex(r'data-title="(?P<title>.+)"',
            webpage, u'title')

        uploader = self._html_search_regex(r'data-content-type="channel".*?>(?P<uploader>.*?)</a>',
            webpage, u'uploader', fatal=False, flags=re.DOTALL)

        thumbnail = self._html_search_regex(r'<link rel="image_src" href="(?P<thumb>.*?)"',
            webpage, u'thumbnail', fatal=False)

        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'flv',
            'title': video_title,
            'uploader': uploader,
            'thumbnail': thumbnail,
        }
        return info
class WorldStarHipHopIE(InfoExtractor):
    """Information extractor for worldstarhiphop.com / worldstarcandy.com."""
    _VALID_URL = r'https?://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
    IE_NAME = u'WorldStarHipHop'

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url)
        video_id = m.group('id')

        webpage_src = self._download_webpage(url, video_id)

        video_url = self._search_regex(r'so\.addVariable\("file","(.*?)"\)',
            webpage_src, u'video URL')

        # Extension is inferred from the stream URL itself.
        if 'mp4' in video_url:
            ext = 'mp4'
        else:
            ext = 'flv'

        video_title = self._html_search_regex(r"<title>(.*)</title>",
            webpage_src, u'title')

        # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video.
        thumbnail = self._html_search_regex(r'rel="image_src" href="(.*)" />',
            webpage_src, u'thumbnail', fatal=False)

        if not thumbnail:
            _title = r"""candytitles.*>(.*)</span>"""
            mobj = re.search(_title, webpage_src)
            if mobj is not None:
                video_title = mobj.group(1)

        results = [{
            'id': video_id,
            'url' : video_url,
            'title' : video_title,
            'thumbnail' : thumbnail,
            'ext' : ext,
        }]
        return results
class RBMARadioIE(InfoExtractor):
    """Information extractor for rbmaradio.com shows."""
    _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url)
        video_id = m.group('videoID')

        webpage = self._download_webpage(url, video_id)

        # The show metadata is embedded as a JSON assignment in an inline script.
        json_data = self._search_regex(r'<script>window.gon = {.*?};gon\.show=(.+?);</script>',
            webpage, u'json data')

        try:
            data = json.loads(json_data)
        except ValueError as e:
            raise ExtractorError(u'Invalid JSON: ' + str(e))

        # Request the 256 kbit/s rendition from the CDN.
        video_url = data['akamai_url'] + '&cbr=256'
        url_parts = compat_urllib_parse_urlparse(video_url)
        video_ext = url_parts.path.rpartition('.')[2]
        info = {
            'id': video_id,
            'url': video_url,
            'ext': video_ext,
            'title': data['title'],
            'description': data.get('teaser_text'),
            'location': data.get('country_of_origin'),
            'uploader': data.get('host', {}).get('name'),
            'uploader_id': data.get('host', {}).get('slug'),
            'thumbnail': data.get('image', {}).get('large_url_2x'),
            'duration': data.get('duration'),
        }
        return [info]
class YouPornIE(InfoExtractor):
    """Information extractor for youporn.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'

    def _print_formats(self, formats):
        """Print all available formats"""
        print(u'Available formats:')
        print(u'ext\t\tformat')
        print(u'---------------------------------')
        for format in formats:
            print(u'%s\t\t%s' % (format['ext'], format['format']))

    def _specific(self, req_format, formats):
        """Return the format dict matching req_format, or None."""
        for x in formats:
            if(x["format"]==req_format):
                return x
        return None

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('videoid')

        # The site requires an age-verified cookie to serve the page.
        req = compat_urllib_request.Request(url)
        req.add_header('Cookie', 'age_verified=1')
        webpage = self._download_webpage(req, video_id)

        # Get JSON parameters
        json_params = self._search_regex(r'var currentVideo = new Video\((.*)\);', webpage, u'JSON parameters')
        try:
            params = json.loads(json_params)
        except ValueError:
            raise ExtractorError(u'Invalid JSON')

        self.report_extraction(video_id)
        try:
            video_title = params['title']
            upload_date = unified_strdate(params['release_date_f'])
            video_description = params['description']
            video_uploader = params['submitted_by']
            thumbnail = params['thumbnails'][0]['image']
        except KeyError:
            # FIX: the original concatenated a str with the exception object
            # ('...' + sys.exc_info()[1]), which raised TypeError instead of
            # reporting the missing key. Format it as a string instead.
            raise ExtractorError('Missing JSON parameter: %s' % sys.exc_info()[1])

        # Get all of the formats available
        DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
        download_list_html = self._search_regex(DOWNLOAD_LIST_RE,
            webpage, u'download list').strip()

        # Get all of the links from the page
        LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
        links = re.findall(LINK_RE, download_list_html)
        if(len(links) == 0):
            raise ExtractorError(u'ERROR: no known formats available for video')

        self.to_screen(u'Links found: %d' % len(links))

        formats = []
        for link in links:

            # A link looks like this:
            # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
            # A path looks like this:
            # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
            video_url = unescapeHTML( link )
            path = compat_urllib_parse_urlparse( video_url ).path
            extension = os.path.splitext( path )[1][1:]
            # Path segment 4 encodes "<size>_<bitrate>_<id>"; keep size+bitrate.
            format = path.split('/')[4].split('_')[:2]
            size = format[0]
            bitrate = format[1]
            format = "-".join( format )
            # title = u'%s-%s-%s' % (video_title, size, bitrate)

            formats.append({
                'id': video_id,
                'url': video_url,
                'uploader': video_uploader,
                'upload_date': upload_date,
                'title': video_title,
                'ext': extension,
                'format': format,
                'thumbnail': thumbnail,
                'description': video_description
            })

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        req_format = self._downloader.params.get('format', None)
        self.to_screen(u'Format: %s' % req_format)

        if req_format is None or req_format == 'best':
            return [formats[0]]
        elif req_format == 'worst':
            return [formats[-1]]
        elif req_format in ('-1', 'all'):
            return formats
        else:
            format = self._specific( req_format, formats )
            # FIX: guard the variable we actually assigned, so an unknown
            # format raises the intended error rather than a NameError.
            if format is None:
                raise ExtractorError(u'Requested format not available')
            return [format]
class PornotubeIE(InfoExtractor):
    """Information extractor for pornotube.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # Title comes straight from the URL path; no page parsing needed for it.
        video_id = mobj.group('videoid')
        video_title = mobj.group('title')

        # Get webpage content
        webpage = self._download_webpage(url, video_id)

        # Get the video URL
        VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
        video_url = self._search_regex(VIDEO_URL_RE, webpage, u'video url')
        video_url = compat_urllib_parse.unquote(video_url)

        #Get the uploaded date
        VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
        upload_date = self._html_search_regex(VIDEO_UPLOADED_RE, webpage, u'upload date', fatal=False)
        if upload_date: upload_date = unified_strdate(upload_date)

        info = {'id': video_id,
                'url': video_url,
                'uploader': None,
                'upload_date': upload_date,
                'title': video_title,
                'ext': 'flv',
                'format': 'flv'}

        return [info]
class YouJizzIE(InfoExtractor):
    """Information extractor for youjizz.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('videoid')

        # Get webpage content
        webpage = self._download_webpage(url, video_id)

        # Get the video title
        video_title = self._html_search_regex(r'<title>(?P<title>.*)</title>',
            webpage, u'title').strip()

        # Get the embed page; the actual stream URL only appears there.
        result = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
        if result is None:
            raise ExtractorError(u'ERROR: unable to extract embed page')

        embed_page_url = result.group(0).strip()
        video_id = result.group('videoid')

        webpage = self._download_webpage(embed_page_url, video_id)

        # Get the video URL
        video_url = self._search_regex(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);',
            webpage, u'video URL')

        info = {'id': video_id,
                'url': video_url,
                'title': video_title,
                'ext': 'flv',
                'format': 'flv',
                'player_url': embed_page_url}

        return [info]
class EightTracksIE(InfoExtractor):
    """Information extractor for 8tracks.com mixes."""
    IE_NAME = '8tracks'
    _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        playlist_id = mobj.group('id')

        webpage = self._download_webpage(url, playlist_id)

        json_like = self._search_regex(r"PAGE.mix = (.*?);\n", webpage, u'trax information', flags=re.DOTALL)
        data = json.loads(json_like)

        # The play API hands out one track per request; a random session id
        # keeps the server-side playback state isolated.
        session = str(random.randint(0, 1000000000))
        mix_id = data['id']
        track_count = data['tracks_count']
        first_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
        next_url = first_url
        res = []
        for i in itertools.count():
            api_json = self._download_webpage(next_url, playlist_id,
                note=u'Downloading song information %s/%s' % (str(i+1), track_count),
                errnote=u'Failed to download song information')
            api_data = json.loads(api_json)
            track_data = api_data[u'set']['track']
            info = {
                'id': track_data['id'],
                'url': track_data['track_file_stream_url'],
                'title': track_data['performer'] + u' - ' + track_data['name'],
                'raw_title': track_data['name'],
                'uploader_id': data['user']['login'],
                'ext': 'm4a',
            }
            res.append(info)
            # Stop once the API says this was the final track of the mix.
            if api_data['set']['at_last_track']:
                break
            next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
        return res
class KeekIE(InfoExtractor):
    """Information extractor for keek.com."""
    _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
    IE_NAME = u'keek'

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url)
        video_id = m.group('videoID')

        # Both video and thumbnail live on fixed CDN paths derived from the id.
        video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
        thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id
        webpage = self._download_webpage(url, video_id)

        video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
            webpage, u'title')

        uploader = self._html_search_regex(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>',
            webpage, u'uploader', fatal=False)

        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': video_title,
            'thumbnail': thumbnail,
            'uploader': uploader
        }
        return [info]
class TEDIE(InfoExtractor):
    """Information extractor for ted.com talks and playlists."""
    _VALID_URL=r'''http://www\.ted\.com/
                   (
                        ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
                        |
                        ((?P<type_talk>talks)) # We have a simple talk
                   )
                   (/lang/(.*?))? # The url may contain the language
                   /(?P<name>\w+) # Here goes the name and then ".html"
                   '''

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL uses verbose-mode whitespace, so VERBOSE is required.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        m=re.match(self._VALID_URL, url, re.VERBOSE)
        if m.group('type_talk'):
            return [self._talk_info(url)]
        else:
            playlist_id=m.group('playlist_id')
            name=m.group('name')
            self.to_screen(u'Getting info of playlist %s: "%s"' % (playlist_id,name))
            return [self._playlist_videos_info(url,name,playlist_id)]

    def _talk_video_link(self,mediaSlug):
        '''Returns the video link for that mediaSlug'''
        return 'http://download.ted.com/talks/%s.mp4' % mediaSlug

    def _playlist_videos_info(self,url,name,playlist_id=0):
        '''Returns the videos of the playlist'''
        video_RE=r'''
                     <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
                     ([.\s]*?)data-playlist_item_id="(\d+)"
                     ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
                     '''
        video_name_RE=r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
        webpage=self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
        m_videos=re.finditer(video_RE,webpage,re.VERBOSE)
        m_names=re.finditer(video_name_RE,webpage)

        playlist_RE = r'div class="headline">(\s*?)<h1>(\s*?)<span>(?P<playlist_title>.*?)</span>'
        m_playlist = re.search(playlist_RE, webpage)
        playlist_title = m_playlist.group('playlist_title')

        # Each playlist entry is delegated back to this extractor via url_result.
        playlist_entries = []
        for m_video, m_name in zip(m_videos,m_names):
            video_id=m_video.group('video_id')
            talk_url='http://www.ted.com%s' % m_name.group('talk_url')
            playlist_entries.append(self.url_result(talk_url, 'TED'))
        return self.playlist_result(playlist_entries, playlist_id = playlist_id, playlist_title = playlist_title)

    def _talk_info(self, url, video_id=0):
        """Return the video for the talk in the url"""
        m=re.match(self._VALID_URL, url,re.VERBOSE)
        videoName=m.group('name')
        webpage=self._download_webpage(url, video_id, 'Downloading \"%s\" page' % videoName)
        # If the url includes the language we get the title translated
        title_RE=r'<span id="altHeadline" >(?P<title>.*)</span>'
        title=re.search(title_RE, webpage).group('title')
        info_RE=r'''<script\ type="text/javascript">var\ talkDetails\ =(.*?)
                        "id":(?P<videoID>[\d]+).*?
                        "mediaSlug":"(?P<mediaSlug>[\w\d]+?)"'''
        thumb_RE=r'</span>[\s.]*</div>[\s.]*<img src="(?P<thumbnail>.*?)"'
        thumb_match=re.search(thumb_RE,webpage)
        info_match=re.search(info_RE,webpage,re.VERBOSE)
        video_id=info_match.group('videoID')
        mediaSlug=info_match.group('mediaSlug')
        video_url=self._talk_video_link(mediaSlug)
        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'thumbnail': thumb_match.group('thumbnail')
        }
        return info
class MySpassIE(InfoExtractor):
    """Information extractor for myspass.de (metadata via XML endpoint)."""
    _VALID_URL = r'http://www.myspass.de/.*'

    def _real_extract(self, url):
        META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'

        # video id is the last path element of the URL
        # usually there is a trailing slash, so also try the second but last
        url_path = compat_urllib_parse_urlparse(url).path
        url_parent_path, video_id = os.path.split(url_path)
        if not video_id:
            _, video_id = os.path.split(url_parent_path)

        # get metadata
        metadata_url = META_DATA_URL_TEMPLATE % video_id
        metadata_text = self._download_webpage(metadata_url, video_id)
        metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))

        # extract values from metadata
        url_flv_el = metadata.find('url_flv')
        if url_flv_el is None:
            raise ExtractorError(u'Unable to extract download url')
        video_url = url_flv_el.text
        extension = os.path.splitext(video_url)[1][1:]
        title_el = metadata.find('title')
        if title_el is None:
            raise ExtractorError(u'Unable to extract title')
        title = title_el.text
        format_id_el = metadata.find('format_id')
        if format_id_el is None:
            format = 'mp4'
        else:
            format = format_id_el.text
        description_el = metadata.find('description')
        if description_el is not None:
            description = description_el.text
        else:
            description = None
        imagePreview_el = metadata.find('imagePreview')
        if imagePreview_el is not None:
            thumbnail = imagePreview_el.text
        else:
            thumbnail = None
        info = {
            'id': video_id,
            'url': video_url,
            'title': title,
            'ext': extension,
            'format': format,
            'thumbnail': thumbnail,
            'description': description
        }
        return [info]
class SpiegelIE(InfoExtractor):
    """Information extractor for spiegel.de videos."""
    _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url)
        video_id = m.group('videoID')

        webpage = self._download_webpage(url, video_id)

        video_title = self._html_search_regex(r'<div class="module-title">(.*?)</div>',
            webpage, u'title')

        # Per-video XML manifest lists the available renditions.
        xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
        xml_code = self._download_webpage(xml_url, video_id,
                    note=u'Downloading XML', errnote=u'Failed to download XML')

        idoc = xml.etree.ElementTree.fromstring(xml_code)
        # The last child of the manifest is the highest-quality rendition.
        last_type = idoc[-1]
        filename = last_type.findall('./filename')[0].text
        duration = float(last_type.findall('./duration')[0].text)

        video_url = 'http://video2.spiegel.de/flash/' + filename
        video_ext = filename.rpartition('.')[2]
        info = {
            'id': video_id,
            'url': video_url,
            'ext': video_ext,
            'title': video_title,
            'duration': duration,
        }
        return [info]
class LiveLeakIE(InfoExtractor):
    """Information extractor for liveleak.com."""

    _VALID_URL = r'^(?:http?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
    IE_NAME = u'liveleak'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('video_id')

        webpage = self._download_webpage(url, video_id)

        video_url = self._search_regex(r'file: "(.*?)",',
            webpage, u'video URL')

        # Strip the site branding from the og:title.
        video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
            webpage, u'title').replace('LiveLeak.com -', '').strip()

        video_description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
            webpage, u'description', fatal=False)

        video_uploader = self._html_search_regex(r'By:.*?(\w+)</a>',
            webpage, u'uploader', fatal=False)

        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': video_title,
            'description': video_description,
            'uploader': video_uploader
        }

        return [info]
class ARDIE(InfoExtractor):
    """Information extractor for the ARD Mediathek."""
    _VALID_URL = r'^(?:https?://)?(?:(?:www\.)?ardmediathek\.de|mediathek\.daserste\.de)/(?:.*/)(?P<video_id>[^/\?]+)(?:\?.*)?'
    _TITLE = r'<h1(?: class="boxTopHeadline")?>(?P<title>.*)</h1>'
    _MEDIA_STREAM = r'mediaCollection\.addMediaStream\((?P<media_type>\d+), (?P<quality>\d+), "(?P<rtmp_url>[^"]*)", "(?P<video_url>[^"]*)", "[^"]*"\)'

    def _real_extract(self, url):
        # determine video id from url
        m = re.match(self._VALID_URL, url)

        # Prefer the numeric documentId query parameter when present.
        numid = re.search(r'documentId=([0-9]+)', url)
        if numid:
            video_id = numid.group(1)
        else:
            video_id = m.group('video_id')

        # determine title and media streams from webpage
        html = self._download_webpage(url, video_id)
        title = re.search(self._TITLE, html).group('title')
        streams = [m.groupdict() for m in re.finditer(self._MEDIA_STREAM, html)]
        if not streams:
            # No streams usually means the video is age-restricted (FSK).
            assert '"fsk"' in html
            raise ExtractorError(u'This video is only available after 8:00 pm')

        # choose default media type and highest quality for now
        stream = max([s for s in streams if int(s["media_type"]) == 0],
                     key=lambda s: int(s["quality"]))

        # there's two possibilities: RTMP stream or HTTP download
        info = {'id': video_id, 'title': title, 'ext': 'mp4'}
        if stream['rtmp_url']:
            self.to_screen(u'RTMP download detected')
            assert stream['video_url'].startswith('mp4:')
            info["url"] = stream["rtmp_url"]
            info["play_path"] = stream['video_url']
        else:
            assert stream["video_url"].endswith('.mp4')
            info["url"] = stream["video_url"]
        return [info]
class TumblrIE(InfoExtractor):
    """Information extractor for tumblr.com video posts."""
    _VALID_URL = r'http://(?P<blog_name>.*?)\.tumblr\.com/((post)|(video))/(?P<id>\d*)/(.*?)'

    def _real_extract(self, url):
        m_url = re.match(self._VALID_URL, url)
        video_id = m_url.group('id')
        blog = m_url.group('blog_name')

        # Normalize to the canonical post URL before fetching.
        url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id)
        webpage = self._download_webpage(url, video_id)

        # The video markup is embedded with \x22-escaped quotes.
        re_video = r'src=\\x22(?P<video_url>http://%s\.tumblr\.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P<ext>.*?)\\x22' % (blog, video_id)
        video = re.search(re_video, webpage)
        if video is None:
            raise ExtractorError(u'Unable to extract video')
        video_url = video.group('video_url')
        ext = video.group('ext')

        video_thumbnail = self._search_regex(r'posters(.*?)\[\\x22(?P<thumb>.*?)\\x22',
            webpage, u'thumbnail', fatal=False)  # We pick the first poster
        if video_thumbnail: video_thumbnail = video_thumbnail.replace('\\', '')

        # The only place where you can get a title, it's not complete,
        # but searching in other places doesn't work for all videos
        video_title = self._html_search_regex(r'<title>(?P<title>.*?)</title>',
            webpage, u'title', flags=re.DOTALL)

        return [{'id': video_id,
                 'url': video_url,
                 'title': video_title,
                 'thumbnail': video_thumbnail,
                 'ext': ext
                 }]
class BandcampIE(InfoExtractor):
    """Information Extractor for freely downloadable Bandcamp tracks."""
    _VALID_URL = r'http://.*?\.bandcamp\.com/track/(?P<title>.*)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        title = mobj.group('title')
        webpage = self._download_webpage(url, title)
        # We get the link to the free download page
        m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage)
        if m_download is None:
            raise ExtractorError(u'No free songs found')

        download_link = m_download.group(1)
        # Track id lives inside the TralbumData javascript object.
        # (renamed from `id`, which shadowed the builtin)
        track_id = re.search(r'var TralbumData = {(.*?)id: (?P<id>\d*?)$',
                             webpage, re.MULTILINE | re.DOTALL).group('id')

        download_webpage = self._download_webpage(download_link, track_id,
                                                  'Downloading free downloads page')
        # We get the dictionary of the track from some javascript code
        info = re.search(r'items: (.*?),$',
                         download_webpage, re.MULTILINE).group(1)
        info = json.loads(info)[0]
        # We pick mp3-320 for now, until format selection can be easily implemented.
        mp3_info = info[u'downloads'][u'mp3-320']
        # If we try to use this url it says the link has expired
        initial_url = mp3_info[u'url']
        re_url = r'(?P<server>http://(.*?)\.bandcamp\.com)/download/track\?enc=mp3-320&fsig=(?P<fsig>.*?)&id=(?P<id>.*?)&ts=(?P<ts>.*)$'
        m_url = re.match(re_url, initial_url)
        # We build the url we will use to get the final track url
        # This url is built by Bandcamp in the script download_bunde_*.js
        request_url = '%s/statdownload/track?enc=mp3-320&fsig=%s&id=%s&ts=%s&.rand=665028774616&.vrs=1' % (m_url.group('server'), m_url.group('fsig'), track_id, m_url.group('ts'))
        final_url_webpage = self._download_webpage(request_url, track_id, 'Requesting download url')
        # If we could correctly generate the .rand field the url would be
        # in the "download_url" key
        final_url = re.search(r'"retry_url":"(.*?)"', final_url_webpage).group(1)

        track_info = {'id': track_id,
                      'title': info[u'title'],
                      'ext': 'mp3',
                      'url': final_url,
                      'thumbnail': info[u'thumb_url'],
                      'uploader': info[u'artist']
                      }

        return [track_info]
class RedTubeIE(InfoExtractor):
    """Information Extractor for redtube"""
    _VALID_URL = r'(?:http://)?(?:www\.)?redtube\.com/(?P<id>[0-9]+)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')
        video_extension = 'mp4'
        webpage = self._download_webpage(url, video_id)

        self.report_extraction(video_id)

        # Direct file URL from the HTML5 <source> element.
        video_url = self._html_search_regex(r'<source src="(.+?)" type="video/mp4">',
            webpage, u'video URL')

        video_title = self._html_search_regex('<h1 class="videoTitle slidePanelMovable">(.+?)</h1>',
            webpage, u'title')

        return [{
            'id':       video_id,
            'url':      video_url,
            'ext':      video_extension,
            'title':    video_title,
        }]
class InaIE(InfoExtractor):
    """Information Extractor for Ina.fr"""
    _VALID_URL = r'(?:http://)?(?:www\.)?ina\.fr/video/(?P<id>I[0-9]+)/.*'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)

        video_id = mobj.group('id')
        # Metadata (including the mp4 URL) is published as an MRSS feed.
        mrss_url = 'http://player.ina.fr/notices/%s.mrss' % video_id
        video_extension = 'mp4'
        webpage = self._download_webpage(mrss_url, video_id)

        self.report_extraction(video_id)

        video_url = self._html_search_regex(r'<media:player url="(?P<mp4url>http://mp4.ina.fr/[^"]+\.mp4)',
            webpage, u'video URL')

        # The title sits in a CDATA section of the feed.
        video_title = self._search_regex(r'<title><!\[CDATA\[(?P<titre>.*?)]]></title>',
            webpage, u'title')

        return [{
            'id':       video_id,
            'url':      video_url,
            'ext':      video_extension,
            'title':    video_title,
        }]
class HowcastIE(InfoExtractor):
    """Information Extractor for Howcast.com"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?howcast\.com/videos/(?P<id>\d+)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)

        video_id = mobj.group('id')
        webpage_url = 'http://www.howcast.com/videos/' + video_id
        webpage = self._download_webpage(webpage_url, video_id)

        self.report_extraction(video_id)

        # The mobile mp4 URL appears in a javascript 'file:' assignment.
        video_url = self._search_regex(r'\'?file\'?: "(http://mobile-media\.howcast\.com/[0-9]+\.mp4)',
            webpage, u'video URL')

        video_title = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') property=\'og:title\'',
            webpage, u'title')

        video_description = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') name=\'description\'',
            webpage, u'description', fatal=False)

        thumbnail = self._html_search_regex(r'<meta content=\'(.+?)\' property=\'og:image\'',
            webpage, u'thumbnail', fatal=False)

        return [{
            'id':          video_id,
            'url':         video_url,
            'ext':         'mp4',
            'title':       video_title,
            'description': video_description,
            'thumbnail':   thumbnail,
        }]
class VineIE(InfoExtractor):
    """Information Extractor for Vine.co"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?vine\.co/v/(?P<id>\w+)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)

        video_id = mobj.group('id')
        webpage_url = 'https://vine.co/v/' + video_id
        webpage = self._download_webpage(webpage_url, video_id)

        self.report_extraction(video_id)

        # The stream URL is exposed through a Twitter player card meta tag.
        video_url = self._html_search_regex(r'<meta property="twitter:player:stream" content="(.+?)"',
            webpage, u'video URL')

        video_title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',
            webpage, u'title')

        thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)(\?.*?)?"',
            webpage, u'thumbnail', fatal=False)

        uploader = self._html_search_regex(r'<div class="user">.*?<h2>(.+?)</h2>',
            webpage, u'uploader', fatal=False, flags=re.DOTALL)

        return [{
            'id':        video_id,
            'url':       video_url,
            'ext':       'mp4',
            'title':     video_title,
            'thumbnail': thumbnail,
            'uploader':  uploader,
        }]
class FlickrIE(InfoExtractor):
    """Information Extractor for Flickr videos"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?flickr\.com/photos/(?P<uploader_id>[\w\-_@]+)/(?P<id>\d+).*'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)

        video_id = mobj.group('id')
        video_uploader_id = mobj.group('uploader_id')
        webpage_url = 'http://www.flickr.com/photos/' + video_uploader_id + '/' + video_id
        webpage = self._download_webpage(webpage_url, video_id)

        # The per-photo secret is needed to query the video XML endpoints.
        secret = self._search_regex(r"photo_secret: '(\w+)'", webpage, u'secret')

        first_url = 'https://secure.flickr.com/apps/video/video_mtl_xml.gne?v=x&photo_id=' + video_id + '&secret=' + secret + '&bitrate=700&target=_self'
        first_xml = self._download_webpage(first_url, video_id, 'Downloading first data webpage')

        node_id = self._html_search_regex(r'<Item id="id">(\d+-\d+)</Item>',
            first_xml, u'node_id')

        second_url = 'https://secure.flickr.com/video_playlist.gne?node_id=' + node_id + '&tech=flash&mode=playlist&bitrate=700&secret=' + secret + '&rd=video.yahoo.com&noad=1'
        second_xml = self._download_webpage(second_url, video_id, 'Downloading second data webpage')

        self.report_extraction(video_id)

        # The playlist XML gives the stream as APP (base) + FULLPATH (HTML-escaped).
        mobj = re.search(r'<STREAM APP="(.+?)" FULLPATH="(.+?)"', second_xml)
        if mobj is None:
            raise ExtractorError(u'Unable to extract video url')
        video_url = mobj.group(1) + unescapeHTML(mobj.group(2))

        video_title = self._html_search_regex(r'<meta property="og:title" content=(?:"([^"]+)"|\'([^\']+)\')',
            webpage, u'video title')

        video_description = self._html_search_regex(r'<meta property="og:description" content=(?:"([^"]+)"|\'([^\']+)\')',
            webpage, u'description', fatal=False)

        thumbnail = self._html_search_regex(r'<meta property="og:image" content=(?:"([^"]+)"|\'([^\']+)\')',
            webpage, u'thumbnail', fatal=False)

        return [{
            'id':          video_id,
            'url':         video_url,
            'ext':         'mp4',
            'title':       video_title,
            'description': video_description,
            'thumbnail':   thumbnail,
            'uploader_id': video_uploader_id,
        }]
class TeamcocoIE(InfoExtractor):
    """Information Extractor for teamcoco.com videos."""
    _VALID_URL = r'http://teamcoco\.com/video/(?P<url_title>.*)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        url_title = mobj.group('url_title')
        webpage = self._download_webpage(url, url_title)

        # The numeric id (needed for the data XML) is in the article markup.
        video_id = self._html_search_regex(r'<article class="video" data-id="(\d+?)"',
            webpage, u'video id')

        self.report_extraction(video_id)

        video_title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',
            webpage, u'title')

        thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)"',
            webpage, u'thumbnail', fatal=False)

        video_description = self._html_search_regex(r'<meta property="og:description" content="(.*?)"',
            webpage, u'description', fatal=False)

        data_url = 'http://teamcoco.com/cvp/2.0/%s.xml' % video_id
        data = self._download_webpage(data_url, video_id, 'Downloading data webpage')

        # Pick the "high" quality file from the XML data.
        video_url = self._html_search_regex(r'<file type="high".*?>(.*?)</file>',
            data, u'video URL')

        return [{
            'id':          video_id,
            'url':         video_url,
            # NOTE(review): the high-quality file has been served as flv;
            # confirm against a live data XML if the CDN changes.
            'ext':         'flv',
            'title':       video_title,
            'thumbnail':   thumbnail,
            'description': video_description,
        }]
class XHamsterIE(InfoExtractor):
    """Information Extractor for xHamster"""
    _VALID_URL = r'(?:http://)?(?:www.)?xhamster\.com/movies/(?P<id>[0-9]+)/.*\.html'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)

        video_id = mobj.group('id')
        mrss_url = 'http://xhamster.com/movies/%s/.html' % video_id
        webpage = self._download_webpage(mrss_url, video_id)

        mobj = re.search(r'\'srv\': \'(?P<server>[^\']*)\',\s*\'file\': \'(?P<file>[^\']+)\',', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract media URL')
        if len(mobj.group('server')) == 0:
            # No server: 'file' is a URL-encoded direct link.
            video_url = compat_urllib_parse.unquote(mobj.group('file'))
        else:
            video_url = mobj.group('server') + '/key=' + mobj.group('file')
        video_extension = video_url.split('.')[-1]

        video_title = self._html_search_regex(r'<title>(?P<title>.+?) - xHamster\.com</title>',
            webpage, u'title')

        # Can't see the description anywhere in the UI
        # video_description = self._html_search_regex(r'<span>Description: </span>(?P<description>[^<]+)',
        #     webpage, u'description', fatal=False)
        # if video_description: video_description = unescapeHTML(video_description)

        mobj = re.search(r'hint=\'(?P<upload_date_Y>[0-9]{4})-(?P<upload_date_m>[0-9]{2})-(?P<upload_date_d>[0-9]{2}) [0-9]{2}:[0-9]{2}:[0-9]{2} [A-Z]{3,4}\'', webpage)
        if mobj:
            # Normalize to the YYYYMMDD format expected for upload_date.
            video_upload_date = mobj.group('upload_date_Y') + mobj.group('upload_date_m') + mobj.group('upload_date_d')
        else:
            video_upload_date = None
            self._downloader.report_warning(u'Unable to extract upload date')

        video_uploader_id = self._html_search_regex(r'<a href=\'/user/[^>]+>(?P<uploader_id>[^<]+)',
            webpage, u'uploader id', default=u'anonymous')

        video_thumbnail = self._search_regex(r'\'image\':\'(?P<thumbnail>[^\']+)\'',
            webpage, u'thumbnail', fatal=False)

        return [{
            'id':       video_id,
            'url':      video_url,
            'ext':      video_extension,
            'title':    video_title,
            # 'description': video_description,
            'upload_date':  video_upload_date,
            'uploader_id':  video_uploader_id,
            'thumbnail': video_thumbnail
        }]
class HypemIE(InfoExtractor):
    """Information Extractor for hypem"""
    _VALID_URL = r'(?:http://)?(?:www\.)?hypem\.com/track/([^/]+)/([^/]+)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        track_id = mobj.group(1)

        # A timestamped request is required; the response sets the session cookie
        # that the serve/source call below depends on.
        data = {'ax': 1, 'ts': time.time()}
        data_encoded = compat_urllib_parse.urlencode(data)
        complete_url = url + "?" + data_encoded
        request = compat_urllib_request.Request(complete_url)
        response, urlh = self._download_webpage_handle(request, track_id, u'Downloading webpage with the url')
        cookie = urlh.headers.get('Set-Cookie', '')

        self.report_extraction(track_id)

        html_tracks = self._html_search_regex(r'<script type="application/json" id="displayList-data">(.*?)</script>',
            response, u'tracks', flags=re.MULTILINE|re.DOTALL).strip()
        try:
            track_list = json.loads(html_tracks)
            track = track_list[u'tracks'][0]
        except ValueError:
            raise ExtractorError(u'Hypemachine contained invalid JSON.')

        key = track[u"key"]
        track_id = track[u"id"]
        artist = track[u"artist"]
        title = track[u"song"]

        serve_url = "http://hypem.com/serve/source/%s/%s" % (compat_str(track_id), compat_str(key))
        request = compat_urllib_request.Request(serve_url, "", {'Content-Type': 'application/json'})
        request.add_header('cookie', cookie)
        song_data_json = self._download_webpage(request, track_id, u'Downloading metadata')
        try:
            song_data = json.loads(song_data_json)
        except ValueError:
            raise ExtractorError(u'Hypemachine contained invalid JSON.')
        final_url = song_data[u"url"]

        return [{
            'id':       track_id,
            'url':      final_url,
            'ext':      "mp3",
            'title':    title,
            'artist':   artist,
        }]
def gen_extractors():
    """ Return a list of an instance of every supported extractor.
    The order does matter; the first extractor matched is the one handling the URL.
    """
    # NOTE(review): the extractor list is abbreviated in this view; only a few
    # entries are visible here. Entries are instantiated in priority order.
    YoutubePlaylistIE(),
    StanfordOpenClassroomIE(),
    WorldStarHipHopIE(),
def get_info_extractor(ie_name):
    """Return the info extractor class named ``<ie_name>IE``.

    Looks the class up in this module's namespace; raises KeyError if no
    extractor with that name exists.
    """
    class_name = '%sIE' % ie_name
    return globals()[class_name]